# PySpark

## Cómo crear una sesión en Spark

In [10]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running on the "local" master, with the
# Spark UI port configured as 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("PySpark")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)
spark

## Leer un archivo CSV

In [11]:
# Load the Titanic CSV, using its first line as the column names.
# No schema is supplied here, so every column comes back as a string
# (see the printed schema below).
df_titanic = spark.read.option("header", True).csv('titanic.csv')

df_titanic.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



## Eliminando las columnas no necesarias

In [12]:
# Column groups kept for the rest of the notebook; every other column is dropped.
NUMERIC_COLUMNS = ["Age", "SibSp", "Parch", "Fare"]
ENUM_COLUMNS = ["Embarked", "Pclass"]
LABEL_COLUMN = ["Survived"]

# Project the frame down to: numeric columns, then categorical ones, then the label.
df_titanic = df_titanic.select(*NUMERIC_COLUMNS, *ENUM_COLUMNS, *LABEL_COLUMN)
df_titanic.printSchema()

root
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Survived: string (nullable = true)



## Convirtiendo las columnas numéricas a flotante

In [13]:
from pyspark.sql.functions import col
def cast_to_float(input_df, columns=None):
    """Return ``input_df`` with the chosen columns cast to ``float``.

    Column order is preserved, and columns that are not selected are passed
    through unchanged.

    Args:
        input_df: DataFrame to convert.
        columns: Name, or iterable of names, of the columns to cast. When
            omitted, the notebook-level ``NUMERIC_COLUMNS`` list is used, so
            ``df.transform(cast_to_float)`` behaves exactly as before.

    Returns:
        The DataFrame produced by ``input_df.select(...)`` over the converted
        column expressions.
    """
    # Resolve the default at call time so later edits to NUMERIC_COLUMNS are honoured.
    if columns is None:
        columns = NUMERIC_COLUMNS
    elif isinstance(columns, str):
        # A bare string would otherwise be treated as an iterable of characters.
        columns = [columns]
    float_columns = set(columns)

    return input_df.select([
        col(col_name).cast("float") if col_name in float_columns else col(col_name)
        for col_name in input_df.columns
    ])

# Apply the float conversion to the numeric columns and confirm the new schema.
df_titanic = cast_to_float(df_titanic)
df_titanic.printSchema()

root
 |-- Age: float (nullable = true)
 |-- SibSp: float (nullable = true)
 |-- Parch: float (nullable = true)
 |-- Fare: float (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Survived: string (nullable = true)



## Filtrando las filas

In [19]:
# Keep only the passengers that survived. "Survived" was not cast, so it is
# still a string column and is compared against the string "1".
df_survived = df_titanic.where(col("Survived") == "1")
df_survived.show()

# The same filter expressed in SQL: register the DataFrame as a temporary view
# and query it. The column is spelled with its exact case ("Survived") so the
# query does not rely on Spark's case-insensitive name resolution being enabled.
df_titanic.createOrReplaceTempView("df_titanic")
df_sql_survived = spark.sql("SELECT * FROM df_titanic WHERE Survived = '1'")
df_sql_survived.show()

+----+-----+-----+--------+--------+------+--------+
| Age|SibSp|Parch|    Fare|Embarked|Pclass|Survived|
+----+-----+-----+--------+--------+------+--------+
|38.0|  1.0|  0.0| 71.2833|       C|     1|       1|
|26.0|  0.0|  0.0|   7.925|       S|     3|       1|
|35.0|  1.0|  0.0|    53.1|       S|     1|       1|
|27.0|  0.0|  2.0| 11.1333|       S|     3|       1|
|14.0|  1.0|  0.0| 30.0708|       C|     2|       1|
| 4.0|  1.0|  1.0|    16.7|       S|     3|       1|
|58.0|  0.0|  0.0|   26.55|       S|     1|       1|
|55.0|  0.0|  0.0|    16.0|       S|     2|       1|
|null|  0.0|  0.0|    13.0|       S|     2|       1|
|null|  0.0|  0.0|   7.225|       C|     3|       1|
|34.0|  0.0|  0.0|    13.0|       S|     2|       1|
|15.0|  0.0|  0.0|  8.0292|       Q|     3|       1|
|28.0|  0.0|  0.0|    35.5|       S|     1|       1|
|38.0|  1.0|  5.0| 31.3875|       S|     3|       1|
|null|  0.0|  0.0|  7.8792|       Q|     3|       1|
|null|  1.0|  0.0|146.5208|       C|     1|   

## Resumen estadístico del DataFrame

In [15]:
# Summary statistics for a subset of the columns. "Pclass" is still a string
# column at this point.
(
    df_titanic
    .select("Age", "SibSp", "Parch", "Pclass")
    .summary()
    .show()
)

+-------+------------------+------------------+-------------------+------------------+
|summary|               Age|             SibSp|              Parch|            Pclass|
+-------+------------------+------------------+-------------------+------------------+
|  count|               714|               891|                891|               891|
|   mean| 29.69911764704046|0.5230078563411896|0.38159371492704824| 2.308641975308642|
| stddev|14.526497332370992|1.1027434322934315| 0.8060572211299488|0.8360712409770491|
|    min|              0.42|               0.0|                0.0|                 1|
|    25%|              20.0|               0.0|                0.0|               2.0|
|    50%|              28.0|               0.0|                0.0|               3.0|
|    75%|              38.0|               1.0|                0.0|               3.0|
|    max|              80.0|               8.0|                6.0|                 3|
+-------+------------------+------------------+-------------------+------------------+

## Agrupando por columna

In [27]:
import pyspark.sql.functions as F
# For each passenger class: count the rows and average the age, then sort the
# result by class in ascending order.
df_titanic_groupby_class = (
    df_titanic
    .groupby("Pclass")
    .agg(
        F.count("Pclass").alias("Number of ocurrences"),
        F.mean("Age").alias("Average age"),
    )
    .orderBy("Pclass", ascending=True)
)
df_titanic_groupby_class.show()

+------+--------------------+------------------+
|Pclass|Number of ocurrences|       Average age|
+------+--------------------+------------------+
|     1|                 216| 38.23344086030478|
|     2|                 184|29.877630057706998|
|     3|                 491| 25.14061971827292|
+------+--------------------+------------------+

