# Clasificación con datos del hundimiento del Titanic

In [None]:
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pyspark-titanic-exe').getOrCreate()

In [None]:
spark.version

In [None]:
dft = spark.read.csv('/dataset/titanic.csv', header=True)

In [None]:
dft.show(10)

In [None]:
dft.printSchema()

In [None]:
from pyspark.sql.types import DoubleType, IntegerType

In [None]:
dft = dft.withColumn('survived', dft['survived'].cast(IntegerType()))

In [None]:
dft.printSchema()

In [None]:
dft.select('survived').show(5)

In [None]:
dft.groupBy('survived').count().orderBy('count').show(10)

## Extracción (Ejercicio 1)

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
def extract_titanic_data(url, refresh_cache=False):
    """Download the Titanic CSV (cached locally) and load it as a Spark DataFrame.

    Args:
        url: HTTP(S) location of the CSV file; its last path segment is used
            as the cache file name.
        refresh_cache: When True, download again even if a cached copy exists.

    Returns:
        A Spark DataFrame read with ``header=True`` and no schema inference,
        so every column is a string until it is cast explicitly.
    """
    # NOTE(review): '/dataset' is the directory this notebook already reads
    # from and logs to; confirm it is the intended cache location.
    cache_path = Path('/dataset') / url.rsplit('/', 1)[-1]
    if refresh_cache or not cache_path.exists():
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        # pandas performs the download; the copy on disk is what Spark reads.
        pd.read_csv(url).to_csv(cache_path, index=False)
    # pandas escapes embedded double quotes by doubling them, while Spark's
    # default escape character is a backslash, so set it explicitly.
    df = spark.read.csv(str(cache_path), header=True, quote='"', escape='"')
    return df

In [None]:
url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'

In [None]:
df_raw = extract_titanic_data(url)

In [None]:
type(df_raw)

In [None]:
df_raw.show(10)

In [None]:
df_raw.printSchema()

In [None]:
# Rename home.dest column: Spark parses a dot inside f.col('home.dest') as
# struct-field access, so the per-column aggregations further down would fail.
# withColumnRenamed is a no-op when the column is absent.
df_raw = df_raw.withColumnRenamed('home.dest', 'home_dest')

df_raw.printSchema()

In [None]:
df_raw.count()

In [None]:
# Train/test split
train_df, test_df = df_raw.randomSplit([0.7,0.3],seed=1234)

In [None]:
#df_raw.randomSplit?

In [None]:
train_df.count()

In [None]:
from pyspark.sql.functions import rand, when
df_raw = df_raw.withColumn('train', when(rand(seed=1234) >= 0.3, True).otherwise(False))

In [None]:
#rand?

In [None]:
df_raw.select('train').groupby('train').count().show()

In [None]:
df_raw.select('train').printSchema()

In [None]:
import pyspark.sql.functions as f

In [None]:
df_train = df_raw.filter(f.col('train') == True)

In [None]:
df_train.count()

In [None]:
df_test = df_raw.filter(f.col('train') != True)

In [None]:
df_test.count()

## Ejercicio 2: Primer preproceso / EDA

In [None]:
# Casteo de datos
df_raw.printSchema()

In [None]:
def cast_cols(df, cols, new_type):
    """Cast the given columns of ``df`` to ``new_type``.

    Args:
        df: DataFrame exposing ``df[name]`` and ``withColumn``.
        cols: Names of the columns to cast.
        new_type: Target type. May be a type class (it is instantiated with
            no arguments), a type instance, or a type-name string.

    Returns:
        A DataFrame in which every column in ``cols`` has been replaced by
        its cast version; other columns are untouched.
    """
    # Callers in this notebook pass the class itself (e.g. IntegerType), but
    # Column.cast expects an instance or a type-name string.
    target = new_type() if isinstance(new_type, type) else new_type
    for name in cols:
        df = df.withColumn(name, df[name].cast(target))
    return df

In [None]:
integer_cols = ['survived', 'sibsp', 'parch', 'body']
float_cols = ['age', 'fare']

In [None]:
df_raw = cast_cols(df_raw, integer_cols, IntegerType)
df_raw = cast_cols(df_raw, float_cols, DoubleType)

In [None]:
df_raw.printSchema()

In [None]:
survived_on_boat = df_raw.filter((f.col('boat').isNotNull()) & (f.col('survived') == 1)).count()
survived = df_raw.filter(f.col('survived') == 1).count()
survived_on_boat / survived

In [None]:
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s-%(name)s-%(levelname)s: %(message)s',
    handlers=[logging.FileHandler('/dataset/titanic_spark.log'), logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

In [None]:
def _drop_unusable_cols(df, cols):
    """Drop the columns in ``cols`` from ``df`` and log what remains.

    Args:
        df: Spark DataFrame to prune.
        cols: Names of the columns to remove. Per the PySpark documentation,
            ``DataFrame.drop`` ignores names that are not in the schema.

    Returns:
        The DataFrame without the listed columns.
    """
    logger.info(
        f"Dropping the following {len(cols)} unusable columns:\n"
        f"{cols}"
    )
    df = df.drop(*cols)
    logger.info(
        f"Remaining {len(df.columns)} columns:\n {sorted(df.columns)}"
    )
    return df

In [None]:
train_df = _drop_unusable_cols(train_df, cols=['boat', 'body', 'train'])

In [None]:
train_df.columns, test_df.columns

In [None]:
test_df = _drop_unusable_cols(test_df, cols=['boat', 'body', 'train'])

In [None]:
train_df = train_df.withColumn('train_new', f.lit(True))
test_df = test_df.withColumn('train_new', f.lit(False))

In [None]:
joined_df = train_df.unionByName(test_df)
joined_df.count()

In [None]:
df = _drop_unusable_cols(df_raw, cols=['boat', 'body', 'ticket'])

In [None]:
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df.select('survived').groupBy('survived').count().toPandas().set_index('survived').plot.bar()

## EDA y Ejercicio 3

In [None]:
import seaborn as sns

In [None]:
g = sns.heatmap(df.select('survived', 'age', 'parch', 'fare', 'sibsp').toPandas().corr(),
                annot=True, fmt = ".2f", cmap = "coolwarm")

In [None]:
g = sns.FacetGrid(df.toPandas(), col='survived')
g = g.map(sns.distplot, 'age')

In [None]:
survived_age = df.filter((f.col('survived') == 1) & 
                         (f.col('age').isNotNull())).select('age').toPandas()
not_survived_age = df.filter((f.col('survived') == 0) & (f.col('age').isNotNull())).select('age').toPandas()

In [None]:
survived_age.squeeze()

In [None]:
type(survived_age.squeeze())

In [None]:
g = sns.kdeplot(not_survived_age.squeeze(), color='Red', shade=True)
g = sns.kdeplot(survived_age.squeeze(), color='Blue', shade=True)
g.set_xlabel('age')
g.set_ylabel('Frequency')
g.legend(['Not Survived', 'Survived'])

In [None]:
fare_mean = df.select(f.mean(f.col('fare'))).first()[0]
fare_mean

In [None]:
df.select('fare').fillna(fare_mean).filter(f.col('fare').isNull()).show()

In [None]:
# Distribución de precio de boletos 


In [None]:
df.select('fare').filter(f.col('fare') == 0).show()

In [None]:
import numpy as np

In [None]:
@f.udf('double')
def np_log(x):
    """Return ``log(1 + x)`` as a double; a null input stays null.

    The "+ 1" keeps a value of 0 finite (it maps to 0.0).
    """
    if x is None:
        # Returning the int 0 here does not match the declared 'double' type,
        # and Spark converts such a mismatch to null anyway. Return None
        # explicitly so missing values stay visible to the imputation steps.
        return None
    return float(np.log1p(x))

In [None]:
df = df.withColumn('log_fare', np_log(df.fare))
df.select('fare', 'log_fare').show()

In [None]:
df = df.drop('fare')

In [None]:
# log_fare holds log(1 + fare), so nulls must be filled with log(1 + mean)
# to stay on the same scale as the rest of the column.
g = sns.distplot(df.select('log_fare').fillna(np.log1p(fare_mean)).toPandas(), color='m')

In [None]:
g = sns.barplot(x='sex', y='survived', data=df.toPandas())
g = g.set_ylabel("Survival Probability")

In [None]:
# Probabilidad de supervivencia

In [None]:
g = sns.catplot(x='pclass', y='survived', hue='sex', data=df.toPandas(),
                   height=6, kind='bar')
g = g.set_ylabels("survival probability")

## Ejercicio 4: Valores nulos y constantes

In [None]:
type(df)

In [None]:
df.select('age').groupBy('age').count().show()

In [None]:
df.select(f.sum(f.col('age').isNull().cast('integer'))).show()

In [None]:
# The mean of the 0/1 null indicator is the null ratio; this avoids running
# df.count() (a separate Spark job) once per column while building the list.
df_nulls = df.select([f.mean(f.col(c).isNull().cast('integer')).alias(c) for c in df.columns])

In [None]:
df_nulls.show()

In [None]:
null_cols = [c for c in df_nulls.columns if df_nulls.select(c).first()[0] >= 0.5]
null_cols

In [None]:
def _drop_nulls(df, max_null_prop=0.5):
    """Drop columns whose share of null values exceeds ``max_null_prop``.

    Columns named in the module-level ``PROTECTED_COLS`` are never dropped.

    Args:
        df: Spark DataFrame to inspect.
        max_null_prop: Maximum tolerated null ratio (0-1); a column is
            dropped only when its ratio is strictly greater.

    Returns:
        The DataFrame without the columns that exceeded the threshold.
    """
    logger.info(
        f"Dropping columns with null ratio greater than {max_null_prop * 100}%..."
    )
    # The mean of the 0/1 null indicator is the null ratio. All ratios are
    # computed in one aggregation and collected once, instead of one
    # count()/first() job per column.
    null_props = df.select(
        [f.mean(f.col(c).isNull().cast('integer')).alias(c) for c in df.columns]
    ).first().asDict()
    # Log the collected values: DataFrame.show() prints and returns None.
    logger.info(f"Null proportions:\n {null_props}")
    null_cols = [
        c for c in df.columns
        # On an empty DataFrame the ratios are null (None): nothing to drop.
        if null_props[c] is not None
        and null_props[c] > max_null_prop
        and c not in PROTECTED_COLS
    ]
    logger.info(f"Dropping the following {len(null_cols)} columns:\n {null_cols}")
    df = df.drop(*null_cols)
    return df

In [None]:
PROTECTED_COLS = ['survived', 'train']
df = _drop_nulls(df)

In [None]:
df.toPandas().isnull().mean()

In [None]:
df.toPandas().info()

In [None]:
df.printSchema()

In [None]:
df.toPandas().std()

In [None]:
df.dtypes

In [None]:
num_cols = [c for c,dtype in df.dtypes if dtype.startswith(('int', 'double'))]
num_cols

In [None]:
df_std = df.select([f.stddev(f.col(c)).alias(c) for c in num_cols])
df_std.show()

In [None]:
def _drop_std(df, min_std_dev=1.5e-2):
    """Drop numeric columns whose standard deviation is below ``min_std_dev``.

    Only columns whose dtype starts with 'int' or 'double' are examined, and
    columns named in the module-level ``PROTECTED_COLS`` are never dropped.

    Args:
        df: Spark DataFrame to inspect.
        min_std_dev: Columns with a standard deviation strictly below this
            value are treated as (near-)constant and removed.

    Returns:
        The DataFrame without the low-variance columns.
    """
    logger.info(
        f"Dropping columns with standard deviation below {min_std_dev}..."
    )
    num_cols = [
        c for c, dtype in df.dtypes
        if dtype.startswith(('int', 'double')) and c not in PROTECTED_COLS
    ]
    if not num_cols:
        return df
    std_devs = df.select(
        [f.stddev(f.col(c)).alias(c) for c in num_cols]
    ).first().asDict()
    logger.info(f"Standard deviations:\n {std_devs}")
    low_std_cols = [
        c for c in num_cols
        # A missing (None) or NaN deviation never compares below the
        # threshold, so such columns are kept.
        if std_devs[c] is not None and std_devs[c] < min_std_dev
    ]
    logger.info(
        f"Dropping the following {len(low_std_cols)} columns:\n {low_std_cols}"
    )
    df = df.drop(*low_std_cols)
    return df

In [None]:
df = _drop_std(df)

In [None]:
def _get_typed_cols(df, col_type='cat'):
    """Return the names of the categorical or numeric columns of ``df``.

    Args:
        df: Object exposing ``dtypes`` as ``(name, dtype_string)`` pairs.
        col_type: 'num' selects dtypes starting with 'int' or 'double';
            'cat' selects dtypes starting with 'string'.

    Returns:
        Matching column names in schema order, excluding any name listed in
        the module-level ``PROTECTED_COLS``.

    Raises:
        ValueError: If ``col_type`` is neither 'cat' nor 'num'.
    """
    # Raise explicitly: an assert is stripped when Python runs with -O, and an
    # unknown col_type would then silently be treated as 'cat'.
    if col_type not in ('cat', 'num'):
        raise ValueError(f"col_type must be 'cat' or 'num', got {col_type!r}")
    # NOTE(review): prefix matching means 'bigint' and 'float' columns are not
    # treated as numeric; confirm that is intended.
    prefixes = ('int', 'double') if col_type == 'num' else ('string',)
    typed_cols = [
        c for c, dtype in df.dtypes
        if dtype.startswith(prefixes) and c not in PROTECTED_COLS
    ]
    return typed_cols

In [None]:
# _get_typed_cols(df, col_type='foo')

In [None]:
num_cols = _get_typed_cols(df, col_type='num')
cat_cols = _get_typed_cols(df, col_type='cat')
num_cols, cat_cols

In [None]:
val_counts = df.filter(f.col('embarked').isNotNull()).select('embarked').groupBy('embarked').count().orderBy(f.desc('count'))
val_counts.show()

In [None]:
val_counts.select('embarked').first()[0]

In [None]:
df.toPandas()['age'].median()

In [None]:
# df.approxQuantile?

In [None]:
df.approxQuantile('age', [0.5], 0)[0]

In [None]:
def _fill_nulls(df):
    """Impute null values column by column.

    Numeric columns (as reported by ``_get_typed_cols``) are filled with their
    median; categorical columns are filled with their most frequent non-null
    value. Columns that contain no non-null value are left untouched.

    Args:
        df: Spark DataFrame to impute.

    Returns:
        The DataFrame with nulls filled.
    """
    for c in _get_typed_cols(df, col_type='num'):
        # relativeError=0 requests the exact median.
        quantiles = df.approxQuantile(c, [0.5], 0)
        if not quantiles:
            # No non-null value to derive a median from.
            continue
        df = df.fillna(quantiles[0], subset=[c])

    for c in _get_typed_cols(df, col_type='cat'):
        most_common = (
            df.filter(f.col(c).isNotNull())
            .groupBy(c)
            .count()
            # Secondary sort on the value makes the choice deterministic
            # when several values share the highest count.
            .orderBy(f.desc('count'), f.asc(c))
            .first()
        )
        if most_common is None:
            # Column is entirely null: there is no mode to fill with.
            continue
        df = df.fillna(most_common[0], subset=[c])
    return df

In [None]:
df.toPandas().info()

In [None]:
df.select('embarked').groupBy('embarked').count().orderBy(f.desc('count')).show()

In [None]:
df = _fill_nulls(df)

In [None]:
df.toPandas().info()

In [None]:
df.select('embarked').groupBy('embarked').count().orderBy(f.desc('count')).show()

In [None]:
df.select('age').groupBy('age').count().orderBy(f.desc('count')).show()

## Ejercicio 5: Ingeniería de atributos

In [None]:
df.select('name').show(10)

In [None]:
# f.split takes a regular expression: the dot must be escaped, otherwise
# '. ' matches "any character followed by a space" (e.g. "the Countess" -> "th").
df.select(f.trim(f.split(f.split(f.col('name'), ', ')[1], r'\. ')[0]).alias('title')).show(5)

In [None]:
# f.split takes a regular expression: the dot must be escaped, otherwise
# '. ' matches "any character followed by a space" (e.g. "the Countess" -> "th").
df = df.withColumn('title', f.trim(f.split(f.split(df['name'], ', ')[1], r'\. ')[0]))

In [None]:
df.select('title').show()

In [None]:
title_valcounts = df.select('title').groupBy('title').count().orderBy(f.desc('count'))
title_valcounts.show(5)

In [None]:
other_titles = [str(i.title) for i in title_valcounts.select('title').collect()][4:]
other_titles

In [None]:
df = df.withColumn('title', when(df['title'].isin(*other_titles), 'other').otherwise(df['title']))

In [None]:
df.select('title').groupBy('title').count().show()

In [None]:
df = df.withColumn('title', when(df['title'] == 'Miss', 'Mrs').otherwise(df['title']))

In [None]:
df.select('title').groupBy('title').count().show()

In [None]:
df = df.drop('name')

In [None]:
g = sns.countplot(df.select('title').toPandas().squeeze())

In [None]:
g = sns.catplot(x='title',y='survived',data=df.toPandas(),kind="bar")

In [None]:
df.printSchema()

In [None]:
# Family size
# The column was selected without ever being created. Derive it from the
# parents/children and siblings/spouses counts.
# NOTE(review): the "+ 1" (counting the passenger) is an assumption — confirm.
df = df.withColumn('family_size', f.col('parch') + f.col('sibsp') + 1)
df.select('parch', 'sibsp', 'family_size').show(10)

In [None]:

# Build family_size and its indicator columns before selecting them; the
# derivation is repeated so this cell does not depend on an earlier one.
# NOTE(review): the "+ 1" and the bucket boundaries (1 / 2-4 / 5+) are
# assumptions — confirm against the exercise statement.
df = df.withColumn('family_size', f.col('parch') + f.col('sibsp') + 1)
df = df.withColumn('family_single', (f.col('family_size') == 1).cast('int'))
df = df.withColumn('family_small', f.col('family_size').between(2, 4).cast('int'))
df = df.withColumn('family_large', (f.col('family_size') >= 5).cast('int'))
df.select('family_size', 'family_single', 'family_small', 'family_large').show(10)
df = df.drop('family_size')

In [None]:
df.columns

In [None]:
for fsize in ['single', 'small', 'large']:
    g = sns.catplot(x=f'family_{fsize}',y='survived',data=df.toPandas(),kind="bar")
    g = g.set_ylabels("Survival Probability")

## Ejercicio 6: Ajuste de regresión logística

In [None]:
# Data sintetica
df1 = spark.createDataFrame([
    (1, 10.1, 'a', 'i'),
    (0, 14.3, 'b', 'x'),
    (0, 3.0, 'c', 'iv'),
    (1, 2.5, 'c', 'iv'),
    (1, 5.4, 'b', 'i'),
    (0, 9.7, 'a', 'x')
], ['target', 'numerical', 'cat1', 'cat2'])
df1.show()

# Convertir strings en numericos
from pyspark.ml.feature import StringIndexer

cat_cols = ['cat1', 'cat2']
for cat in cat_cols:
    cat_suff = f'{cat}_num'
    if cat_suff not in df1.columns:
        indexer = StringIndexer(inputCol=cat, outputCol=cat_suff).fit(df1)
        df1 = indexer.transform(df1)
df1.show()

# One-hot encode the indexed columns
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder
    # (same inputCols/outputCols estimator API).
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(inputCols=['cat1_num', 'cat2_num'], outputCols=['cat1_vec', 'cat2_vec'])
ohem = encoder.fit(df1)
df1 = ohem.transform(df1)
df1.show()

# Armar columna de features
df1 = df1.select('target','numerical', 'cat1_vec', 'cat2_vec')
df1.show()
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[c for c in df1.columns if c != 'target'], outputCol='features')
assembler.transform(df1).show()

In [None]:
num_cols = _get_typed_cols(df, col_type='num')
cat_cols = _get_typed_cols(df, col_type='cat')
df.columns, cat_cols, num_cols

In [None]:
from pyspark.ml.feature import StringIndexer

def _encode_categorical(df):
    """Index every categorical column of ``df`` into a ``<name>_num`` column.

    The categorical columns are those returned by
    ``_get_typed_cols(df, col_type='cat')``. A column is indexed with a
    ``StringIndexer`` fitted on ``df`` unless its ``<name>_num`` counterpart
    already exists.

    Returns:
        A ``(df, encoded_cols)`` tuple, where ``encoded_cols`` lists the
        ``<name>_num`` name of every categorical column, including those
        that were already present.
    """
    categorical = _get_typed_cols(df, col_type='cat')
    logger.info(f"Categorical columns:\n {categorical}")
    encoded_cols = [f'{name}_num' for name in categorical]
    for name, indexed_name in zip(categorical, encoded_cols):
        if indexed_name in df.columns:
            # Already indexed (e.g. the cell was re-run): keep the column.
            continue
        model = StringIndexer(inputCol=name, outputCol=indexed_name).fit(df)
        df = model.transform(df)
    return df, encoded_cols

In [None]:
df, encoded_cols = _encode_categorical(df)
df.show()

In [None]:
feature_cols = num_cols + encoded_cols
feature_cols

In [None]:
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder
    # (same inputCols/outputCols estimator API).
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

ohe_cols = [f'{c}_vec' for c in encoded_cols]
encoder = OneHotEncoderEstimator(inputCols=encoded_cols, outputCols=ohe_cols)
ohem = encoder.fit(df)
df = ohem.transform(df)
df.show()
# The model consumes the one-hot vectors instead of the raw indices.
feature_cols = num_cols + ohe_cols

In [None]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df = assembler.transform(df)
df.select('features').show()

In [None]:
df.columns

In [None]:
df.select('train').take(2)

In [None]:
train_data = df.filter(f.col('train') == True).select('survived', 'features')
test_data = df.filter(f.col('train') == False).select('survived', 'features')
train_data.show()

In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='survived', featuresCol='features')
lrm = lr.fit(train_data)

In [None]:
# Metricas de evaluacion (insample)


In [None]:
pred_df = lrm.transform(test_data)
pred_df.show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='survived')
evaluator.evaluate(pred_df)

## Ejercicio 7: árboles

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier



In [None]:

# Fit the decision tree and predict with it; otherwise pred_df still holds
# the logistic-regression predictions from the previous exercise.
dtm = DecisionTreeClassifier(labelCol='survived', featuresCol='features').fit(train_data)
pred_df = dtm.transform(test_data)
pred_df.show()

In [None]:
evaluator.evaluate(pred_df)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Fit the forest and predict with it: `rfm` is read by the feature-importance
# cells below and was never defined. The seed keeps runs reproducible.
rf = RandomForestClassifier(labelCol='survived', featuresCol='features', seed=1234)
rfm = rf.fit(train_data)
pred_df = rfm.transform(test_data)
pred_df.show()

In [None]:
evaluator.evaluate(pred_df)

In [None]:
dir(rfm)

In [None]:
rfm.featureImportances

In [None]:
from itertools import chain
attrs = sorted(
    (attr['idx'], attr['name'])
    for attr in (
        chain(*pred_df.schema['features'].metadata['ml_attr']['attrs'].values())
    )
)
feat_import = [(name, rfm.featureImportances[idx]) for idx, name in attrs if rfm.featureImportances[idx]]
feat_import = pd.DataFrame(feat_import, columns=['feature', 'importance']).sort_values(by='importance', ascending=False)
feat_import.head(15)

In [None]:
ax = feat_import[:20].plot(kind='bar')
ax.set_xticklabels(feat_import[:20]['feature'].tolist())

In [None]:
spark.stop()