In [1]:
from atm import ATM

# ATM instance used further down to launch the run (`atm.run`).
atm = ATM()

In [2]:
# Load both Titanic splits; the first line of each file is the header row.
df_train = spark.read.load('/titanic/train.csv', format='csv', header=True)
df_test = spark.read.load('/titanic/test.csv', format='csv', header=True)

In [3]:
# Drop the columns that have no predictive value.

from functools import reduce
from pyspark.sql import DataFrame

# Apply the same column removal to both splits, one drop() call per column.
train, test = (
    reduce(DataFrame.drop, ['PassengerId', 'Name', 'Ticket', 'Cabin'], frame)
    for frame in (df_train, df_test)
)

In [4]:
# Fill nulls with the column means and cast each variable to its correct type.

from pyspark.sql.functions import avg
from pyspark.sql.types import IntegerType, DoubleType
import pandas as pd
import numpy as np

def fill_with_mean(df, include=()):
  """Replace nulls in the ``include`` columns with each column's mean.

  Args:
    df: Spark DataFrame to impute.
    include: iterable of column names to fill. Empty by default, in which
      case ``df`` is returned as is.

  Returns:
    The DataFrame with nulls in the selected columns replaced by the column
    mean. Columns whose mean is null (no non-null values to average) are
    left unchanged.
  """
  columns = list(include)
  if not columns:
    # agg() requires at least one expression, so there is nothing to compute.
    return df

  means = df.agg(*(avg(c).alias(c) for c in columns)).first().asDict()

  # na.fill only accepts int/float/bool/str replacement values; a null mean
  # would make it fail, so those columns are skipped.
  replacements = {name: mean for name, mean in means.items() if mean is not None}
  if not replacements:
    return df
  return df.na.fill(replacements)

def fill_and_cast(df, survived):
  """Impute missing values and cast the feature columns to concrete types.

  Nulls in "Age", "SibSp", "Parch" and "Fare" are replaced via
  ``fill_with_mean``; nulls in "Embarked" are replaced with 'S'. Afterwards
  "Age", "SibSp", "Parch" and "Pclass" are cast to integer and "Fare" to
  double.

  Args:
    df: Spark DataFrame containing the columns listed above.
    survived: truthy when ``df`` also has a "Survived" column, which is then
      cast to integer as well.

  Returns:
    The transformed DataFrame.
  """
  df = fill_with_mean(df, ["Age", "SibSp", "Parch", "Fare"])
  df = df.na.fill({'Embarked': 'S'})

  # Target type per column; withColumn on an existing name replaces it in
  # place, so the column order of the frame is preserved.
  column_types = [
    ("Age", IntegerType()),
    ("SibSp", IntegerType()),
    ("Parch", IntegerType()),
    ("Fare", DoubleType()),
    ("Pclass", IntegerType()),
  ]
  if survived:
    column_types.append(("Survived", IntegerType()))

  for name, dtype in column_types:
    df = df.withColumn(name, df[name].cast(dtype))
  return df

# The training split also gets its "Survived" column cast; the test split is
# processed without it.
train = fill_and_cast(train, survived=True)
test = fill_and_cast(test, survived=False)

display(train)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22,1,0,7.25,S
1,1,female,38,1,0,71.2833,C
1,3,female,26,0,0,7.925,S
1,1,female,35,1,0,53.1,S
0,3,male,35,0,0,8.05,S
0,3,male,29,0,0,8.4583,Q
0,1,male,54,0,0,51.8625,S
0,3,male,2,3,1,21.075,S
1,3,female,27,0,2,11.1333,S
1,2,female,14,1,0,30.0708,C


In [5]:
# Check the column types that resulted from the casts in the previous cell.
train.printSchema()

In [6]:
# Export the training split as a CSV with the label column renamed to "class"
# (presumably the target column name ATM expects — TODO confirm).
(
    train
    .withColumnRenamed("Survived", "class")
    .toPandas()
    .to_csv('/dbfs/titanic/train-atm.csv', index=False)
)

In [7]:
# Launch the ATM run on the training CSV written in the previous cell.
results = atm.run(train_path='/dbfs/titanic/train-atm.csv')

In [8]:
# Presumably shows a summary of the run — TODO confirm what ATM reports here.
results.describe()

In [9]:
# Display the best classifier of the run (last expression, so it is rendered).
results.get_best_classifier()

In [10]:
# Write the best classifier to a pickle file; force=True presumably overwrites
# an existing file at that path — TODO confirm.
results.export_best_classifier('/dbfs/titanic/atm.pkl', force=True)

In [11]:
from atm import Model

# Load the classifier from the pickle file exported in the previous cell.
model = Model.load('/dbfs/titanic/atm.pkl')

In [12]:
# Convert the prepared test split to pandas and predict with the loaded model.
predictions = model.predict(test.toPandas())

In [13]:
# Take the ids from the original test frame rather than hard-coding 892..1309,
# so the submission stays correct if the test file changes. `test` was derived
# from `df_test` without reordering, so rows are assumed to line up with
# `predictions` — TODO confirm if any shuffling step is ever added.
passenger_ids = df_test.select('PassengerId').toPandas()['PassengerId'].astype(int)
if len(passenger_ids) != len(predictions):
    raise ValueError(
        'Got %d predictions for %d passengers' % (len(predictions), len(passenger_ids))
    )

preds = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': pd.Series(predictions)})
preds.to_csv('/dbfs/titanic/results-atm.csv', index=False)