In [1]:
import pandas as pd
# Keep rendered frames compact: cap cell text width, the number of columns
# shown, and the number of rows shown before pandas elides the middle.
for option_name, option_value in (
    ("display.max_colwidth", 80),
    ("display.max_columns", 80),
    ("display.max_rows", 6),
):
    pd.set_option(option_name, option_value)

In [2]:
# Load the Titanic training set and display it (truncated by the display
# options set above).
train_csv_path = '/dbfs/titanic/train.csv'
df = pd.read_csv(train_csv_path)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q


In [3]:
# Declare how each model input column is treated: categorical (C) or
# numerical (N). `label_name` is the target column. The insertion order of
# `feature_dict` matters because later cells select columns in key order.
C, N = 'Categorical', 'Numerical'
label_name = 'Survived'
feature_dict = dict(
    Pclass=C,
    Sex=C,
    Age=N,
    SibSp=C,
    Parch=C,
    Ticket=C,
    Fare=N,
    Cabin=C,
    Embarked=C,
)

In [4]:
from sklearn import preprocessing

def fillna(df):
    """Fill missing values in place for every column listed in ``feature_dict``.

    Columns tagged ``N`` (numerical) are filled with that column's mean.
    Every other column is back-filled from the next valid row and then
    forward-filled, so trailing gaps (which have no later value to copy
    from) are covered as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every key of ``feature_dict`` as a column.
        It is modified in place.

    Returns
    -------
    None
    """
    columns_with_gaps = [name for name in feature_dict if df[name].isnull().any()]
    for name in columns_with_gaps:
        if feature_dict[name] == N:
            df[name] = df[name].fillna(value=df[name].mean())
        else:
            # bfill()/ffill() are the supported spelling of
            # fillna(method='backfill') / fillna(method='pad'); the `method`
            # keyword of fillna is deprecated in recent pandas releases.
            df[name] = df[name].bfill().ffill()
            # Alternative that was considered but is not used: fill with the
            # most frequent value, df[name].value_counts().index[0].

# Names of the features tagged categorical, in feature_dict order.
categorical_names = [name for name, kind in feature_dict.items() if kind == C]
def categoricalize(df):
    """Replace each column named in ``categorical_names`` with integer codes.

    A single ``LabelEncoder`` is re-fitted on every column in turn, and the
    column is overwritten in place with the resulting codes. Nothing is
    returned and the fitted encoder is not kept.

    NOTE(review): because the encoder is fitted on whatever frame is passed
    in, calling this separately on two frames can assign different codes to
    the same raw value. Confirm that is acceptable for the callers.
    """
    encoder = preprocessing.LabelEncoder()
    for name in categorical_names:
        encoded = encoder.fit_transform(df[name])
        df[name] = encoded

In [5]:
# Impute missing feature values in the training frame (in place), then show it.
fillna(df)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,C85,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,C123,S
...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,C148,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.000000,0,0,370376,7.7500,C148,Q


In [6]:
# Integer-encode the categorical feature columns of the training frame
# (in place), then show it.
categoricalize(df)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,2,"Braund, Mr. Owen Harris",1,22.000000,1,0,523,7.2500,81,2
1,2,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",0,38.000000,1,0,596,71.2833,81,0
2,3,1,2,"Heikkinen, Miss. Laina",0,26.000000,0,0,669,7.9250,55,2
...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,0,2,"Johnston, Miss. Catherine Helen ""Carrie""",0,29.699118,1,2,675,23.4500,60,2
889,890,1,0,"Behr, Mr. Karl Howell",1,26.000000,0,0,8,30.0000,60,0
890,891,0,2,"Dooley, Mr. Patrick",1,32.000000,0,0,466,7.7500,60,1


In [7]:
# Split the prepared frame into model inputs and target. Columns are taken
# in feature_dict order, and feat_type lists their declared types in the
# same order.
feature_columns = list(feature_dict)
features = df[feature_columns]
feat_type = [feature_dict[column] for column in feature_columns]
labels = df[label_name]

In [8]:
import autokeras as ak

# Search over at most 10 candidate models. The seed fixes AutoKeras's random
# state so that a Restart & Run All is more repeatable than an unseeded search.
clf = ak.StructuredDataClassifier(max_trials=10, seed=42)
# Feed the structured data classifier with the training features and labels.
clf.fit(features, labels)

In [9]:
# Load the test set, keep only the model's feature columns (in feature_dict
# order), and display the result.
test_csv_path = '/dbfs/titanic/test.csv'
test_df = pd.read_csv(test_csv_path)[list(feature_dict)]
test_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,,Q
1,3,female,47.0,1,0,363272,7.0000,,S
2,2,male,62.0,0,0,240276,9.6875,,Q
...,...,...,...,...,...,...,...,...,...
415,3,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,3,male,,0,0,359309,8.0500,,S
417,3,male,,1,1,2668,22.3583,,C


In [10]:
# Apply the same in-place preprocessing steps to the test features.
# NOTE(review): both helpers derive their statistics/encodings from the frame
# they are given, so the test set is imputed with its own column means and
# encoded with LabelEncoders fitted on test values only. The integer codes
# are therefore not guaranteed to match those used for training — confirm
# this is intended.
fillna(test_df)
categoricalize(test_df)

In [11]:
# Run the fitted AutoKeras classifier on the preprocessed test features.
predictions = clf.predict(test_df)

In [12]:
import numpy as np

# Build the submission frame. PassengerId is read from the test file instead
# of being hard-coded as range(892, 1310), so the ids stay aligned with the
# predicted rows whatever the size of the test set (no rows are dropped
# between loading the test file and predicting).
passenger_ids = pd.read_csv('/dbfs/titanic/test.csv', usecols=['PassengerId'])['PassengerId']
# Flatten the prediction array to one value per row; a length mismatch with
# passenger_ids raises instead of silently mislabelling rows.
preds = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': np.ravel(predictions)})
preds.to_csv('/dbfs/titanic/results-autokeras.csv', index=False)

In [13]:
# Display the submission frame (truncated by the display.max_rows option).
preds

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
...,...,...
415,1307,0
416,1308,0
417,1309,0
