# Setup

In [1]:
!pip install kaggle > /dev/null
!pip install torch torchvision > /dev/null
!conda install -c conda-forge tpot -y > /dev/null
!conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate -y > /dev/null

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

from tpot import TPOTClassifier

In [3]:
# Cleaned train/test splits share the same reader settings (PassengerId index).
pd_clean_train, pd_clean_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ("./data/clean/train.csv", "./data/clean/test.csv")
)
# Submission template, read with the default integer index.
# NOTE(review): this path is rooted at ../input while the cleaned data lives under ./data — confirm.
pd_sample_submission = pd.read_csv("../input/sample_submission.csv")

In [4]:
# Column dtypes and non-null counts, followed by the first row as a sanity check.
pd_clean_train.info()
pd_clean_train.iloc[:1]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Survived     100000 non-null  int64  
 1   Pclass       100000 non-null  int64  
 2   Name         100000 non-null  object 
 3   Sex          100000 non-null  object 
 4   Age          96708 non-null   float64
 5   SibSp        100000 non-null  int64  
 6   Parch        100000 non-null  int64  
 7   Ticket       95377 non-null   object 
 8   Fare         99866 non-null   float64
 9   Cabin        32134 non-null   object 
 10  Embarked     99750 non-null   object 
 11  family       100000 non-null  int64  
 12  ticket_type  24655 non-null   object 
 13  ticket_num   94704 non-null   float64
 14  cabin_id     100000 non-null  object 
 15  surname      100000 non-null  object 
 16  age_bin      96708 non-null   float64
dtypes: float64(4), int64(5), object(8)
memory usage: 13.7+ MB


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family,ticket_type,ticket_num,cabin_id,surname,age_bin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S,2,,209245.0,C,Oconnor,


# Imputation

In [13]:
def impute_age(pd_df):
    """Return a copy of ``pd_df`` whose missing ``Age`` values are filled in.

    Each missing age is replaced by the median age of the rows that share the
    same ``Pclass``. Medians are computed only from rows where both ``Age``
    and ``Pclass`` are present. A row whose class has no known ages stays
    missing. The input frame is not modified.
    """
    known_rows = pd_df[['Age', 'Pclass']].dropna()
    median_by_class = known_rows.groupby('Pclass').median().to_dict()['Age']

    # Look up each row's class median, then use it only where Age is missing.
    fallback_age = pd_df['Pclass'].map(median_by_class)
    filled_age = pd_df['Age'].fillna(fallback_age)

    return pd_df.assign(Age=filled_age)


def impute_fare(pd_df):
    """Return a copy of ``pd_df`` whose missing ``Fare`` values are filled in.

    Each missing fare is replaced by the median fare of the rows that share
    the same ``Pclass``. Medians are computed only from rows where both
    ``Fare`` and ``Pclass`` are present. A row whose class has no known fares
    stays missing. The input frame is not modified.
    """
    known_rows = pd_df[['Fare', 'Pclass']].dropna()
    median_by_class = known_rows.groupby('Pclass').median().to_dict()['Fare']

    # Look up each row's class median, then use it only where Fare is missing.
    fallback_fare = pd_df['Pclass'].map(median_by_class)
    filled_fare = pd_df['Fare'].fillna(fallback_fare)

    return pd_df.assign(Fare=filled_fare)

In [14]:
# Impute on train and test stacked together, so the per-class medians are
# computed from both splits. Age is filled first, then Fare.
pd_clean_all = impute_fare(
    impute_age(
        pd.concat([pd_clean_train, pd_clean_test])))

# Create Modeling Datasets

In [15]:
def format_dataset(pd_df):
    """Keep only the modeling columns of ``pd_df``, in a fixed order.

    Columns from the list below that are absent from ``pd_df`` are skipped
    silently, and every column not in the list is dropped.
    """
    wanted = (
        'Sex', 'Embarked', 'Pclass', 'Age', 'SibSp', 'Parch',
        'Fare', 'family', 'ticket_type', 'cabin_id', 'Survived',
    )
    present = [name for name in wanted if name in pd_df.columns]

    return pd_df.loc[:, present]


def one_hot_encode(pd_df):
    """One-hot encode the categorical modeling columns of ``pd_df``.

    ``Sex``, ``Embarked``, ``ticket_type`` and ``cabin_id`` are expanded into
    indicator columns, each with an extra ``<column>_nan`` indicator for
    missing values. ``Sex_nan`` is then removed again, since ``Sex`` has no
    nulls in either the training or the test set.
    """
    categorical_cols = ['Sex', 'Embarked', 'ticket_type', 'cabin_id']
    encoded = pd.get_dummies(pd_df, columns=categorical_cols, dummy_na=True)

    return encoded.drop(columns='Sex_nan')

In [16]:
# Both modeling frames are sliced out of the imputed, combined frame so that
# the test rows receive the same Age/Fare imputation as the training rows.
# (Slicing pd_clean_test by its own minimum index was a no-op and skipped the
# imputation entirely.)
# NOTE(review): the label slices assume PassengerIds are ascending, with every
# training id below every test id — confirm against the cleaned CSVs.
pd_model_train = (pd_clean_all
    .loc[:pd_clean_train.index.max()]
    .pipe(format_dataset)
    .pipe(one_hot_encode))
pd_model_test = (pd_clean_all
    .loc[pd_clean_test.index.min():]
    .pipe(format_dataset)
    # The concat carries the label column over to the test rows
    # (presumably all-NaN there); it must not reach predict().
    .drop(columns='Survived', errors='ignore')
    .pipe(one_hot_encode)
    # The two splits are encoded separately, so their dummy columns can differ.
    # Align to the training features: categories unseen in test become
    # all-zero columns, categories unseen in train are dropped.
    .reindex(columns=pd_model_train.columns.drop('Survived'), fill_value=0))

# AutoML

In [21]:
# TPOT AutoML search configuration, grouped by concern.
pipeline_optimizer = TPOTClassifier(
    # objective
    scoring='accuracy',
    # search budget
    population_size=40,
    max_time_mins=120,  # total time
    max_eval_time_mins=20,  # max time per pipeline
    # execution
    n_jobs=-1,
    memory='auto',
    warm_start=False,
    # reproducibility and logging
    random_state=3,
    verbosity=3,
    log_file="./models/2.2-tpot-log.txt",
)

In [None]:
%%time
pipeline_optimizer.fit(
    features=pd_model_train.drop('Survived', axis=1), 
    target=pd_model_train.Survived,
    groups=None)

32 operators have been imported by TPOT.


HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=40.0, style=ProgressStyle(des…

In [56]:
# Write out the best pipeline found by the optimizer as a Python script.
pipeline_optimizer.export('./models/2.2-tpot_pipeline.py')

# Create Submission

In [17]:
# Predict on the test features, then replace the placeholder Survived column
# of the sample submission with the predictions.
# NOTE(review): predictions are attached by position, so this assumes the
# sample submission rows are in the same order as pd_model_test — confirm.
predictions = pipeline_optimizer.predict(pd_model_test)
pd_submission = (pd_sample_submission
    .drop(columns='Survived')
    .assign(Survived=predictions))
pd_submission.to_csv('./submissions/2.2-modelling-tpot.csv', index=False)

In [18]:
# Submit
!kaggle competitions submit -c tabular-playground-series-apr-2021 -f ./submissions/2.2-modelling-tpot.csv -m "2.2-modellng-tpot"

100%|█████████████████████████████████████████| 879k/879k [00:01<00:00, 656kB/s]
Successfully submitted to Tabular Playground Series - Apr 2021