In [1]:
# import libraries
import os
import evalml
import numpy as np
import pandas as pd

In [2]:
# Load the Titanic train/test CSV files from the local demo data directory.
DATA_PATH = "~/Documents/evalml/demos/data/titanic"
train_csv = os.path.join(DATA_PATH, 'train.csv')
test_csv = os.path.join(DATA_PATH, 'test.csv')
titanic_train = pd.read_csv(train_csv)
titanic_test = pd.read_csv(test_csv)

# Preview the first rows of each split.
for frame in (titanic_train, titanic_test):
    display(frame.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Preview rows that contain at least one missing value, for each split.
train_has_nan = titanic_train.isna().any(axis=1)
test_has_nan = titanic_test.isna().any(axis=1)
display(titanic_train.loc[train_has_nan].head())
display(titanic_test.loc[test_has_nan].head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Impute missing values and drop the columns that are not used as model features.
#
# Columns are reassigned instead of being filled through a chained
# `df[col].fillna(..., inplace=True)` call: that form operates on an intermediate
# Series and is not guaranteed to update the DataFrame (it is deprecated in
# recent pandas and does not modify the frame under Copy-on-Write).
UNUSED_COLUMNS = ['Cabin', 'Name', 'Ticket', 'PassengerId']

# Training data: 'Age' is filled with its mean, 'Embarked' with its most frequent value.
titanic_train = titanic_train.assign(
    Age=titanic_train["Age"].fillna(titanic_train["Age"].mean(skipna=True)),
    Embarked=titanic_train["Embarked"].fillna(titanic_train["Embarked"].mode()[0]),
).drop(UNUSED_COLUMNS, axis=1)

# Test data: same treatment, and 'Fare' is additionally filled with its mean.
# NOTE(review): the test split is imputed with its own mean/mode rather than the
# statistics of the training split - confirm this is intended.
titanic_test = titanic_test.assign(
    Age=titanic_test["Age"].fillna(titanic_test["Age"].mean(skipna=True)),
    Embarked=titanic_test["Embarked"].fillna(titanic_test["Embarked"].mode()[0]),
    Fare=titanic_test['Fare'].fillna(titanic_test['Fare'].mean()),
).drop(UNUSED_COLUMNS, axis=1)

In [5]:
# Verify the imputation: print, for each split, whether any row still holds a NaN.
for frame in (titanic_train, titanic_test):
    print(frame.isna().any(axis=1).any())

False
False


In [6]:
# One-hot encode the categorical columns of both splits.
# The indicator columns are appended in the order given by CATEGORICAL_COLUMNS.
CATEGORICAL_COLUMNS = ["Pclass", "Embarked", "Sex"]

titanic_train = pd.get_dummies(titanic_train, columns=CATEGORICAL_COLUMNS)
# Drop 'Sex_female' so that sex is represented by the remaining 'Sex_male' indicator.
titanic_train = titanic_train.drop('Sex_female', axis=1)

titanic_test = pd.get_dummies(titanic_test, columns=CATEGORICAL_COLUMNS)
titanic_test = titanic_test.drop('Sex_female', axis=1)

# The two splits are encoded independently, so a category that is absent from one
# of them would produce a different set of indicator columns. Align the test
# features to the training features (every training column except the target):
# indicators missing from the test split are added as all-zero columns, and
# indicators the training split never produced are discarded.
feature_columns = [col for col in titanic_train.columns if col != 'Survived']
titanic_test = titanic_test.reindex(columns=feature_columns, fill_value=0)


In [7]:
# Separate the 'Survived' target from the feature matrix.
y_train = titanic_train['Survived']
X_train = titanic_train.drop(columns=['Survived'])

# Preview the features that will be passed to the model search.
display(X_train.head())

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male
0,22.0,1,0,7.25,0,0,1,0,0,1,1
1,38.0,1,0,71.2833,1,0,0,1,0,0,0
2,26.0,0,0,7.925,0,0,1,0,0,1,0
3,35.0,1,0,53.1,1,0,0,0,0,1,0
4,35.0,0,0,8.05,0,0,1,0,0,1,1


In [8]:
# Set aside a holdout portion (test_size=.2) for the final evaluation of the
# selected pipeline. A fixed random_state makes the split - and every score
# computed from it - reproducible across runs.
# Note: X_train / y_train are rebound to the training portion, so re-running this
# cell on its own would split the already-reduced training data again.
X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(
    X_train, y_train, test_size=.2, random_state=0
)

In [9]:
# Configure the evalml automated search: optimize for precision, up to 50 pipelines.
clf = evalml.AutoClassifier(max_pipelines=50, objective="precision")

In [10]:
# Run the AutoClassifier pipeline search on the training portion of the data.
clf.fit(X_train, y_train)

[1m*****************************[0m
[1m* Beginning pipeline search *[0m
[1m*****************************[0m

Optimizing for Precision. Greater score is better.

Searching up to 50 pipelines. No time limit is set. Set one using max_time parameter.

Possible model types: xgboost, linear_model, random_forest

Testing Random Forest w/ imputation: 100%|██████████| 50/50 [03:39<00:00,  4.39s/it]               

✔ Optimization finished


In [11]:
# Show the rankings table produced by the pipeline search.
clf.rankings

Unnamed: 0,id,pipeline_name,score,high_variance_cv,parameters
0,44,XGBoostPipeline,0.802817,False,"{'eta': 0.0881634928804428, 'min_child_weight'..."
1,33,XGBoostPipeline,0.8,False,"{'eta': 0.35950790057378607, 'min_child_weight..."
2,43,XGBoostPipeline,0.8,False,"{'eta': 0.2023367474752678, 'min_child_weight'..."
3,41,XGBoostPipeline,0.791667,False,"{'eta': 0.12388813072769381, 'min_child_weight..."
4,36,XGBoostPipeline,0.780822,False,"{'eta': 0.40454228688038185, 'min_child_weight..."
5,10,XGBoostPipeline,0.779221,False,"{'eta': 0.38438170729269994, 'min_child_weight..."
6,28,RFClassificationPipeline,0.777778,False,"{'n_estimators': 761, 'max_depth': 273, 'imput..."
7,27,RFClassificationPipeline,0.777778,False,"{'n_estimators': 760, 'max_depth': 63, 'impute..."
8,46,RFClassificationPipeline,0.777778,False,"{'n_estimators': 740, 'max_depth': 51, 'impute..."
9,20,XGBoostPipeline,0.776316,False,"{'eta': 0.6481718720511973, 'min_child_weight'..."


In [12]:
# Take the best pipeline found by the search and score it on the held-out data,
# which was not passed to clf.fit.
pipeline = clf.best_pipeline
pipeline.score(X_holdout, y_holdout)

0.8524590163934426