In [35]:
import pandas as pd

from prepare_data import (
    explained_var,
    explanatory_vars,
    encode_variables,
    fill_gaps,
)

training_data = pd.read_csv('data/train.csv')
training_data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
115,116,0,3,"Pekoniemi, Mr. Edvard",male,21.0,0,0,STON/O 2. 3101294,7.925,,S
699,700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42.0,0,0,348121,7.65,F G63,S
46,47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q
511,512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S
20,21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S


In [36]:

training_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [37]:
fill_gaps(training_data)
training_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [38]:
encode_variables(training_data)
training_data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
848,849,0,2,"Harper, Rev. John",1,28.0,0,1,248727,33.0,,2
772,773,0,2,"Mack, Mrs. (Mary)",0,57.0,0,0,S.O./P.P. 3,10.5,E77,2
857,858,1,1,"Daly, Mr. Peter Denis",1,51.0,0,0,113055,26.55,E17,2
869,870,1,3,"Johnson, Master. Harold Theodor",1,4.0,1,1,347742,11.1333,,2
767,768,0,3,"Mangan, Miss. Mary",0,30.5,0,0,364850,7.75,,1


In [39]:
training_data = training_data[explanatory_vars+[explained_var]]
training_data.sample(5)

Unnamed: 0,Age,Embarked,Parch,Sex,SibSp,Survived
746,16.0,2,1,1,1,0
716,38.0,0,0,0,0,1
472,33.0,2,2,0,1,1
594,37.0,2,0,1,1,0
208,16.0,1,0,0,0,1


In [40]:
training_data.isna().sum()

Age         0
Embarked    0
Parch       0
Sex         0
SibSp       0
Survived    0
dtype: int64

In [41]:
from sklearn.model_selection import train_test_split
num_test = 0.20
X_all = training_data[explanatory_vars]
y_all = training_data[explained_var]
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

In [42]:
X_train.describe()


Unnamed: 0,Age,Embarked,Parch,Sex,SibSp
count,712.0,712.0,712.0,712.0,712.0
mean,29.189494,1.522472,0.401685,0.640449,0.54073
std,13.155819,0.805781,0.824312,0.480206,1.1256
min,0.42,0.0,0.0,0.0,0.0
25%,22.0,1.0,0.0,0.0,0.0
50%,28.0,2.0,0.0,1.0,0.0
75%,35.0,2.0,0.25,1.0,1.0
max,80.0,2.0,6.0,1.0,8.0


In [43]:
X_test.describe()

Unnamed: 0,Age,Embarked,Parch,Sex,SibSp
count,179.0,179.0,179.0,179.0,179.0
mean,30.046089,1.592179,0.301676,0.675978,0.452514
std,12.475691,0.731433,0.725574,0.469321,1.006569
min,0.75,0.0,0.0,0.0,0.0
25%,23.5,1.0,0.0,0.0,0.0
50%,28.0,2.0,0.0,1.0,0.0
75%,36.0,2.0,0.0,1.0,1.0
max,74.0,2.0,5.0,1.0,8.0


In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Random Forest 
rf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {
  'n_estimators': [4, 6, 9], 
  'max_features': ['log2', 'sqrt','auto'], 
  'criterion': ['entropy', 'gini'],
  'max_depth': [2, 3, 5, 10], 
  'min_samples_split': [2, 3, 5],
  'min_samples_leaf': [1,5,8]
}

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(rf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the rf to the best combination of parameters
rf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=6, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
predictions = rf.predict(X_test)
print(accuracy_score(y_test, predictions))


0.7932960893854749


In [46]:
X_test_kaggle = pd.read_csv('data/test.csv')

In [47]:
X_test_kaggle = X_test_kaggle[explanatory_vars]

In [48]:
fill_gaps(X_test_kaggle)
encode_variables(X_test_kaggle)
X_test_kaggle.sample(5)

Unnamed: 0,Age,Embarked,Parch,Sex,SibSp
22,27.0,2,0,0,0
70,24.0,1,0,0,0
331,39.0,0,0,1,0
387,57.0,2,0,1,0
90,22.0,2,0,0,1


In [49]:
rf.predict(X_test_kaggle)  # accuracy is 0.77990

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [50]:
from sklearn.externals import joblib

joblib.dump(rf, 'models/random_forest.pkl')

['models/random_forest.pkl']

In [51]:
from sklearn import neighbors, datasets
knn = neighbors.KNeighborsClassifier()

In [52]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
parameters = {
  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
  'leaf_size': list(range(10, 50, 5)), 
  'metric': ['minkowski', 'euclidean'], 
  'p': [1, 2, 3], 
  'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10],
  'weights': ['uniform', 'distance']
}

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(knn, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the rf to the best combination of parameters
knn = grid_obj.best_estimator_



In [None]:
knn.fit(X_train, y_train)
joblib.dump(knn, 'models/knn.pkl')

In [None]:
predictions = knn.predict(X_test)
print(accuracy_score(y_test, predictions))

In [None]:
knn.predict(X_test_kaggle)  # accuracy is 0.61722

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

logreg.fit(X_train, y_train)
predictions = logreg.predict(X_test)
print(accuracy_score(y_test, predictions))

In [None]:
logreg.fit(X_train, y_train)

In [None]:
logreg.predict(X_test_kaggle) # 0.77033

In [None]:
joblib.dump(logreg, 'models/logistic_regression.pkl')