In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import sys
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

sys.path.append('/Users/alexkelber/Documents/DataScience')
warnings.filterwarnings('ignore')

from custom_ml_utils.compare_models import ModelComparer

train, test = pd.read_csv('input/train.csv'), pd.read_csv('input/test.csv')

In [82]:
full = pd.concat([train.drop('Transported', axis=1), test], ignore_index=True)

# Get group and group size from PassengerId
full['Group'] = full['PassengerId'].apply(lambda x: x[:4])
full['GroupSize'] = full.groupby(['Group'])['PassengerId'].transform('count')

# Get Cabin deck and Cabin ship side from Cabin
def parse_cabin_info(row):
    if not pd.isnull(row['Cabin']):
        row['CabinDeck'] = ord(row['Cabin'].split('/')[0]) - 65
        row['CabinPort'] = 1 if row['Cabin'][-1] == 'P' else 0
    return row
full = full.apply(parse_cabin_info, axis=1)

# Convert boolean columns to numeric
for col in ['CryoSleep', 'VIP']:
    full[col] = full[col].replace({True: 1, False: 0})

# Get group and total information for various features:
#   1. Mode imputation categories
mode_categories = ['CryoSleep', 'VIP', 'HomePlanet', 'Destination', 'CabinDeck', 'CabinPort']
group_modes = [full.groupby('Group')[col].agg(pd.Series.mode) for col in mode_categories]
overall_modes = [full[col].mode()[0] for col in mode_categories]

#   2. Mean imputation categories (spending categories)
mean_categories = ['RoomService', 'Spa', 'VRDeck', 'FoodCourt', 'ShoppingMall']
group_means = [full.groupby('Group')[col].agg('mean') for col in mean_categories]
overall_means = [full[col].mean() for col in mean_categories]

# Impute values for all these categories based on group
def impute_data(row):
    for i, col in enumerate(mode_categories):
        if pd.isnull(row[col]):
            val = group_modes[i].loc[row['Group']]
            if not isinstance(val, str) and not isinstance(val, float):
                val = val[0] if len(val) > 0 else None
            if not isinstance(val, str) and not isinstance(val, float) and (val is None or np.isnan(val)):
                val = overall_modes[i]
            row[col] = val
    for i, col in enumerate(mean_categories):
        if pd.isnull(row[col]):
            if row['CryoSleep'] == 0:
                row[col] = 0
            else:
                val = group_means[i].loc[row['Group']]
                if np.isnan(val):
                    val = overall_means[i]
                row[col] = val
    return row
full = full.apply(impute_data, axis=1)

# Convert spending categories into bins
for col in mean_categories:
    full[col].replace(to_replace=0, value=np.nan, inplace=True)
    full[col] = pd.qcut(full[col], 10, labels=False).fillna(-1)

# Get mean age for each Room Service bin and use this to impute age
age_by_rs = full.groupby('RoomService')['Age'].agg('mean')
def impute_age(row):
    if pd.isnull(row['Age']):
        row['Age'] = age_by_rs.loc[row['RoomService']]
    return row
full = full.apply(impute_age, axis=1)

# Convert planet columns to ordinal encoding
planet_ordinal_map = {
    'Earth': 0,
    'Mars': 1,
    'Europa': 2,
    '55 Cancri e': 0,
    'PSO J318.5-22': 1,
    'TRAPPIST-1e': 2 
}
for col in ['HomePlanet', 'Destination']:
    full[col] = full[col].apply(lambda x: planet_ordinal_map[x])

# Drop unnecessary columns

full = full.drop(['Name', 'Group', 'PassengerId', 'Cabin'], axis=1)
full.isnull().sum()

Age             0
CabinDeck       0
CabinPort       0
CryoSleep       0
Destination     0
FoodCourt       0
GroupSize       0
HomePlanet      0
RoomService     0
ShoppingMall    0
Spa             0
VIP             0
VRDeck          0
dtype: int64

In [28]:
full.head()

Unnamed: 0,Age,CabinDeck,CabinPort,CryoSleep,Destination,FoodCourt,GroupSize,HomePlanet,RoomService,ShoppingMall,Spa,VIP,VRDeck
0,39.0,1.0,1.0,0.0,2,-1.0,1,2,-1.0,-1.0,-1.0,0.0,-1.0
1,24.0,5.0,0.0,0.0,2,1.0,1,0,3.0,2.0,6.0,0.0,2.0
2,58.0,0.0,0.0,0.0,2,9.0,2,2,2.0,-1.0,9.0,1.0,2.0
3,33.0,0.0,0.0,0.0,2,7.0,2,2,-1.0,6.0,9.0,0.0,4.0
4,16.0,5.0,0.0,0.0,2,2.0,1,0,5.0,4.0,6.0,0.0,0.0


In [85]:
train2 = full[:train.shape[0]]
test2 = full[train.shape[0]:]
y = train['Transported']

X = StandardScaler().fit_transform(train2)
X, X_test, y, y_test = train_test_split(X, y, test_size=1000, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=1000, random_state=0)

models = [
    ('XGB', XGBClassifier(gamma=5, subsample=0.7, max_depth=6, reg_lambda=1, alpha=0.01)),
    ('Log Reg', LogisticRegression(penalty='l2', C=.1)),
    ('SVC', SVC(kernel='poly', degree=3, gamma=.1, C=.1, probability=True)),
    ('KNN', KNeighborsClassifier(n_neighbors=25)),
    ('Decision', DecisionTreeClassifier(max_depth=8, min_samples_leaf=5)),
    ('Forest', RandomForestClassifier(max_depth=5, min_samples_leaf=5, n_estimators=50)),
    ('GB', GradientBoostingClassifier(max_depth=5, min_samples_leaf=5))
]
predictions = []
for name, model in models:
    model.fit(X_train, y_train)
    # train_pred = model.predict(X_train)
    valid_pred = model.predict_proba(X_valid)
    predictions.append((name, valid_pred[:,1]))
    # print('Training Accuracy:', accuracy_score(train_pred, y_train))
    # print('Validation Accuracy:', accuracy_score(valid_pred, y_valid))
    # print('Validation Confusion Matrix:')
    # print(confusion_matrix(valid_pred, y_valid))
    # print('\n\n')
mc = ModelComparer(y_true=y_valid, probs=predictions)
results = mc.evaluatate_soft_voting(splits=3)
results.head(10)

Unnamed: 0,Model(s) used,Full score,Fold 0,Fold 1,Fold 2
12,"XGB, GB",0.811,0.805389,0.813814,0.813814
51,"Log Reg, Decision, GB",0.81,0.808383,0.813814,0.807808
36,"XGB, SVC, GB",0.808,0.799401,0.813814,0.810811
77,"XGB, SVC, Decision, GB",0.807,0.793413,0.807808,0.81982
82,"XGB, Decision, Forest, GB",0.807,0.793413,0.810811,0.816817
57,"SVC, Decision, GB",0.807,0.805389,0.801802,0.813814
71,"XGB, Log Reg, Decision, GB",0.807,0.802395,0.810811,0.807808
32,"XGB, Log Reg, GB",0.806,0.802395,0.804805,0.810811
81,"XGB, KNN, Forest, GB",0.806,0.787425,0.81982,0.810811
112,"XGB, KNN, Decision, Forest, GB",0.806,0.784431,0.816817,0.816817


In [90]:
st_clf = StackingClassifier(
    estimators=[
        ('xgc', XGBClassifier(gamma=5, subsample=0.7, max_depth=6, reg_lambda=1, alpha=0.01)),
        ('gb', GradientBoostingClassifier(max_depth=5, min_samples_leaf=5))
    ],
    final_estimator=LogisticRegression(penalty='l2', C=.1), cv=5
)
st_clf.fit(X, y)
y_test_pred = st_clf.predict(X_test)
accuracy_score(y_test_pred, y_test)

0.798

In [91]:
test2 = StandardScaler().fit_transform(test2)
y_final_predictions = st_clf.predict(test2)
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': y_final_predictions})
output.to_csv('submission3.csv', index=False)

In [42]:
params = {'gamma': [5], 
          'subsample': [.7],
          'sampling_method': ['uniform'],
          'lambda': [.5, 1],
          'alpha': [.001, .01,],
          'max_depth': [5, 6, 7, 8]
          }
xgb_clf = XGBClassifier()
gs = GridSearchCV(estimator=xgb_clf, 
                           param_grid=params,
                           scoring='accuracy',
                           cv=3)
gs.fit(X, y)
print('Best Score:', gs.best_score_, 'Best Params:', gs.best_params_)

Best Score: 0.7987777483808904 Best Params: {'alpha': 0.01, 'gamma': 5, 'lambda': 1, 'max_depth': 6, 'sampling_method': 'uniform', 'subsample': 0.7}


In [55]:
params = {'n_estimators': [50], 
          'subsample': [1],
          'min_samples_leaf': [1, 3, 5],
          'max_depth': [2, 5, 8]
          }
gb_clf = GradientBoostingClassifier()
gs = GridSearchCV(estimator=gb_clf, 
                           param_grid=params,
                           scoring='accuracy',
                           cv=3)
gs.fit(X, y)
print('Best Score:', gs.best_score_, 'Best Params:', gs.best_params_)

Best Score: 0.7969577769465555 Best Params: {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 50, 'subsample': 1}


In [78]:
params = {'min_samples_leaf': [5, 10],
          'max_depth': [8, 10, 15]
          }
dt_clf = DecisionTreeClassifier()
gs = GridSearchCV(estimator=dt_clf, 
                           param_grid=params,
                           scoring='accuracy',
                           cv=5)
gs.fit(X, y)
print('Best Score:', gs.best_score_, 'Best Params:', gs.best_params_)

Best Score: 0.7799296319110158 Best Params: {'max_depth': 8, 'min_samples_leaf': 5}


In [58]:
params = {'n_neighbors': [15, 20, 25, 30]}
kn_clf = KNeighborsClassifier()
gs = GridSearchCV(estimator=kn_clf, 
                           param_grid=params,
                           scoring='accuracy',
                           cv=3)
gs.fit(X, y)
print('Best Score:', gs.best_score_, 'Best Params:', gs.best_params_)

Best Score: 0.7773291204552666 Best Params: {'n_neighbors': 25}


In [61]:
params = {'C': [0, .001, .01, .1, 1, 10, 100]}
log_reg = LogisticRegression(penalty='l2')
gs = GridSearchCV(estimator=log_reg, 
                           param_grid=params,
                           scoring='accuracy',
                           cv=3)
gs.fit(X, y)
print('Best Score:', gs.best_score_, 'Best Params:', gs.best_params_)

Best Score: 0.7778490398875215 Best Params: {'C': 0.1}


In [62]:
params = {'n_estimators': [50, 100, 200], 
          'min_samples_leaf': [3, 5],
          'max_depth': [2, 5, 10, 15]
          }
gb_clf = GradientBoostingClassifier()
gs = GridSearchCV(estimator=gb_clf, 
                           param_grid=params,
                           scoring='accuracy',
                           cv=3)
gs.fit(X, y)
print('Best Score:', gs.best_score_, 'Best Params:', gs.best_params_)

Best Score: 0.7960480953351196 Best Params: {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 50}


In [65]:
rf_clf = RandomForestClassifier(max_depth=5, min_samples_leaf=5, n_estimators=50)
rf_clf.fit(X, y)
feature_names = [col for col in train2.columns]
importances = pd.Series(rf_clf.feature_importances_, index=feature_names).sort_values(ascending=False)
importances

CryoSleep       0.231544
RoomService     0.161912
VRDeck          0.158633
Spa             0.150215
FoodCourt       0.079624
HomePlanet      0.072895
ShoppingMall    0.059430
CabinDeck       0.041079
Age             0.026090
GroupSize       0.006576
CabinPort       0.005922
Destination     0.005504
VIP             0.000577
dtype: float64