In [204]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

train = pd.read_csv('/kaggle/input/titanic-competition/train.csv')
test = pd.read_csv('/kaggle/input/titanic-competition/test.csv')
joined = pd.concat([train.drop('Survived', axis=1), test])
print(joined.shape)

train.isnull().sum()

(1309, 11)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [205]:
from scipy.stats import pearsonr
from pandas.api.types import is_numeric_dtype
import re

joined2, train2, test2 = joined.copy(), train.copy(), test.copy()

# Get titles and fare first, since we will use them to impute age
common_titles = ['Mr', 'Mrs', 'Miss', 'Master', 'Doctor', 'Rev']
def get_titles_and_fare(df):
    df['Fare'] = df['Fare'].fillna(joined2['Fare'].mean())
    df['FareRank'] = df['Fare'].rank()
    for title in common_titles:
        df[title] = df['Name'].apply(lambda x: 1 if title == x.split(', ')[1].split('. ')[0] else 0)
    return df

joined2 = get_titles_and_fare(joined2)
train2 = get_titles_and_fare(train2)
test2 = get_titles_and_fare(test2)

# Create groupby objects for imputing age
title_gs = [joined2.groupby(['Pclass', title])['Age'] for title in common_titles]
class_g = joined2.groupby('Pclass')['Age']

def process_features(df):
    def impute_age(row):
        if np.isnan(row['Age']):
            row['Age'] = class_g.get_group(row['Pclass']).agg('mean')
            for i, title in enumerate(common_titles):
                if row[title] == 1:
                    row['Age'] = title_gs[i].get_group((row['Pclass'], 1)).agg('mean')
        return row
    df = df.apply(impute_age, axis=1)
    df['Wife'] = df['Name'].apply(lambda x: 1 if re.search(r"\(.+\)", x) else 0)
    df['Embarked'] = df['Embarked'].fillna('S').apply(lambda x: ['S', 'C', 'Q'].index(x))
    df['Sex'] = df['Sex'].apply(lambda x: 0 if x == 'male' else 1)
    df['Cabin Mates'] = df.groupby('Cabin')['Cabin'].transform('count').fillna(0)
    df['Cabin'] = df['Cabin'].fillna(0).apply(lambda x: x if x == 0 else ord(x[0]) - 64)
    df['Fam'] = df['Parch'] + train_df['SibSp']
    return df

train2 = process_features(train2)
test2 = process_features(test2)
test2.isnull().sum()


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
FareRank       0
Mr             0
Mrs            0
Miss           0
Master         0
Doctor         0
Rev            0
Wife           0
Cabin Mates    0
Fam            0
dtype: int64

In [281]:
from sklearn.model_selection import cross_val_score 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

train3 = train2.copy()
test3 = test2.copy()

cols = ['Pclass', 'Fam', 'Parch', 
        'SibSp', 'Age',
        'FareRank', 'Cabin Mates',
       'Sex', 
       'Embarked']

X = train3[cols]
X_test = test3[cols]
X = scaler.fit_transform(X)
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)
y = train3['Survived']

In [272]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import GradientBoostingClassifier
import warnings

# warnings.filterwarnings('ignore')

models = [
    {
        'name': 'Random Forest',
        'model': RandomForestClassifier(),
        'params': {'max_depth': [2, 5, 10],
          'min_samples_leaf': [3, 4],
         'n_estimators': [100, 150]}
    },
    {
        'name': 'Gradient Boosting',
        'model': GradientBoostingClassifier(),
        'params': {'n_estimators': [100],  
              'max_depth': [5], 
              'min_samples_leaf': [5],
              'learning_rate': [.001, .01, .1, 1],
             'subsample': [.5]} 
    },
#     {
#         'name': 'Logistic Regression',
#         'model': LogisticRegression(max_iter=10000),
#         'params': {'penalty': [None, 'l2'], 
#           'C': [.5, 1, 2]}
#     },
#     {
#         'name': 'Linear Support Vector Machine',
#         'model': LinearSVC(max_iter=100000, dual=True),
#         'params': {'penalty': ['l2', 'l1'], 
#           'loss': ['hinge', 'hinge_squared'],
#           'C': [.01, .1, 1]}
#     },
#     {
#         'name': 'Support Vector Machine',
#         'model': SVC(),
#         'params': {'C': [100, 200, 500],  
#               'gamma': [.1, .03], 
#               'kernel': ['rbf']}
#     }
]
for model in models:
    print('Model:', model['name'])
    gs = GridSearchCV(estimator=model['model'],
                      param_grid=model['params'],
                      verbose=True, cv=5,
                      scoring='accuracy')
    gs.fit(X, y)
    results = gs.cv_results_
    print('Best Score:', gs.best_score_, 'Best Params:', gs.best_params_)

Model: Random Forest
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Score: 0.8294017952419811 Best Params: {'max_depth': 5, 'min_samples_leaf': 3, 'n_estimators': 150}
Model: Gradient Boosting
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Score: 0.8417362375243236 Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 100, 'subsample': 0.5}


In [282]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
# Combine models

rf_clf = RandomForestClassifier(max_depth=55, min_samples_leaf=3, n_estimators=150, random_state=0)
log_reg = LogisticRegression(max_iter=10000, C=1, penalty='l2', random_state=0)
sv_clf = SVC(C=100, gamma=.03, kernel='rbf', probability=True)
gb_clf = GradientBoostingClassifier(max_depth=5, min_samples_leaf=5, n_estimators=100, learning_rate=.1, subsample=.5, random_state=0)
lgbm_clf = LGBMClassifier(random_state=10)
st_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=10000, C=1, penalty='l2', random_state=0)),
        ('gb', GradientBoostingClassifier(max_depth=5, min_samples_leaf=5, n_estimators=100, learning_rate=.01, subsample=.5, random_state=0)),
        ('rf', RandomForestClassifier(max_depth=15, min_samples_leaf=3, n_estimators=150, random_state=0)),
        ('svc', SVC(C=100, gamma=.03, kernel='rbf', probability=True))
    ],
    final_estimator=LogisticRegression(max_iter=10000, C=1, penalty='l2', random_state=0), cv=3
)

log_reg.fit(X, y)
sv_clf.fit(X, y)
gb_clf.fit(X, y)
st_clf.fit(X, y)
rf_clf.fit(X, y)
lgbm_clf.fit(X, y)

# y_pred_lr = log_reg.predict(X)
# y_pred_svc = sv_clf.predict(X)
# y_pred_lr_proba = log_reg.predict_proba(X)
# y_pred_svc_proba = sv_clf.predict_proba(X)
# y_pred_weighted = (y_pred_lr_proba[:,1] + y_pred_svc_proba[:,1]) // 1
y_pred_stacked = st_clf.predict(X)
y_pred_gb = gb_clf.predict(X)
y_pred_rf = rf_clf.predict(X)
y_pred_lgbm = lgbm_clf.predict(X)

lr = log_reg.predict(X_test)
sv = sv_clf.predict(X_test)
gb = gb_clf.predict(X_test)
rf = rf_clf.predict(X_test)
lgbm = lgbm_clf.predict(X_test)

y_final_predict = (lr + sv + gb + rf + lgbm) // 3

# print('Logreg model:')
# print(confusion_matrix(y_pred_lr, y))
# print(accuracy_score(y_pred_lr, y))
# print('SVC model:')
# print(confusion_matrix(y_pred_svc, y))
# print(accuracy_score(y_pred_svc, y))
# print('Weighted model:')
# print(confusion_matrix(y_pred_weighted, y))
# print(accuracy_score(y_pred_weighted, y))
print('Stacked model:')
print(confusion_matrix(y_pred_stacked, y))
print(accuracy_score(y_pred_stacked, y))
print('Gradient Boosted model:')
print(confusion_matrix(y_pred_gb, y))
print(accuracy_score(y_pred_gb, y))
print('Random Forest model:')
print(confusion_matrix(y_pred_rf, y))
print(accuracy_score(y_pred_rf, y))
print('LGBM model:')
print(confusion_matrix(y_pred_lgbm, y))
print(accuracy_score(y_pred_lgbm, y))
y_final_predict

Stacked model:
[[513  75]
 [ 36 267]]
0.8754208754208754
Gradient Boosted model:
[[529  46]
 [ 20 296]]
0.9259259259259259
Random Forest model:
[[526  60]
 [ 23 282]]
0.9068462401795735
LGBM model:
[[537  24]
 [ 12 318]]
0.9595959595959596


array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [283]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_final_predict})
output.to_csv('submission16.csv', index=False)