In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


Get the data

In [2]:
## Titanic competition in Kaggle
## data downloaded from Kaggle.
import pandas
from sklearn.model_selection import train_test_split
import numpy as np

train = pandas.read_csv('/kaggle/input/titanic/train.csv')
train.columns  

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
'''
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
'''  
surname = []
title = []
for txt in train['Name']:
      txt1 = txt.split(',')[0]
      txt2 = (txt.split(",")[1]).split(".")[0]
      surname.append(txt1)
      title.append(txt2)

train['Surname'] = pandas.DataFrame(surname)
train['Title'] = pandas.DataFrame(title, dtype='category')

# There are too many missing values in "Cabin" with uncertain impact 
# to the accuracy after imputation. (It seems improve a little bit 
# after a comparison.)

X = train.drop(['Survived','Name'],axis=1)
y = (train.copy())['Survived']

# Split the (training) data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4686)

# missing values in X_train
np.sum(X_train.isna(), axis=0)

# Fix missing values with means of corresponding group.
# age => Title and Sex
# Embarked => Pclass and Sex.
# Imputation after splitting, in order not to overfit. 
X_train["Age"] = X_train["Age"].fillna(X_train.groupby(['Sex','Title'])['Age'].transform('mean'))
X_test["Age"] = X_test["Age"].fillna(X_test.groupby(['Sex','Title'])['Age'].transform('mean'))

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler

Impute1 = SimpleImputer(strategy='constant')
Impute2 = SimpleImputer(strategy="most_frequent")
Encode = OneHotEncoder(handle_unknown="ignore")
Scale = StandardScaler()

Embarked = ['Embarked']
Cat_attribs = ['Pclass', 'Surname','Title', 'Sex', 'Ticket', 'Cabin']
Age = ['Age']
Num_attribs = ['Fare']

num_pipiline1 = Pipeline(steps=[
    ('scale1', StandardScaler())
])

num_pipiline2 = Pipeline(steps=[
      ('poly', PolynomialFeatures()),
    ('scale1', StandardScaler())
])

cat_pipeline1 = Pipeline(steps=[
    ('impute2', Impute2),
    ('enco', Encode)
])

cat_pipeline2 = Pipeline(steps=[ 
      ('impute1', Impute1),
    ('encode', Encode)
])

total_pipeline = ColumnTransformer(transformers=[
    ('num_imp', num_pipiline1, Age),
    ('num_scale', num_pipiline2, Num_attribs),
    ('cat_imp', cat_pipeline1, Embarked),
    ('cat_encode', cat_pipeline2, Cat_attribs)
], remainder="passthrough")

'''
Logistic regression model.
'''
Logit = Pipeline(steps=[ 
    ('preprocess', total_pipeline),
    ('logit', LogisticRegression(penalty='l2', C=1, max_iter=1000, solver="liblinear") )
]
)

titanic_logit = Logit.fit(X_train, y_train)
titanic_pred = Logit.predict(X_train)
print(classification_report(y_train, titanic_pred))
# accuracy 0.94

cross_logit = cross_val_score(Logit, X_train, y_train, cv=10)
np.mean(cross_logit)
# ~ 0.84

dist = {
      'preprocess__num_scale__poly__degree': range(2,4),
    'logit__C': range(0,1000,50), 
    'logit__penalty': ['l2','l1']
}

lr_clf = RandomizedSearchCV(Logit, param_distributions=dist, n_iter=10, cv=3)

lr_clf.fit(X_train, y_train)

pred_lg2 = lr_clf.predict(X_train)
print(classification_report(y_train, pred_lg2))
# 1 !!

cross_lg2 = cross_val_score(lr_clf, X_train, y_train, cv=10)
np.mean(cross_lg2)
# 0.842740841248304 Overfitting.

Logit2 = Pipeline(steps=[ 
    ('preprocess', total_pipeline),
    ('logit', LogisticRegression(penalty='l2', C=1000, max_iter=1000, solver="liblinear") )
]
)

titanic_logit2 = Logit2.fit(X_train, y_train)
titanic_pred2 = Logit2.predict(X_train)
print(classification_report(y_train, titanic_pred2))
# accuracy 1
print(np.mean(cross_val_score(Logit2, X_train, y_train, cv=10)))
# ~0.84
print(classification_report(y_test, Logit2.predict(X_test)))
# ~0.85

'''
Support Vector Machine
'''
from sklearn.svm import SVC

svc = Pipeline(steps=[ 
    ('preprocess', total_pipeline),
    ('svm', SVC(C=100, kernel="linear"))
])

titan_svc = svc.fit(X_train, y_train)
pred_svc = svc.predict(X_train)
print(classification_report(y_train, pred_svc))
## 0.98.  

'''
dist_svc= { 
    "svm__C": range(1, 1000, 100),
    "svm__kernel": ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
}

svc_chosen = RandomizedSearchCV(svc, param_distributions=dist_svc, n_iter=50, random_state=4686)
svc_chosen.fit(X_train, y_train)

# svc_chosen.best_params_
#{'svm__kernel': 'linear', 'svm__C': 1}
# It takes 57 minutes. n*m*log_2(m)
'''
svc_chosen = Pipeline(steps=[ 
    ('preprocess', total_pipeline),
    ('svm', SVC(C=1, kernel="linear"))
])

svc_chosen.fit(X_train, y_train)
pred_svc_rs = svc_chosen.predict(X_train)
print(classification_report(y_train, pred_svc_rs))
# 1 
print(np.mean(cross_val_score(svc_chosen, X_train, y_train, cv=5)))
# 0.83826

print(classification_report(y_test, svc_chosen.predict(X_test)))
#0.84


'''
Random Forest
'''

random_forest = Pipeline(steps=[ 
    ('preprocess', total_pipeline),
    ('rf', RandomForestClassifier(criterion="entropy", min_samples_leaf=1, random_state=4686))
])

titan_rf = random_forest.fit(X_train, y_train)
pred_rf = random_forest.predict(X_train)
print(classification_report(y_train, pred_rf))
#  1
np.mean(cross_val_score(random_forest, X_train, y_train, cv=10))
# ~0.826. 
# own test data.
print(classification_report(y_test, random_forest.predict(X_test)))
# 0.84

dist_rf = {
      'preprocess__num_scale__poly__degree': range(2,4),
      'rf__criterion': ['gini','entropy'],
      'rf__max_depth': np.arange(5, 10),
      'rf__min_samples_leaf': np.arange(1, 15, 2)
}

rf_chosen = GridSearchCV(random_forest, param_grid=dist_rf, cv=10)
rf_chosen.fit(X_train, y_train)
print(classification_report(y_train, rf_chosen.predict(X_train)))
#0.88
'''
{'preprocess__num_scale__poly__degree': 2,
 'rf__criterion': 'gini',
 'rf__max_depth': 9,
 'rf__min_samples_leaf': 1}
'''

print(classification_report(y_test, rf_chosen.predict(X_test)))
#0.80 Not as good as the default model above; thus not used to 

# K-Nearest Neighbours
from sklearn.neighbors import KNeighborsClassifier

knc = Pipeline(steps=[ 
    ('preprocess', total_pipeline),
    ('knclf', KNeighborsClassifier())
])

knc.fit(X_train, y_train)
pred_knc = knc.predict(X_train)
print(classification_report(y_train, pred_knc))
#0.73

disc_knc = {
     'preprocess__num_scale__poly__degree': range(2,4),
    "knclf__n_neighbors": range(0,100,5),
    "knclf__weights":['uniform','distance']
}

knc_grid = RandomizedSearchCV(knc, param_distributions=disc_knc, cv=50, random_state=4686)
knc_grid.fit(X_train, y_train)
# 'preprocess__num_scale__poly__degree': 3, 'knclf__weights': 'distance', 'knclf__n_neighbors': 60

pred_knc_grid = knc_grid.predict(X_train)
print(classification_report(y_train, pred_knc_grid))
# 1

np.mean(cross_val_score(knc_grid, X_train, y_train, cv=10), 0)
# ~ 0.6. knc_grid much overfitting.

print(classification_report(y_test, knc_grid.predict(X_test)))
#0.59

# Try ensemble algorithm such as Voting classifiers.

from sklearn.ensemble import VotingClassifier

# Use the accuracies for test data as the weights
vote_clf = VotingClassifier(
    estimators= [('lr', lr_clf), ('knc', knc_grid), ('svc', svc_chosen), ('rf', random_forest)],
    voting='hard'
)

vote_clf.fit(X_train, y_train)
pred_vote = vote_clf.predict(X_train)
print(classification_report(y_train, pred_vote))
# 1
np.mean(cross_val_score(vote_clf, X_train, y_train, cv=10), 0)
# 0.8292175486205338
pred_vote_test = vote_clf.predict(X_test)
print(classification_report(y_test, pred_vote_test))
#0.83

vote_clf2 = VotingClassifier(
    estimators= [('lr', lr_clf), ('svc', svc_chosen), ('rf', random_forest)],
    voting='hard'
)

vote_clf2.fit(X_train, y_train)
pred_vote2 = vote_clf2.predict(X_train)
print(classification_report(y_train, pred_vote2))
#
np.mean(cross_val_score(vote_clf2, X_train, y_train, cv=10), 0)
#0.8442107643600181
print(classification_report(y_test, vote_clf2.predict(X_test)))
#0.84


              precision    recall  f1-score   support

           0       0.92      0.98      0.95       407
           1       0.96      0.87      0.92       261

    accuracy                           0.94       668
   macro avg       0.94      0.93      0.93       668
weighted avg       0.94      0.94      0.94       668

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       407
           1       1.00      1.00      1.00       261

    accuracy                           1.00       668
   macro avg       1.00      1.00      1.00       668
weighted avg       1.00      1.00      1.00       668



3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1541, in fit
    sample_weight=sample_weight,
  File "/opt/conda/lib/python3.7/site-packages/sklearn/svm/_base.py", line 1198, in _fit_liblinear
    sample_weight,
  File "sklearn/svm/_li

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       407
           1       1.00      1.00      1.00       261

    accuracy                           1.00       668
   macro avg       1.00      1.00      1.00       668
weighted avg       1.00      1.00      1.00       668

0.8487109905020352
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       142
           1       0.76      0.79      0.78        81

    accuracy                           0.83       223
   macro avg       0.82      0.82      0.82       223
weighted avg       0.84      0.83      0.83       223

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       407
           1       0.98      0.97      0.98       261

    accuracy                           0.98       668
   macro avg       0.98      0.98      0.98       668
weighted avg       0.98      0.98      0.98       668



3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1541, in fit
    sample_weight=sample_weight,
  File "/opt/conda/lib/python3.7/site-packages/sklearn/svm/_base.py", line 1198, in _fit_liblinear
    sample_weight,
  File "sklearn/svm/_li

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       407
           1       1.00      1.00      1.00       261

    accuracy                           1.00       668
   macro avg       1.00      1.00      1.00       668
weighted avg       1.00      1.00      1.00       668



3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1541, in fit
    sample_weight=sample_weight,
  File "/opt/conda/lib/python3.7/site-packages/sklearn/svm/_base.py", line 1198, in _fit_liblinear
    sample_weight,
  File "sklearn/svm/_li

              precision    recall  f1-score   support

           0       0.85      0.92      0.89       142
           1       0.84      0.72      0.77        81

    accuracy                           0.85       223
   macro avg       0.85      0.82      0.83       223
weighted avg       0.85      0.85      0.84       223



3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1541, in fit
    sample_weight=sample_weight,
  File "/opt/conda/lib/python3.7/site-packages/sklearn/svm/_base.py", line 1198, in _fit_liblinear
    sample_weight,
  File "sklearn/svm/_li

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       407
           1       1.00      1.00      1.00       261

    accuracy                           1.00       668
   macro avg       1.00      1.00      1.00       668
weighted avg       1.00      1.00      1.00       668



3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1541, in fit
    sample_weight=sample_weight,
  File "/opt/conda/lib/python3.7/site-packages/sklearn/svm/_base.py", line 1198, in _fit_liblinear
    sample_weight,
  File "sklearn/svm/_li

              precision    recall  f1-score   support

           0       0.88      0.87      0.88       142
           1       0.77      0.80      0.79        81

    accuracy                           0.84       223
   macro avg       0.83      0.83      0.83       223
weighted avg       0.84      0.84      0.84       223



In [4]:
test = pandas.read_csv('/kaggle/input/titanic/test.csv')
# pred_test = random_forest.predict(test)
np.sum(test.isna(),0)
# Error: "Fare" has a missing value, in contrast to X_train and X_test.

surname = []
title = []
for txt in test['Name']:
      txt1 = txt.split(',')[0]
      txt2 = (txt.split(",")[1]).split(".")[0]
      surname.append(txt1)
      title.append(txt2)

test['Surname'] = pandas.DataFrame(surname)
test['Title'] = pandas.DataFrame(title, dtype='category')
test["Age"] = test["Age"].fillna(test.groupby(['Sex','Title'])['Age'].transform('mean'))
# Not all missing values can be filled by group-by means; some are missing.
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test.groupby(['Pclass'], dropna=True)['Fare'].transform('mean'))

test2 = test.drop(['Name'], axis=1)
pred_test = vote_clf2.predict(test2)

res1 = (test.copy())['PassengerId']
result = (pandas.DataFrame(res1)).assign(Survived = pred_test)
result.to_csv('submission.csv',index=False)

## 