In [682]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [683]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [684]:
# Only the labelled training file is loaded here; reading test.csv is left disabled.
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')
#test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

In [685]:
# Preview the first rows of the training frame.
train_df.head()


In [686]:
# Summary statistics of the training frame.
train_df.describe()

In [687]:
# (rows, columns) of the training frame.
train_df.shape


In [688]:
# Column dtypes and non-null counts, to spot columns with missing data.
train_df.info()

In [689]:
# List the distinct labels present in the 'Survived' target column.
pd.unique(train_df['Survived'])


In [690]:
# Number of passengers per target label (class balance).
train_df['Survived'].value_counts()
# 0: Not survived
# 1: Survived

In [691]:
# Bar chart of how many passengers fall under each 'Survived' label.
sns.countplot(x='Survived', data=train_df)

In [692]:
# Distinct passenger-class values present in the data.
train_df['Pclass'].unique()

In [693]:
# Compare, per passenger class, the total head count with the number of
# surviving males and surviving females.
is_survivor = train_df['Survived'] == 1
survived_m = train_df[is_survivor & (train_df['Sex'] == 'male')]
survived_f = train_df[is_survivor & (train_df['Sex'] == 'female')]

# The class ticks sit at 1, 2 and 3. Half-integer bin edges centre each group
# of bars on its own tick; `bins=3` instead split the range [1, 3] into thirds,
# so the bars for classes 1 and 3 were drawn away from their tick labels.
pclass_ticks = [1, 2, 3]
pclass_bin_edges = [0.5, 1.5, 2.5, 3.5]

plt.hist(
    [train_df['Pclass'], survived_m['Pclass'], survived_f['Pclass']],
    bins=pclass_bin_edges,
)
plt.legend(['Total', 'Survived Males', 'Survived Females'])
plt.xlabel('Passenger class')
plt.ylabel('Number of people')
plt.xticks(pclass_ticks)
plt.show()


In [694]:
# Pairwise correlation of the numeric features.
# Non-numeric columns are excluded explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops them silently and raises on string columns instead.
train_df.select_dtypes(include='number').corr()

In [695]:
# Heatmap of the correlation matrix on a diverging palette centred at 0.
cmap = sns.diverging_palette(h_neg=10, h_pos=240, as_cmap=True)

# Restrict to numeric columns: pandas >= 2.0 raises if `corr()` is called on a
# frame that still contains string columns.
numeric_corr = train_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, center=0, cmap=cmap, linewidths=1, annot=True, fmt=".2f")


In [696]:
# Same heatmap with the redundant half hidden.
# The correlation matrix is computed once, on numeric columns only (pandas >= 2.0
# raises if `corr()` sees string columns).
numeric_corr = train_df.select_dtypes(include='number').corr()

# Boolean mask that is True on the diagonal and above; seaborn hides masked cells.
mask = np.triu(np.ones(numeric_corr.shape, dtype=bool))
sns.heatmap(numeric_corr, mask=mask, center=0, cmap=cmap, linewidths=1, annot=True, fmt=".2f")

In [697]:
# Observation from the heatmap: no pair of features appears highly correlated.
# Next, count the missing values in each column.
train_df.isna().sum()

In [698]:
# Fraction of missing values per column (mean of the boolean NaN indicator).
train_df.isna().mean()

In [699]:
# 'Cabin' has a considerable share of missing values, so build a per-column flag:
# True marks a column whose missing-value ratio is below the 0.3 threshold.
missing_ratio = train_df.isna().sum() / len(train_df)
mask = missing_ratio < 0.3
print(mask)


In [700]:
#retrain_df = train_df.loc[:,mask]

In [701]:
# Remove the columns that are not used as model inputs in this notebook.
to_drop = ['Name', 'Ticket', 'Cabin', 'Embarked']
retrain_df = train_df.drop(columns=to_drop)
retrain_df.head()

In [702]:
# Missing values remaining per column after dropping the unused columns.
retrain_df.isna().sum()

In [703]:
# Binary-encode 'Sex': 1 for 'male', 0 for every other value.
retrain_df['Sex_enc'] = (retrain_df['Sex'] == 'male').astype('int64')
retrain_df.head()

In [704]:
# Age histogram split by sex, with the overall median and mean marked as vertical lines.
age_median = retrain_df["Age"].median()
age_mean = retrain_df["Age"].mean()

sns.displot(
    data=retrain_df,
    x='Age',
    hue='Sex',
    multiple='dodge',
    kde=True,
    bins=18,
    height=6,
    aspect=1.4,
)
plt.axvline(age_median, c="red", label="Median Age: {:.1f}".format(age_median))
plt.axvline(age_mean, c="blue", label="Mean Age: {:.1f}".format(age_mean))
plt.legend()
plt.show()

In [705]:
# The raw 'Sex' column is no longer needed once 'Sex_enc' exists.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError on the missing column.
retrain_df = retrain_df.drop(columns='Sex', errors='ignore')

In [706]:
# Impute missing 'Age' values with the median: the distribution is right-skewed,
# so the mean would be pulled further by outliers.
# Assign the result back to the column. `Series.fillna(..., inplace=True)` on a
# column selection is chained assignment: deprecated in pandas 2.x and a no-op
# under Copy-on-Write, which would leave the NaNs in place.
retrain_df['Age'] = retrain_df['Age'].fillna(retrain_df['Age'].median())
retrain_df.isna().sum()

In [707]:
# Re-check the per-column missing-value counts after imputing 'Age'.
retrain_df.isnull().sum()

In [708]:
# Inspect how the data is distributed: scatter 'Age' (x) against 'Fare' (y),
# colouring each point by its 'Survived' label.
sns.scatterplot(
    data=retrain_df,
    x="Age",
    y='Fare',
    hue="Survived",
    palette=['green', 'blue'],
    legend='full',
)

# Observation from the plot: the distribution is not linear.

In [709]:
# Target vector, and the feature matrix made of every remaining column.
y = retrain_df['Survived']
X = retrain_df.drop(columns='Survived')

In [710]:
# (rows, columns) of the prepared frame, target column included.
retrain_df.shape

In [711]:
# Hold out 30% of the rows for evaluation, stratified on the target so both
# splits keep the same class ratio. random_state pins the shuffle: without it
# every run produced a different split and the reported accuracies changed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1
)

# Created unfitted here; it is fitted later, on the training split only.
scaler = StandardScaler()

In [712]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [713]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# SVM with SVC's default (RBF) kernel; cross-validated grid search over (C, gamma).
svm = SVC()
parameters = {
    'C': [0.1, 1, 10],
    'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}
searcher = GridSearchCV(estimator=svm, param_grid=parameters)
searcher.fit(scaled_X_train, y_train)

# Best hyper-parameters and their cross-validation score.
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)

# Accuracy of the best configuration on the held-out split.
test_accuracy = searcher.score(scaled_X_test, y_test)
print("Test accuracy of best grid search hypers:", test_accuracy)

In [714]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Decision tree limited to max_depth = 10, trained on the scaled features.
dt = DecisionTreeClassifier(random_state=1, max_depth=10)
dt.fit(scaled_X_train, y_train)

# Accuracy on the held-out split.
y_preds = dt.predict(scaled_X_test)
dt_accuracy = accuracy_score(y_test, y_preds)
print(dt_accuracy)

In [715]:
# Decision tree variant that splits on entropy (information gain).
# This tree is fitted on the unscaled feature frame.
dt_entropy = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=1)
dt_entropy.fit(X_train, y_train)
y_entropy_preds = dt_entropy.predict(X_test)

# Score this model's own predictions. The previous code passed `y_preds`, the
# predictions of the earlier tree, so it re-reported that tree's accuracy
# instead of the entropy tree's.
accuracy_entropy = accuracy_score(y_test, y_entropy_preds)
print('Accuracy achieved by using entropy: ', accuracy_entropy)

In [716]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN

# Base learners for the ensemble experiments; SEED fixes the stochastic ones.
SEED = 1
lr = LogisticRegression(random_state=SEED)
knn = KNN(n_neighbors=27)
dt = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=SEED)

# (display name, estimator) pairs.
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]

In [717]:
# Preview the training features.
X_train.head()


In [718]:

# Feature frames without the 'PassengerId' column.
X_train_dr = X_train.drop(columns=['PassengerId'])
X_test_dr = X_test.drop(columns=['PassengerId'])

# Fit each base learner on the training split and report its hold-out accuracy.
for clf_name, clf in classifiers:
    clf.fit(X_train_dr, y_train)
    y_pred = clf.predict(X_test_dr)
    accuracy = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

In [719]:
from sklearn.ensemble import VotingClassifier

# Combine the individual classifiers in a voting ensemble and fit it on the
# training split (fit returns the estimator itself).
vc = VotingClassifier(estimators=classifiers).fit(X_train_dr, y_train)

# Hold-out predictions and their accuracy.
y_pred = vc.predict(X_test_dr)
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.4f}'.format(accuracy))

In [720]:
# Bagging ensemble of 50 decision trees.
from sklearn.ensemble import BaggingClassifier

# Base learner.
dt = DecisionTreeClassifier(random_state=1)

# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both old and new releases.
bc = BaggingClassifier(dt, n_estimators=50, random_state=1)
bc.fit(X_train_dr, y_train)

# Hold-out predictions and their accuracy.
y_pred = bc.predict(X_test_dr)
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.6f}'.format(acc_test))

In [724]:
# Pair each held-out PassengerId with the most recent predictions in `y_pred`.
# NOTE(review): X_test is the hold-out split of train.csv, not Kaggle's test.csv,
# so this file does not cover the competition's test passengers — confirm intent.
submission = X_test[['PassengerId']].copy()
submission["Survived"] = y_pred

# index=False keeps the file to exactly the PassengerId and Survived columns;
# without it pandas also writes the row index as an extra unnamed first column.
submission.to_csv('submission.csv', index=False)