In [23]:
import sklearn as sl
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

## import data

In [24]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
data_list = [train,test]

In [25]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [26]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [27]:
print("Train data missed values:\n")
print(train.isnull().sum())
print('\n','#'*50 , '\n')
print("Test data missed values:")
print(test.isnull().sum())

Train data missed values:

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

 ################################################## 

Test data missed values:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## drop useless columns & fill null

In [28]:
train.drop(columns = ["PassengerId"] , inplace = True)

for dataset in data_list:
    dataset.drop(columns = ["Ticket" , "Cabin"] , inplace = True)

## convert data to numerical 

In [29]:
train.Embarked.fillna(train.Embarked.dropna().max(), inplace=True)
for dataset in data_list:
    dataset['Embarked'] = dataset['Embarked'].dropna().map({'S':0,'C':1,'Q':2}).astype(int)

In [30]:
for dataset in data_list:
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 

In [31]:
train["Title"] =train["Name"].str.extract('([A-Za-z]+)\.')
test["Title"] = test["Name"].str.extract('([A-Za-z]+)\.')

train["Title"].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Title, dtype: int64

In [33]:
def convert_title(title):
    if title in ["Ms", "Mile", "Miss"]:
        return "Miss"
    elif title in ["Mme", "Mrs"]:
        return "Mrs"
    elif title == "Mr":
        return "Mr"
    elif title == "Master":
        return "Master"
    else:
        return "Other"

for dataset in data_list:
    dataset["Title"] = dataset["Title"].map(convert_title)
train["Title"].value_counts()

Mr        517
Miss      183
Mrs       126
Master     40
Other      25
Name: Title, dtype: int64

In [34]:
train.groupby('Title')['Age'].mean()

Title
Master     4.574167
Miss      21.816327
Mr        32.368090
Mrs       35.788991
Other     43.750000
Name: Age, dtype: float64

In [35]:
for df in data_list:
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Master'), 'Age'] = 5
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Miss'), 'Age'] = 22
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Mr'), 'Age'] = 32
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Mrs'), 'Age'] = 36
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Other'), 'Age'] = 44

In [37]:
print(train.isnull().sum())
print("-" * 50)
print(test.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
dtype: int64
--------------------------------------------------
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
Title          0
dtype: int64


In [38]:
test.Fare.fillna(test.Fare.dropna().median() , inplace= True)

In [39]:
for dataset in data_list:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train.drop(['Parch', 'SibSp'], axis=1 , inplace = True)
test.drop(['Parch', 'SibSp'], axis=1 , inplace = True)    

train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [42]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Title,FamilySize
0,0,3,"Braund, Mr. Owen Harris",1,22.0,7.25,0,Mr,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,71.2833,1,Mrs,2
2,1,3,"Heikkinen, Miss. Laina",0,26.0,7.925,0,Miss,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,53.1,0,Mrs,2
4,0,3,"Allen, Mr. William Henry",1,35.0,8.05,0,Mr,1


In [43]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for dataset in data_list:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

In [45]:
train.drop(['Name'], axis=1 , inplace = True)
test.drop(['Name'], axis=1 , inplace = True)    

In [46]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,0,3,1,22.0,7.25,0,1,2
1,1,1,0,38.0,71.2833,1,3,2
2,1,3,0,26.0,7.925,0,2,1
3,1,1,0,35.0,53.1,0,3,2
4,0,3,1,35.0,8.05,0,1,1


In [47]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,892,3,1,34.5,7.8292,2,1,1
1,893,3,0,47.0,7.0,0,3,2
2,894,2,1,62.0,9.6875,2,1,1
3,895,3,1,27.0,8.6625,0,1,1
4,896,3,0,22.0,12.2875,0,3,3


In [48]:
for dataset in data_list:
    dataset['Single'] = dataset['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    dataset['SmallF'] = dataset['FamilySize'].map(lambda s: 1 if  s == 2  else 0)
    dataset['MedF'] = dataset['FamilySize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
    dataset['LargeF'] = dataset['FamilySize'].map(lambda s: 1 if s >= 5 else 0)
    
train.drop(columns = ["FamilySize"] , inplace = True)
test.drop(columns = ["FamilySize"] , inplace = True)

In [49]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Single,SmallF,MedF,LargeF
0,0,3,1,22.0,7.25,0,1,0,1,0,0
1,1,1,0,38.0,71.2833,1,3,0,1,0,0
2,1,3,0,26.0,7.925,0,2,1,0,0,0
3,1,1,0,35.0,53.1,0,3,0,1,0,0
4,0,3,1,35.0,8.05,0,1,1,0,0,0


In [50]:
Y_train = train["Survived"]
X_train = train.drop(labels = ["Survived"],axis = 1)
Test = test.drop(labels = ["PassengerId"],axis = 1)

In [54]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

In [55]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(Test)

In [57]:
classifiers = {
    "KNN": KNeighborsClassifier(), 
    "LR": LogisticRegression(max_iter=1000), 
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "SVM": SVC(),
    "MLP": MLPClassifier(max_iter=1000),
}

results = pd.DataFrame(columns=["Classifier", "Avg_Accuracy", "Avg_F1_Score"])
for name, clf in classifiers.items():
    model = clf
    cv_results = cross_validate(
        model, X_train_scaled, Y_train, cv=10,
        scoring=(['accuracy', 'f1'])
    )

    results = results.append({
        "Classifier": name,
        "Avg_Accuracy": cv_results['test_accuracy'].mean(),
        "Avg_F1_Score": cv_results['test_f1'].mean()
    }, ignore_index=True)
    
results["Avg_Overall"] = (results["Avg_Accuracy"] + results["Avg_F1_Score"]) / 2
results = results.sort_values("Avg_Overall", ascending=False)
results


  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


Unnamed: 0,Classifier,Avg_Accuracy,Avg_F1_Score,Avg_Overall
4,SVM,0.82829,0.762132,0.795211
5,MLP,0.824931,0.757703,0.791317
1,LR,0.817066,0.75448,0.785773
3,RF,0.809251,0.74032,0.774786
0,KNN,0.803645,0.730435,0.76704
2,DT,0.775593,0.708933,0.742263


## SVM is the best accuracy

In [105]:
from sklearn.ensemble import GradientBoostingClassifier,StackingClassifier
RBF_SVM = SVC()
RBF_SVM.fit(X_train, Y_train)

RF = RandomForestClassifier()
RF.fit(X_train, Y_train)

GB = GradientBoostingClassifier()
GB.fit(X_train, Y_train)

MPL = MLPClassifier(max_iter=1000)
MPL.fit(X_train, Y_train)

LR = LogisticRegression(max_iter=1000)
LR.fit(X_train, Y_train)


In [106]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
estimators = [
     ('Random Forest', RF),
    ('Gradient Boosting', GB),
    ('RBF SVM', RBF_SVM),
    ('LR',LR),
    ('MPL',MPL)
 ]

clf = StackingClassifier(estimators=estimators, final_estimator = LinearDiscriminantAnalysis())
clf.fit(X_train,Y_train)
predected = clf.predict(Test)

In [107]:
len(predected)

418

In [108]:
test['Survived'] = predected

In [109]:
test.Survived.value_counts()

0    264
1    154
Name: Survived, dtype: int64

In [87]:
test.to_csv('result.csv',index=False)

In [110]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Single,SmallF,MedF,LargeF,Survived
0,892,3,1,34.5,7.8292,2,1,1,0,0,0,0
1,893,3,0,47.0,7.0,0,3,0,1,0,0,0
2,894,2,1,62.0,9.6875,2,1,1,0,0,0,0
3,895,3,1,27.0,8.6625,0,1,1,0,0,0,0
4,896,3,0,22.0,12.2875,0,3,0,0,1,0,1


In [111]:
result = test.drop(columns=['Pclass','Sex','Age','Fare','Embarked','Title',"SmallF",'MedF','LargeF','Single'])
result.to_csv('result.csv',index=False)
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [112]:
true_values = pd.read_csv('submission.csv')
true_values.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [113]:
from sklearn.metrics import accuracy_score
accuracy_score(result.Survived,true_values.Survived)


0.7559808612440191