In [23]:
import sklearn as sl
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

## import data

In [180]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
data_list = [train,test]

In [181]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [182]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [183]:
print("Train data missed values:\n")
print(train.isnull().sum())
print('\n','#'*50 , '\n')
print("Test data missed values:")
print(test.isnull().sum())

Train data missed values:

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

 ################################################## 

Test data missed values:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## drop useless columns & fill null

In [184]:
train.drop(columns = ["PassengerId"] , inplace = True)

for dataset in data_list:
    dataset.drop(columns = ["Ticket" , "Cabin"] , inplace = True)

## convert data to numerical 

In [185]:
train.Embarked.fillna(train.Embarked.dropna().max(), inplace=True)
for dataset in data_list:
    dataset['Embarked'] = dataset['Embarked'].dropna().map({'S':0,'C':1,'Q':2}).astype(int)

In [186]:
for dataset in data_list:
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 

In [187]:
train["Title"] =train["Name"].str.extract('([A-Za-z]+)\.')
test["Title"] = test["Name"].str.extract('([A-Za-z]+)\.')

train["Title"].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Title, dtype: int64

In [188]:
def convert_title(title):
    if title in ["Ms", "Mile", "Miss"]:
        return "Miss"
    elif title in ["Mme", "Mrs"]:
        return "Mrs"
    elif title == "Mr":
        return "Mr"
    elif title == "Master":
        return "Master"
    else:
        return "Other"

for dataset in data_list:
    dataset["Title"] = dataset["Title"].map(convert_title)
train["Title"].value_counts()

Mr        517
Miss      183
Mrs       126
Master     40
Other      25
Name: Title, dtype: int64

In [189]:
train.groupby('Title')['Age'].mean()

Title
Master     4.574167
Miss      21.816327
Mr        32.368090
Mrs       35.788991
Other     43.750000
Name: Age, dtype: float64

In [190]:
for df in data_list:
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Master'), 'Age'] = 5
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Miss'), 'Age'] = 22
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Mr'), 'Age'] = 32
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Mrs'), 'Age'] = 36
    df.loc[(df["Age"].isnull()) & (df["Title"]=='Other'), 'Age'] = 44

In [191]:
print(train.isnull().sum())
print("-" * 50)
print(test.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
dtype: int64
--------------------------------------------------
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
Title          0
dtype: int64


In [192]:
test.Fare.fillna(test.Fare.dropna().median() , inplace= True)

In [193]:
for dataset in data_list:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train.drop(['Parch', 'SibSp'], axis=1 , inplace = True)
test.drop(['Parch', 'SibSp'], axis=1 , inplace = True)    

train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [194]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Title,FamilySize
0,0,3,"Braund, Mr. Owen Harris",1,22.0,7.25,0,Mr,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,71.2833,1,Mrs,2
2,1,3,"Heikkinen, Miss. Laina",0,26.0,7.925,0,Miss,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,53.1,0,Mrs,2
4,0,3,"Allen, Mr. William Henry",1,35.0,8.05,0,Mr,1


In [195]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for dataset in data_list:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

In [196]:
train.drop(['Name'], axis=1 , inplace = True)
test.drop(['Name'], axis=1 , inplace = True)    

In [197]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,0,3,1,22.0,7.25,0,1,2
1,1,1,0,38.0,71.2833,1,3,2
2,1,3,0,26.0,7.925,0,2,1
3,1,1,0,35.0,53.1,0,3,2
4,0,3,1,35.0,8.05,0,1,1


In [198]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,892,3,1,34.5,7.8292,2,1,1
1,893,3,0,47.0,7.0,0,3,2
2,894,2,1,62.0,9.6875,2,1,1
3,895,3,1,27.0,8.6625,0,1,1
4,896,3,0,22.0,12.2875,0,3,3


In [199]:
for dataset in data_list:
    dataset['Single'] = dataset['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    dataset['SmallF'] = dataset['FamilySize'].map(lambda s: 1 if  s == 2  else 0)
    dataset['MedF'] = dataset['FamilySize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
    dataset['LargeF'] = dataset['FamilySize'].map(lambda s: 1 if s >= 5 else 0)
    
train.drop(columns = ["FamilySize"] , inplace = True)
test.drop(columns = ["FamilySize"] , inplace = True)

In [200]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Single,SmallF,MedF,LargeF
0,0,3,1,22.0,7.25,0,1,0,1,0,0
1,1,1,0,38.0,71.2833,1,3,0,1,0,0
2,1,3,0,26.0,7.925,0,2,1,0,0,0
3,1,1,0,35.0,53.1,0,3,0,1,0,0
4,0,3,1,35.0,8.05,0,1,1,0,0,0


In [201]:
for dataset in data_list:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Single,SmallF,MedF,LargeF
0,0,3,1,1.0,7.25,0,1,0,1,0,0
1,1,1,0,2.0,71.2833,1,3,0,1,0,0
2,1,3,0,1.0,7.925,0,2,1,0,0,0
3,1,1,0,2.0,53.1,0,3,0,1,0,0
4,0,3,1,2.0,8.05,0,1,1,0,0,0


In [205]:
print(train.isnull().sum())
print("-" * 50)
print(test.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
Title       0
Single      0
SmallF      0
MedF        0
LargeF      0
dtype: int64
--------------------------------------------------
PassengerId    0
Pclass         0
Sex            0
Age            0
Fare           0
Embarked       0
Title          0
Single         0
SmallF         0
MedF           0
LargeF         0
dtype: int64


In [203]:
for dataset in data_list:
    dataset['Age']=dataset['Age'].astype(int)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Single,SmallF,MedF,LargeF
0,0,3,1,1,7.25,0,1,0,1,0,0
1,1,1,0,2,71.2833,1,3,0,1,0,0
2,1,3,0,1,7.925,0,2,1,0,0,0
3,1,1,0,2,53.1,0,3,0,1,0,0
4,0,3,1,2,8.05,0,1,1,0,0,0


In [178]:
train.Fare.describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [204]:
for dataset in data_list:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Single,SmallF,MedF,LargeF
0,0,3,1,1,0,0,1,0,1,0,0
1,1,1,0,2,3,1,3,0,1,0,0
2,1,3,0,1,1,0,2,1,0,0,0
3,1,1,0,2,3,0,3,0,1,0,0
4,0,3,1,2,1,0,1,1,0,0,0


In [206]:
Y_train = train["Survived"]
X_train = train.drop(labels = ["Survived"],axis = 1)
Test = test.drop(labels = ["PassengerId"],axis = 1)

In [207]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

In [208]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(Test)

In [209]:
classifiers = {
    "KNN": KNeighborsClassifier(), 
    "LR": LogisticRegression(max_iter=1000), 
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "SVM": SVC(),
    "MLP": MLPClassifier(max_iter=1000),
}

results = pd.DataFrame(columns=["Classifier", "Avg_Accuracy", "Avg_F1_Score"])
for name, clf in classifiers.items():
    model = clf
    cv_results = cross_validate(
        model, X_train_scaled, Y_train, cv=10,
        scoring=(['accuracy', 'f1'])
    )

    results = results.append({
        "Classifier": name,
        "Avg_Accuracy": cv_results['test_accuracy'].mean(),
        "Avg_F1_Score": cv_results['test_f1'].mean()
    }, ignore_index=True)
    
results["Avg_Overall"] = (results["Avg_Accuracy"] + results["Avg_F1_Score"]) / 2
results = results.sort_values("Avg_Overall", ascending=False)
results


  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


Unnamed: 0,Classifier,Avg_Accuracy,Avg_F1_Score,Avg_Overall
4,SVM,0.827154,0.760266,0.79371
3,RF,0.823833,0.758022,0.790927
2,DT,0.824944,0.753181,0.789062
5,MLP,0.820424,0.750293,0.785359
0,KNN,0.812647,0.74501,0.778828
1,LR,0.811448,0.742164,0.776806


## SVM is the best accuracy

In [238]:
from sklearn.ensemble import GradientBoostingClassifier,StackingClassifier
RBF_SVM = SVC()
RBF_SVM.fit(X_train, Y_train)

# RF = RandomForestClassifier()
# RF.fit(X_train, Y_train)

# GB = GradientBoostingClassifier()
# GB.fit(X_train, Y_train)

DT=DecisionTreeClassifier()
DT.fit(X_train, Y_train)

# MPL = MLPClassifier(max_iter=5000)
# MPL.fit(X_train, Y_train)

# LR = LogisticRegression(max_iter=5000)
# LR.fit(X_train, Y_train)


In [239]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
estimators = [
#      ('Random Forest', RF),
#     ('Gradient Boosting', GB),
    ('RBF SVM', RBF_SVM),
#     ('LR',LR),
    ('DT',DT),
#     ('MPL',MPL)
 ]

clf = StackingClassifier(estimators=estimators, final_estimator = LinearDiscriminantAnalysis())
clf.fit(X_train,Y_train)
predected = clf.predict(Test)

In [240]:
len(predected)

418

In [241]:
test['Survived'] = predected

In [242]:
test.Survived.value_counts()

0    255
1    163
Name: Survived, dtype: int64

In [243]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Single,SmallF,MedF,LargeF,Survived
0,892,3,1,2,0,2,1,1,0,0,0,0
1,893,3,0,2,0,0,3,0,1,0,0,1
2,894,2,1,3,1,2,1,1,0,0,0,0
3,895,3,1,1,1,0,1,1,0,0,0,0
4,896,3,0,1,1,0,3,0,0,1,0,1


In [244]:
result = test.drop(columns=['Pclass','Sex','Age','Fare','Embarked','Title',"SmallF",'MedF','LargeF','Single'])
result.to_csv('result.csv',index=False)
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [245]:
true_values = pd.read_csv('submission.csv')
true_values.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [247]:
from sklearn.metrics import accuracy_score
print(round(accuracy_score(result.Survived,true_values.Survived),4)*100)


77.75
