In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
data.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [3]:
data.shape

(891, 12)

In [4]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
mean_age = data['Age'].mean()
data['Age'].fillna(mean_age, inplace=True)

In [6]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Dropping the unimportant features from both train and test datasets


In [7]:
data.drop(['Name','Ticket','Cabin'], inplace = True, axis = 1) ##axis = 1 means dropping it on columns.
test_data.drop(['Name','Ticket','Cabin'], inplace = True, axis = 1) 

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


Handle all the null values

In [9]:
test_data.isnull().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [10]:
data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
# filling the null values for Embarked  column
dataset = [data, test_data]

for df in dataset:
    df['Embarked'] = df['Embarked'].fillna('S')


In [12]:
# fill the null values of fare and age features by mean value
dataset = [data, test_data]

for df in dataset:
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Age'] = df['Age'].fillna(df['Age'].mean())

In [13]:
data[['Age','Survived']].groupby(['Age'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,Age,Survived
0,0.42,1.0
9,5.00,1.0
80,63.00,1.0
69,53.00,1.0
1,0.67,1.0
...,...,...
50,36.50,0.0
32,24.50,0.0
47,34.50,0.0
41,30.50,0.0


Data preprocessing on Fare and Age

In [14]:
temp_fare = data['Fare']
temp_fare = pd.qcut(temp_fare, 6)
temp_fare.value_counts()

(-0.001, 7.775]      156
(7.775, 8.662]       152
(14.454, 26.0]       149
(52.369, 512.329]    149
(26.0, 52.369]       146
(8.662, 14.454]      139
Name: Fare, dtype: int64

In [15]:
# simplify the column name
dataset = [data, test_data]

for df in dataset:
    df.loc[df["Fare"] <= 7.775, 'Fare'] = 0
    df.loc[(df["Fare"] > 7.775) & (df["Fare"] <= 8.662), 'Fare'] = 1
    df.loc[(df["Fare"] > 8.662) & (df["Fare"] <= 14.454), 'Fare'] = 2
    df.loc[(df["Fare"] > 14.454) & (df["Fare"] <= 26), 'Fare'] = 3
    df.loc[(df["Fare"] > 26) & (df["Fare"] <= 52.639), 'Fare'] = 4
    df.loc[df["Fare"] > 52.639, 'Fare'] = 5


In [16]:
data['Fare'].value_counts()

0.0    156
3.0    156
4.0    149
5.0    146
2.0    145
1.0    139
Name: Fare, dtype: int64

In [17]:
temp_age = data['Age']
temp_age = pd.qcut(temp_age, 5)
temp_age.value_counts()

(28.0, 29.699]    199
(20.0, 28.0]      183
(0.419, 20.0]     179
(38.0, 80.0]      177
(29.699, 38.0]    153
Name: Age, dtype: int64

In [18]:
dataset = [data, test_data]

for df in dataset:
    df.loc[df["Age"] <= 20, 'Age'] = 0
    df.loc[(df["Age"] > 20) & (df["Age"] <= 28), 'Age'] = 1
    df.loc[(df["Age"] > 28) & (df["Age"] <= 29.699), 'Age'] = 2
    df.loc[(df["Age"] > 29.699) & (df["Age"] <= 38), 'Age'] = 3
    df.loc[(df["Age"] > 38) & (df["Age"] <= 80), 'Age'] = 4

In [19]:
data['Age'].value_counts()

3.0    330
1.0    183
0.0    179
4.0    177
2.0     22
Name: Age, dtype: int64

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [21]:
genderMap = {'male':0, 'female':1}
embarkedMap = {'S' : 0, 'C' : 1, 'Q': 2}
dataset = [data,test_data]

for df in dataset:
    df['Sex'] = df['Sex'].map(genderMap)
    df['Embarked'] = df['Embarked'].map(embarkedMap)


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 62.8 KB


Separating X and Y i.e feature and labels


In [23]:
X_train = data.drop(['Survived','PassengerId'],axis = 1)
y_train = data['Survived']

X_test = test_data.drop("PassengerId",axis = 1)

Model training

In [24]:
# 1) logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

acc_logistic = round(clf.score(X_train, y_train)*100,2)
print(acc_logistic)

80.7


In [25]:
# 2) SVM classifier

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
}

grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# Step 2: Create and Train SVM model with the best parameters
best_svm = SVC(random_state=42, **best_params)
best_svm.fit(X_train, y_train)

acc_svm = round(best_svm.score(X_train, y_train) * 100, 2)
print(acc_svm)


81.82


In [26]:
# 3) XGBoost

import xgboost as xgb

clf_xgb = xgb.XGBClassifier(random_state=42)
clf_xgb.fit(X_train, y_train)

acc_logistic = round(clf_xgb.score(X_train, y_train)*100,2)
print(acc_logistic)

88.55


In [27]:
# 4) Decision Tree

from sklearn.tree import DecisionTreeClassifier

clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt.fit(X_train, y_train)

acc_logistic = round(clf_dt.score(X_train, y_train)*100,2)
print(acc_logistic)

89.34


In [28]:
# 5) Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train, y_train)

acc_logistic = round(clf_rf.score(X_train, y_train)*100,2)
print(acc_logistic)

89.34


In [32]:
y_pred = best_svm.predict(X_test)

In [34]:
output = pd.DataFrame({'PassengerId':test_data.PassengerId,'Survived':y_pred})
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
