In [1]:
import pandas as pd
import numpy as np
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df = pd.concat([train, test], sort=False)

In [4]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.isna().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [24]:
df.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)


In [25]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1.0,22.0,1,0,7.75,2.0
1,2,1.0,1,0.0,38.0,1,0,56.9292,0.0
2,3,1.0,3,0.0,26.0,0,0,7.925,2.0
3,4,1.0,1,0.0,35.0,1,0,53.1,2.0
4,5,0.0,3,1.0,35.0,0,0,8.05,2.0


In [26]:
# all other features having missing vals less for than 25
# Value imputation method - simple imputer to replace with nan values
from sklearn.impute import SimpleImputer
simple_imp_mean = SimpleImputer(strategy='mean')
df['Age'] = simple_imp_mean.fit_transform(df[['Age']])
df['Survived'] = simple_imp_mean.fit_transform(df[['Survived']])


In [27]:
simple_imp_most_freq = SimpleImputer(strategy= 'most_frequent')
df['Embarked'] = simple_imp_most_freq.fit_transform(df[['Embarked']]).ravel()

In [28]:
df['Fare'] = simple_imp_most_freq.fit_transform(df[['Fare']])

In [29]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.000000,3,1.0,22.000000,1,0,7.7500,2.0
1,2,1.000000,1,0.0,38.000000,1,0,56.9292,0.0
2,3,1.000000,3,0.0,26.000000,0,0,7.9250,2.0
3,4,1.000000,1,0.0,35.000000,1,0,53.1000,2.0
4,5,0.000000,3,1.0,35.000000,0,0,8.0500,2.0
...,...,...,...,...,...,...,...,...,...
413,1305,0.383838,3,1.0,29.881138,0,0,8.0500,2.0
414,1306,0.383838,1,0.0,39.000000,0,0,56.9292,0.0
415,1307,0.383838,3,1.0,38.500000,0,0,7.7500,2.0
416,1308,0.383838,3,1.0,29.881138,0,0,8.0500,2.0


In [30]:
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [31]:
threshold = 3
from scipy.stats import zscore
numerical_features = ['Fare', 'Age']
z_scores = np.abs(zscore(df[numerical_features]))
print(z_scores)

[[0.85019689 0.96184978]
 [1.86868613 1.0871246 ]
 [0.84052197 0.44960618]
 ...
 [0.85019689 1.15115505]
 [0.83361132 0.0474158 ]
 [0.04257377 0.0474158 ]]


In [32]:
# Outliers
outliers = np.where(z_scores> threshold)
print(outliers)

(array([], dtype=int64), array([], dtype=int64))


In [33]:
# mitigation - to reduce the outliers
from scipy.stats.mstats import winsorize

df['Age'] = winsorize(df['Age'], limits=[0.15,0.15])
df['Fare'] = winsorize(df['Fare'], limits=[0.15,0.15])

In [35]:
# Cleared the outliers successfully
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
df['Sex'] = encoder.fit_transform(df[['Sex']])
df['Embarked'] = encoder.fit_transform(df[['Embarked']])
# df['Survived'] = encoder.fit_transform(df[[['Survived']]])

In [36]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1.0,22.0,1,0,7.75,2.0
1,2,1.0,1,0.0,38.0,1,0,56.9292,0.0
2,3,1.0,3,0.0,26.0,0,0,7.925,2.0
3,4,1.0,1,0.0,35.0,1,0,53.1,2.0
4,5,0.0,3,1.0,35.0,0,0,8.05,2.0


In [72]:
# independent and dependent features
x = df.drop('Survived', axis=1)
y = df['Survived'].astype(int)
# spliting the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.33)

In [73]:
y

0      0
1      1
2      1
3      1
4      0
      ..
413    0
414    0
415    0
416    0
417    0
Name: Survived, Length: 1309, dtype: int64

In [74]:
# training our model
from sklearn.ensemble import AdaBoostClassifier
classifier = AdaBoostClassifier()

In [75]:
param_grid = {'n_estimators': [10,50,100,500], 'learning_rate': [0.1,1.0, 0.01, 0.001, 0.0001]}

In [76]:
# hyper parameter tuning
from sklearn.model_selection import GridSearchCV
grid_ada = GridSearchCV(estimator=classifier,param_grid=param_grid ,scoring='accuracy', cv=5)
grid_ada.fit(x_train, y_train)

0,1,2
,estimator,AdaBoostClassifier()
,param_grid,"{'learning_rate': [0.1, 1.0, ...], 'n_estimators': [10, 50, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,estimator,
,n_estimators,100
,learning_rate,0.1
,algorithm,'deprecated'
,random_state,


In [77]:
grid_ada.best_score_

np.float64(0.8677467532467533)

In [78]:
y_pred = grid_ada.predict(x_test)
from sklearn.metrics import classification_report, accuracy_score
print(f"report: {classification_report(y_test, y_pred)}")
print(f'score: {accuracy_score(y_test, y_pred)}')

report:               precision    recall  f1-score   support

           0       0.87      0.90      0.89       310
           1       0.73      0.66      0.70       122

    accuracy                           0.84       432
   macro avg       0.80      0.78      0.79       432
weighted avg       0.83      0.84      0.83       432

score: 0.8356481481481481
