## Titanic Assignment

### 1. Import Necessary Libarary

In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

### 2. Importing Data

In [151]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [152]:
df.shape

(891, 12)

In [153]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


There are some missing values here

### 3. Data Praparation

#### 3.1 Handeling Missing Values

In [154]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [155]:
def impute_nan(df,variable,median):
    df[variable] = df[variable].fillna(median)

In [156]:
median = df['Age'].median()
median

28.0

In [157]:
impute_nan(df,'Age',median)

In [158]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [159]:
for label, content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        df[label] = pd.Categorical(content).codes+1

In [160]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [161]:
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        df[label] = content.astype("category").cat.as_ordered()

In [162]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,109,2,22.0,1,0,524,7.25,0,3
1,2,1,1,191,1,38.0,1,0,597,71.2833,82,1
2,3,1,3,354,1,26.0,0,0,670,7.925,0,3
3,4,1,1,273,1,35.0,1,0,50,53.1,56,3
4,5,0,3,16,2,35.0,0,0,473,8.05,0,3


#### 3.2 Train-Test Split

In [163]:
from sklearn.model_selection import train_test_split

In [164]:
df_train, df_test = train_test_split(df, test_size=0.2)

### 4. Model Building

In [165]:
X_train = df_train.drop('Survived', axis=1)
y_train = df_train['Survived']
X_test = df_test.drop('Survived', axis=1)
y_test = df_test['Survived']

In [166]:
from sklearn.ensemble import RandomForestClassifier

In [167]:
rf = RandomForestClassifier(random_state=42, n_estimators=50, oob_score=True)

In [168]:
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50, oob_score=True, random_state=42)

In [169]:
y_train_pred = rf.predict(X_train)

In [170]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_train, y_train_pred)

0.9985955056179775

In [171]:
y_test_pred = rf.predict(X_test)

In [172]:
accuracy_score(y_test, y_test_pred)

0.8268156424581006

#### 4.1 Model evaluation: Cross validation

In [173]:
from sklearn.model_selection import cross_val_score

In [174]:
cross_val_score(rf, X_train, y_train, cv=5, n_jobs=-1).mean()

0.837082635674185

In [175]:
rf.oob_score_

0.8314606741573034

#### 4.2 Hyper-Parameter Tunning

In [176]:
from sklearn.model_selection import GridSearchCV

In [177]:
hyper_params = {'max_depth': [3, 5, 10, 15, 20],
                'max_features': [3, 5, 7, 11, 15],
                'min_samples_leaf': [20, 50, 100, 200, 400],
                'n_estimators': [10, 25, 50, 80, 100]
               }

In [178]:
model_cv = GridSearchCV(estimator=rf, 
             param_grid=hyper_params,
             verbose=1,
             cv=5,
             n_jobs=-1,
             return_train_score=True)

In [179]:
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 625 candidates, totalling 3125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 1084 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 1600 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 2210 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 2920 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 3110 out of 3125 | elapsed:  1.4min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 3125 out of 3125 | elapsed:  1.4min finished


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_estimators=50, oob_score=True,
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [3, 5, 10, 15, 20],
                         'max_features': [3, 5, 7, 11, 15],
                         'min_samples_leaf': [20, 50, 100, 200, 400],
                         'n_estimators': [10, 25, 50, 80, 100]},
             return_train_score=True, verbose=1)

In [180]:
model_cv.best_score_

0.818881118881119

#### 4.3 Fine-tuning using GridSearch

In [181]:
hyper_parameters = {'min_samples_leaf': [5, 10, 20, 50],
                    'n_estimators': [50, 60, 70],
                    'max_features': [10, 12, 14, 16]    
}

In [182]:
rf = RandomForestClassifier(max_depth=12, random_state=42, n_jobs=-1)

In [183]:
model_cv2 = GridSearchCV(estimator=rf,
                        param_grid=hyper_parameters,
                        verbose=1,
                        cv=5,
                        return_train_score=True,
                        n_jobs=-1)

In [184]:
model_cv2.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    5.9s finished


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_depth=12, n_jobs=-1,
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_features': [10, 12, 14, 16],
                         'min_samples_leaf': [5, 10, 20, 50],
                         'n_estimators': [50, 60, 70]},
             return_train_score=True, verbose=1)

In [185]:
model_cv2.best_score_

0.8287107258938246

#### 4.4 RandomizedSearchCV

In [186]:
from sklearn.model_selection import RandomizedSearchCV

In [187]:
hyper_params = {'max_depth': range(3, 20),
                'max_features': range(3, 17),
                'min_samples_leaf': range(20, 400, 50),
                'n_estimators': range(10, 101, 10)}

In [188]:
model_rcv = RandomizedSearchCV(estimator=rf, 
                              param_distributions=hyper_params,
                              verbose=1,
                              cv=5,
                              return_train_score=True,
                              n_jobs=-1,
                              n_iter=50)

In [189]:
model_rcv.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    7.8s finished


RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(max_depth=12, n_jobs=-1,
                                                    random_state=42),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'max_depth': range(3, 20),
                                        'max_features': range(3, 17),
                                        'min_samples_leaf': range(20, 400, 50),
                                        'n_estimators': range(10, 101, 10)},
                   return_train_score=True, verbose=1)

In [190]:
model_rcv.best_score_

0.8090712104796612

In [191]:
rf_best = model_cv2.best_estimator_

In [192]:
y_test_pred = rf_best.predict(X_test)

In [193]:
accuracy_score(y_test, y_test_pred)

0.7821229050279329

###### So Finally basic Random Forrest model gives us best score without Hyper Parameter Tunning

### 5. Importing Test set

In [194]:
df_testt = pd.read_csv("test.csv")

In [195]:
for label, content in df_testt.items():
    if not pd.api.types.is_numeric_dtype(content):
        df_testt[label] = pd.Categorical(content).codes+1

In [196]:
for label, content in df_testt.items():
    if pd.api.types.is_string_dtype(content):
        df_testt[label] = content.astype("category").cat.as_ordered()

In [197]:
df_testt.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,207,2,34.5,0,0,153,7.8292,0,2
1,893,3,404,1,47.0,1,0,222,7.0,0,3
2,894,2,270,2,62.0,0,0,74,9.6875,0,2
3,895,3,409,2,27.0,0,0,148,8.6625,0,3
4,896,3,179,1,22.0,1,1,139,12.2875,0,3


In [198]:
df_testt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    int16  
 3   Sex          418 non-null    int8   
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    int16  
 8   Fare         417 non-null    float64
 9   Cabin        418 non-null    int8   
 10  Embarked     418 non-null    int8   
dtypes: float64(2), int16(2), int64(4), int8(3)
memory usage: 22.6 KB


In [199]:
df_testt.isnull().sum()

PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Cabin           0
Embarked        0
dtype: int64

In [200]:
median = df_testt['Age'].median()
median

27.0

In [201]:
impute_nan(df_testt,'Age',median)

In [202]:
df_testt.Fare.value_counts()

7.7500     21
26.0000    19
8.0500     17
13.0000    17
7.8958     11
           ..
9.3250      1
14.4583     1
15.0333     1
25.4667     1
21.0750     1
Name: Fare, Length: 169, dtype: int64

In [203]:
median = df_testt['Fare'].median()
median

14.4542

In [204]:
impute_nan(df_testt,'Fare',median)

In [205]:
df_testt.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [206]:
X_train.shape, df_testt.shape

((712, 11), (418, 11))

In [208]:
test_preds = model_cv2.predict(df_testt)

In [210]:
df_preds = pd.DataFrame()
df_preds["PassengerId"] = df_testt["PassengerId"]
df_preds["Survived"] = test_preds
df_preds

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [211]:
df_preds.to_csv("Predictions Titanic.csv",
                index=False)