# Predicting Survival in the Titanic Data Set : Using only Pclass, Sex, Age, SibSp (Siblings aboard), Parch (Parents/children aboard), and Fare to predict whether a passenger survived.

In [6]:
import pandas as pd
from sklearn import tree

In [7]:
titanic_df = pd.read_csv(r'C:\Users\91842\Desktop\Pandas Test Folder\titanic.csv')

In [8]:
titanic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [9]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [10]:
titanic_df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

### Conclusion :
#We can see that there are some null values present in the column - age and fare. In age there are total 263 missing values and in fare there is only 1 missing value.
#For fare missing value we can ignore it as it's just 1 missing value.
#For age column we need to apply some smart imputation to handle the missing values.

In [11]:
#But before that dropping the unnecessary columns as per the problem statement :
titanic_df = titanic_df.drop(['name','ticket','cabin','embarked','boat','body','home.dest'], axis='columns')

### Missing value imputation for 'age' column:

In [12]:
#At first I have grouped the data based on sex. 
#Then I am storing the age for the groups created. 
sex_and_age = titanic_df.groupby('sex').age

In [13]:
#I extracted the mean age of all the female and male onboarded.
sex_and_age.mean()

sex
female    28.687088
male      30.585228
Name: age, dtype: float64

In [14]:
import numpy as np

#I have used these 2 values to fill the missing age values according to the gender :
titanic_df.age = np.where(titanic_df.sex=='female', titanic_df.age.fillna(28), titanic_df.age)   #fill missing values for female
titanic_df.age = np.where(titanic_df.sex=='male', titanic_df.age.fillna(30), titanic_df.age)     #fill missing values for male

In [15]:
#Check to see if all the missing values are now replaced or not:
titanic_df.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        1
dtype: int64

In [16]:
#Drop the rest missing value for 'fare':
titanic_df = titanic_df.dropna()

In [17]:
titanic_df.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

Great, there are no more missing values present in our dataframe now. 

In [18]:
titanic_df.tail()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
1304,3,0,female,14.5,1,0,14.4542
1305,3,0,female,28.0,1,0,14.4542
1306,3,0,male,26.5,0,0,7.225
1307,3,0,male,27.0,0,0,7.225
1308,3,0,male,29.0,0,0,7.875


### Convert the categorical columns into numerical : Used label encoder

#As we have categorical column 'sex' in our final dataframe, we need to convert it to a numerical column for further processing.


In [19]:
#import necessary lib:
from sklearn.preprocessing import LabelEncoder

#create an object for the column 'sex':
le_sex = LabelEncoder()

#create a new column:
titanic_df['sex_new']  = le_sex.fit_transform(titanic_df['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
titanic_df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,sex_new
0,1,1,female,29.0,0,0,211.3375,0
1,1,1,male,0.92,1,2,151.55,1
2,1,0,female,2.0,1,2,151.55,0
3,1,0,male,30.0,1,2,151.55,1
4,1,0,female,25.0,1,2,151.55,0


### Prepare for ML model creation : X = predictors and y = prediction

In [21]:
X = titanic_df.drop('survived', axis='columns')
y = titanic_df['survived']

In [22]:
#Drop the unnecessary column 'sex' :
X = X.drop('sex', axis='columns')
X.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_new
0,1,29.0,0,0,211.3375,0
1,1,0.92,1,2,151.55,1
2,1,2.0,1,2,151.55,0
3,1,30.0,1,2,151.55,1
4,1,25.0,1,2,151.55,0


### Decision Tree model creation : Model will predict if a passenger survived or not based on the given predictors

In [23]:
#import necessary lib for decision tree:
from sklearn import tree

#create the model:
model = tree.DecisionTreeClassifier()

In [24]:
#train the model: We have not split the data here
model.fit(X,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [25]:
#Let's check the score of the model:
model.score(X,y)

0.9655963302752294

### Conclusion :

The model accuracy is 96.55%.

### Let's create another model now using train_test_split :

In [26]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV

In [27]:
#Using the same X nad y:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.25, random_state= 35)

In [28]:
#let's first visualize the tree on the data without doing any pre processing
model2 = DecisionTreeClassifier()
model2.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [29]:
model2.score(x_train,y_train)

0.9714576962283384

In [30]:
model2.score(x_test,y_test)

0.7675840978593272

### Conclusion :

#It seems that our model accuracy with train data is high, i.e 97.14%

#But for unseen data our model accuracy is lower, i.e 76.75%

#Therefore, we can say that our model is overfitted.

### Let's do hyper parameter tuning now and see how model score can be improved :

When we do hyperparameter tuning, we basically try to find those sets and values of hyperparameters which will give us a model with maximum accuracy. Let's go ahead and try to improve our model.

Preventing overfitting is pivotal while modeling a decision tree and it can be done in 2 ways:

1. Setting constraints on tree size
2. Tree pruning

In [34]:
#We will start with scaling our data.
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()

x_transform = scalar.fit_transform(X)

In [35]:
x_train,x_test,y_train,y_test = train_test_split(x_transform,y,test_size = 0.30, random_state= 35)

In [37]:
#let's first visualize the tree on the data without doing any pre processing
clf = DecisionTreeClassifier()

In [41]:
from sklearn.model_selection import train_test_split,GridSearchCV

In [38]:
# we are tuning three hyperparameters right now, we are passing the different values for both parameters
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,32,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter' : ['best', 'random']
    
}

In [39]:
grid_search = GridSearchCV(estimator=clf,
                     param_grid=grid_param,
                     cv=5,
                    n_jobs =-1)

In [40]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                     

In [42]:
best_parameters = grid_search.best_params_
print(best_parameters)

{'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 7, 'min_samples_split': 3, 'splitter': 'random'}


In [43]:
grid_search.best_score_

0.8185792349726777

In [44]:
clf = DecisionTreeClassifier(criterion = 'gini', max_depth =14, min_samples_leaf= 7, min_samples_split= 3, splitter ='random')
clf.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=14, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='random')

In [45]:
clf.score(x_test,y_test)

0.7989821882951654

### Conclusion :

Great, Our test score has improved after using Gridsearch.