In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the Titanic training split into a DataFrame.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
train = pd.read_csv(filepath_or_buffer="C:/Programming_Py_ML_AI/data/titanic/train.csv")

In [3]:
# Load the Titanic test split into a DataFrame.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
test = pd.read_csv(filepath_or_buffer="C:/Programming_Py_ML_AI/data/titanic/test.csv")

In [4]:
# Per-column missing-value counts for both splits, side by side.
# A column present in only one frame shows NaN for the other.
pd.concat(
    [train.isna().sum(), test.isna().sum()],
    axis=1,
    keys=['train', 'test'],
)

Unnamed: 0,train,test
PassengerId,0,0.0
Survived,0,
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,177,86.0
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,0,1.0


In [5]:
# Preview the first five rows (five is the default for head()).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Summarise column dtypes and non-null counts to see which features need imputation.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## **Cross Validation** for model selection

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold, KFold

In [8]:
# Baseline: 5-fold stratified CV accuracy of a default logistic regression
# on four numeric columns that have no missing values in train.
cv_5_fold = cross_validate(
    LogisticRegression(),
    train[['Pclass', 'SibSp', 'Parch', 'Fare']],
    train['Survived'],
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=123),
)

In [9]:
# Show the raw cross_validate result: per-fold fit time, score time and test score.
cv_5_fold

{'fit_time': array([0.00933099, 0.01139307, 0.00751138, 0.01663756, 0.00964594]),
 'score_time': array([0.        , 0.00615144, 0.        , 0.        , 0.        ]),
 'test_score': array([0.70391061, 0.70224719, 0.65168539, 0.6741573 , 0.69662921])}

In [10]:
# Average the per-fold accuracies into a single CV estimate.
cv_mean_acc = np.mean(cv_5_fold['test_score'])

In [11]:
# Report the mean 5-fold accuracy rounded to three decimals.
print(f'CV accuracy by logistic regression classifier: {cv_mean_acc:0.3f}')

CV accuracy by logistic regression classifier: 0.686


In [12]:
# Same baseline model and features, evaluated with 10 stratified folds.
cv_10 = cross_validate(
    LogisticRegression(),
    train[['Pclass', 'SibSp', 'Parch', 'Fare']],
    train['Survived'],
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    scoring='accuracy',
)

In [13]:
# Mean accuracy across the 10 folds.
np.mean(cv_10['test_score'])

0.6857553058676654

## Pipeline for normalization

In [14]:
from sklearn.preprocessing import PowerTransformer, StandardScaler

In [15]:
from sklearn.pipeline import Pipeline

In [16]:
# Two-step pipeline: power-transform the features, then fit a logistic regression.
# Because the transformer lives inside the pipeline, it is re-fitted on each CV training fold.
simple_pipe = Pipeline([
    ('normalize', PowerTransformer()),
    ('liner_model', LogisticRegression()),
])

In [17]:
# 5-fold stratified CV accuracy of the normalise-then-classify pipeline.
cv_pipe = cross_validate(
    simple_pipe,
    train[['Pclass', 'SibSp', 'Parch', 'Fare']],
    train['Survived'],
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)

In [18]:
# Mean accuracy of the pipeline across folds.
np.mean(cv_pipe['test_score'])

0.7115749168288243

### Getting Mean

In [19]:
# Work on an independent (deep) copy so the manual imputation below leaves `train` untouched.
train_copy = train.copy(deep=True)

In [20]:
# Manual mean imputation: replace missing ages with the mean of the observed ages.
train_copy['Age'] = train_copy['Age'].fillna(train_copy['Age'].mean())

In [21]:
# Mean age after the manual imputation on the copy.
train_copy['Age'].mean()

29.69911764705882

### Simple imputation
#### Important parameter: *strategy*
`strategy`: str, default='mean'.
The following options are available:

mean: only for numeric data and suitable for non-skewed data

median: only for numeric data, but robust to outliers

most_frequent: suitable for both numeric and string data

constant: suitable for both numeric and string data, but the `fill_value` parameter must also be set

For details, see the [SimpleImputer documentation](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer).

In [22]:
from sklearn.impute import SimpleImputer

In [23]:
# Learn the mean age from the training split only, then apply that same
# value to both splits so no test-set statistics are used.
impute = SimpleImputer(strategy='mean')
impute.fit(train[['Age']])
imputed_train_age = impute.transform(train[['Age']])
imputed_test_age = impute.transform(test[['Age']])

In [24]:
# Mean of the imputed training ages.
np.mean(imputed_train_age)

29.69911764705882

In [25]:
# Mean of the original (un-imputed) Age column, for comparison with the imputed mean above.
train['Age'].mean()

29.69911764705882

### Pipeline for OneHotEncoder

In [26]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [27]:
# Dense one-hot encoder; categories unseen during fit are ignored at transform time.
one = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [28]:
# Show the original Sex column next to its one-hot encoding (this also fits `one`).
pd.concat(
    [
        train['Sex'],
        pd.DataFrame(one.fit_transform(train[['Sex']]), index=train.index),
    ],
    axis=1,
)

Unnamed: 0,Sex,0,1
0,male,0.0,1.0
1,female,1.0,0.0
2,female,1.0,0.0
3,female,1.0,0.0
4,male,0.0,1.0
...,...,...,...
886,male,0.0,1.0
887,female,1.0,0.0
888,female,1.0,0.0
889,male,0.0,1.0


In [29]:
# Pipeline that one-hot encodes its input and then fits a logistic regression.
# Each step must be an estimator *instance*: passing the LogisticRegression
# class itself would only fail later, when the pipeline is fitted.
one_pipe = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore')),
                           ('model', LogisticRegression())])

### Column Transformer

In [30]:
from sklearn.compose import ColumnTransformer

In [31]:
# Mean-impute Age and Fare; every other column is passed through unchanged.
impute_col = ColumnTransformer(
    transformers=[
        ('impute', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
    ],
    remainder='passthrough',
)

In [32]:
# Standardise Age and Fare; every other column is passed through unchanged.
scale_col = ColumnTransformer(
    transformers=[
        ('normalize', StandardScaler(), ['Age', 'Fare']),
    ],
    remainder='passthrough',
)

In [33]:
# One-hot encode Sex and Embarked; every other column is passed through unchanged.
encode_col = ColumnTransformer(
    transformers=[
        ('encode', OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked']),
    ],
    remainder='passthrough',
)

In [34]:
# Combined preprocessing: impute then standardise Age/Fare, one-hot encode
# Sex/Embarked, and pass the remaining columns through.
#
# Chaining several name-based ColumnTransformers in a Pipeline does not work:
# the first one outputs an array (not a DataFrame), so the later steps can no
# longer select 'Age', 'Fare', 'Sex' or 'Embarked' by name. Sequential steps
# for the same columns are nested in a Pipeline *inside* one ColumnTransformer.
multicol_transform = ColumnTransformer(
    [('numeric',
      Pipeline([('impute', SimpleImputer(strategy='mean')),
                ('normalize', StandardScaler())]),
      ['Age', 'Fare']),
     ('encode', OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked'])],
    remainder='passthrough'
)

In [35]:
# Single ColumnTransformer: mean-impute Age/Fare, one-hot encode Sex/Embarked,
# and pass every remaining column through unchanged.
multicol_transform = ColumnTransformer(
    transformers=[
        ('impute', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
        ('encode', OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked']),
    ],
    remainder='passthrough',
)

In [43]:
# Fit the preprocessing on the training features and materialise the transformed matrix.
# NOTE(review): fitted on the full training set, so any later cross-validation
# on `train_prepared` sees imputation statistics from its validation folds; confirm acceptable.
train_prepared = multicol_transform.fit_transform(
    train.drop(columns=['Name', 'Ticket', 'Cabin', 'Survived'])
)

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
# Full pipeline: column-wise preprocessing followed by a small random forest.
# random_state is fixed so that cross-validation scores are reproducible
# across kernel restarts (the forest is otherwise randomly seeded).
# NOTE(review): max_features=15 is larger than the 12 columns the feature-importance
# array suggests the preprocessing yields; presumably treated as "all features". Confirm.
multi_pipe = Pipeline(steps=[('preprocess', multicol_transform),
                             ('model', RandomForestClassifier(n_estimators=10,
                                                              max_features=15,
                                                              random_state=123))])
multi_pipe

In [56]:
# Same preprocessing, with a logistic regression as the final estimator
# (max_iter raised to 2000).
multi_pipe_logi = Pipeline([
    ('preprocess', multicol_transform),
    ('model', LogisticRegression(max_iter=2000)),
])
multi_pipe_logi

In [57]:
# Render estimators as an interactive HTML diagram instead of plain text,
# then display the random-forest pipeline to inspect its structure.
from sklearn import set_config
set_config(display="diagram")
multi_pipe

In [58]:
#

In [59]:
# 5-fold stratified CV accuracy of the full pipeline (`multi_pipe`).
# Preprocessing is re-fitted inside each fold because it is part of the pipeline.
cv_multi_pipe = cross_validate(
    multi_pipe,
    train.drop(columns=['Name', 'Ticket', 'Cabin', 'Survived']),
    train['Survived'],
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=123),
)

In [60]:
# Mean accuracy of the full pipeline across the CV folds.
accuracy = np.mean(cv_multi_pipe['test_score'])

In [61]:
# Report the mean CV accuracy as a percentage.
# `accuracy` comes from cross-validating `multi_pipe`, whose final step is a
# RandomForestClassifier, so the label names that model.
print(f'CV accuracy rate with Random Forest: {accuracy*100:.2f}%')

CV accuracy rate with Logistic Regression: 80.59%


## Grid Search for Model Selection 

In [37]:
from sklearn.model_selection import GridSearchCV

In [48]:
# Two sub-grids for the random forest search:
#   1. larger forests over a range of max_features values;
#   2. small forests with bootstrapping switched off.
param_grid = [
    dict(n_estimators=[10, 15, 20], max_features=[10, 12, 14, 15]),
    dict(bootstrap=[False], n_estimators=[2, 4, 6], max_features=[2, 4, 6]),
]

In [49]:
# Base estimator for the grid search. The seed is fixed so that the selected
# hyper-parameters and scores are reproducible between runs.
forest_class = RandomForestClassifier(random_state=123)

In [50]:
# Exhaustive search over `param_grid` with 5-fold CV, scored by accuracy;
# training-fold scores are kept as well.
grid_search = GridSearchCV(
    estimator=forest_class,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)

In [51]:
# Feature matrix for the grid search: the already-preprocessed training data.
# NOTE(review): preprocessing was fitted before splitting into CV folds; confirm this is intended.
X = train_prepared

In [52]:
# Target vector: the Survived column of the training frame.
y = train['Survived']

In [53]:
# Run the grid search over every parameter combination.
grid_search.fit(X, y)

In [54]:
# Parameter combination with the best mean CV accuracy.
# NOTE(review): a selected max_features above the number of columns in X
# presumably just means "all features"; verify against the installed scikit-learn version.
grid_search.best_params_

{'max_features': 15, 'n_estimators': 10}

In [66]:
# (rows, columns) of the raw training frame.
train.shape

(891, 12)

In [67]:
# Feature importances of the best estimator found by the grid search.
feature_importance = grid_search.best_estimator_.feature_importances_

In [68]:
# One importance value per column of the preprocessed matrix.
# NOTE(review): assuming the usual ColumnTransformer ordering (imputed, encoded, then
# passthrough columns), the largest value would belong to PassengerId, an identifier
# column. Confirm with multicol_transform.get_feature_names_out() and consider dropping it.
feature_importance

array([0.16843243, 0.1621294 , 0.16919895, 0.11297009, 0.01343469,
       0.00351967, 0.01146204, 0.        , 0.19208494, 0.10535735,
       0.04971274, 0.01169769])