In [1]:
#Libraries.
import pandas as pd

### Data Import.

In [4]:
#read the data. titanic data, from Kaggle.
data_input = pd.read_csv('./data/titanic_train.csv')
data_test  = pd.read_csv('./data/titanic_test.csv')

data_input.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Preprocessing.

In [151]:
#separate target var 'Survived' before preprocessing.
data_copy = data_input.copy()
data_y = data_copy['Survived']
data   = data_copy.drop(columns=['Survived'])

#1. Numerical data.
#1) Null check.
data.info()  #Null values in 'Age', 'Cabin', 'Embarked'.
#1 - Delete row without 'Embarked'.
data = data.dropna(subset=['Embarked'])
#2 - Delete col 'Cabin'.
data = data.drop(columns=['Cabin'])
#3 - Fill nulls in 'Age' with mean value.
mean_age              = data['Age'].mean()
data['Age']           = data['Age'].fillna(mean_age)
data.info()

#Null check (sklearn.impute).
cols_cat           = data.select_dtypes('object').columns  #col names of cat cols.
data_cat           = data[cols_cat]                        #get cat cols.
data_num           = data.drop(columns=cols_cat)           #get num cols.
cols_num           = data_num.columns                      #col names of num cols.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')    #fill null values with median value for each col.
imputer.fit(data_num)                         #automatically calculates each median.
data_num_prep = imputer.transform(data_num)   #actually fill in given data.
imputer.statistics_                           #save median values for each num col.

#data_train_prep = imputer.fit_transform(data_train_cpy_num)   #at once!

data_num_prep = pd.DataFrame(data_num,        #returned dtype is np.ndarray -> pd.DataFrame.
                             columns=cols_num, index=data_num.index)


#2) Feature scaling.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
mmx_sclr = MinMaxScaler()
mmx_sclr.fit_transform(data_num_prep)

#2. Categorical data.
#1) Null check.
#2) Cat -> Num.
    #Ordinal encoder.
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
data_cat_prep = ord_enc.fit_transform(data_cat)
ord_enc.categories_    #saves which ordinal number is assigned for each col's value.

    #One-hot encoder.
'''
from sklearn.preprocessing import OneHotEncoder
one_enc = OneHotEncoder()
data_train_prep_cat = one_enc.fit_transform(data_train_cpy_cat)    
one_enc.categories_
data_train_prep_cat.toarray()   #scipy sparse matrix -> np.ndarray
'''


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Pclass       889 non-null    int64  
 2   Name    

'\nfrom sklearn.preprocessing import OneHotEncoder\none_enc = OneHotEncoder()\ndata_train_prep_cat = one_enc.fit_transform(data_train_cpy_cat)    \none_enc.categories_\ndata_train_prep_cat.toarray()   #scipy sparse matrix -> np.ndarray\n'

In [153]:
#Pipeline.
#Each preprocessing unit should be independent & asynchronized.

#separate target var 'Survived' before preprocessing.
data_copy = data_input.copy()
data_y = data_copy['Survived']
data   = data_copy.drop(columns=['Survived'])

#Numerical data.
from sklearn.pipeline import Pipeline
pipe_num = Pipeline([
    ('imp_num', SimpleImputer(strategy='mean')),
    ('sclr_mmx', MinMaxScaler()),
])

#Categorical data.
pipe_cat = Pipeline([
    ('imp_cat', SimpleImputer(strategy='most_frequent')),
    ('enc_ord', OrdinalEncoder()),
    ('sclr_mmx', MinMaxScaler())
])

#Numerical + Categorical.
from sklearn.compose import ColumnTransformer
pipe_prep = ColumnTransformer([                 #<Note> order or cols change, cols_num + cols_cat!
    ('pipe_num', pipe_num, cols_num),
    ('pipe_cat', pipe_cat, cols_cat)
])
data_prep = pipe_prep.fit_transform(data)
cols_prep = cols_num.tolist() + cols_cat.tolist()
data_prep = pd.DataFrame(data_prep, columns=cols_prep, index=data.index)

### Model Selection.

In [188]:
#train-val split.
from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y = train_test_split(data_prep, data_y, train_size=0.7, stratify=data_y)

#Try several candidates.
accuracy = {}

    #dt.
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier()
clf_dt.fit(train_x, train_y)
val_y_pred = clf_dt.predict(val_x)
accuracy['dt'] = round(sum(val_y.to_numpy() == val_y_pred) / len(val_y), 4)

    #svm.
from sklearn.svm import SVC
clf_svc = SVC()
clf_svc.fit(train_x, train_y)
val_y_pred = clf_svc.predict(val_x)
accuracy['svc'] = round(sum(val_y.to_numpy() == val_y_pred) / len(val_y), 4)

#k-cross validation.
from sklearn.model_selection import cross_val_score
accuracy_10 = cross_val_score(clf_dt, data_prep, data_y, cv=10, scoring='accuracy')

print(accuracy)

{'dt': 0.7761, 'svc': 0.7985}


### Model tunning.

In [204]:
#Hyperparams, grid search.
from sklearn.model_selection import GridSearchCV
hyperparams_grid = [
    {'C' : [1.0, 2.0, 3.0], 'degree' : [3, 5, 10]},
    {'C' : [1.0, 3.0], 'shrinking' : [True, False]}
]
clf_svc = SVC()
grid_search = GridSearchCV(clf_svc, hyperparams_grid, cv=5,
                           scoring='accuracy', return_train_score=True)
grid_search.fit(data_prep, data_y)
grid_search.best_params_
grid_search.best_estimator_
grid_search.best_score_
grid_search.cv_results_          #train & test scores for each combination. fit time.

#Randomized search.
from sklearn.model_selection import RandomizedSearchCV
clf_svc = SVC()
rand_search = RandomizedSearchCV(clf_svc, hyperparams_grid, cv=5,
                                 scoring='accuracy', return_train_score=True)
rand_search.fit(data_prep, data_y)

clf_svc = grid_search.best_estimator_