In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore')

# Show 2 decimals
pd.set_option('display.float_format', lambda x: '%.4f' % x)
#pd.options.display.float_format = '{:,.2f}'.format

#Butun kolon ve satirlari gormek icin
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set plot background
sns.set_theme(style="whitegrid")

# Euro sign €
#print ("%s"%(u"\N{euro sign}"))

In [2]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Step 1: Load the dataset

In [3]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.shape

(891, 12)

# Step 2: Select features

In [5]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'Embarked']]
df.shape

(889, 4)

In [8]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Embarked    0
dtype: int64

# Step 3: Cross-validate a model with one feature

In [9]:
X = df.loc[:, ['Pclass']]
y = df.Survived

In [10]:
X.shape

(889, 1)

In [11]:
y.shape

(889,)

In [12]:
logreg = LogisticRegression()

In [13]:
cross_val_score(logreg, X, y, cv=5, scoring='accuracy').mean()

0.6783406335301212

In [14]:
y.value_counts(normalize=True)

0   0.6175
1   0.3825
Name: Survived, dtype: float64

# Step 4: Encode categorical features

In [15]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [17]:
ohe = OneHotEncoder(sparse=False)

In [18]:
ohe.fit_transform(df[['Sex']])

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [19]:
ohe.categories_

[array(['female', 'male'], dtype=object)]

In [20]:
ohe.fit_transform(df[['Embarked']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [21]:
ohe.categories_

[array(['C', 'Q', 'S'], dtype=object)]

# Step 5: Cross-validate a Pipeline with all features

In [22]:
X = df.drop('Survived', axis= 'columns')
X.head()

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S


In [23]:
column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough')

In [24]:
column_trans.fit_transform(X)

array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

In [25]:
pipe = make_pipeline(column_trans, logreg)

In [26]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.7727924839713071

# Step 6: Make predictions on "new" data

In [33]:
X_new = X.sample(5, random_state=99)
X_new

Unnamed: 0,Pclass,Sex,Embarked
599,1,male,C
512,1,male,S
273,1,male,C
215,1,female,C
790,3,male,Q


In [34]:
pipe.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Sex', 'Embarked'])])),
                ('logisticregression', LogisticRegression())])

In [32]:
pipe.predict(X_new)

array([1, 1, 0, 1, 0])

# Recap

In [43]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [None]:
df = pd.read_csv('http://bit.ly/kaggletrain')
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'Embarked']]
X = df.drop('Survived', axis='columns')
y = df.Survived

In [None]:
column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough')
logreg = LogisticRegression(solver='lbfgs')

In [None]:
pipe = make_pipeline(column_trans, logreg)

In [None]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()


In [None]:
X_new = X.sample(5, random_state=99)

In [None]:
pipe.fit(X, y)
pipe.predict(X_new)

In [37]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

# GridSearchCV

In [47]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn import set_config
set_config(display='diagram')

In [56]:
df = pd.read_csv('titanic.csv')
cols = ['Sex', 'Name', 'Age', 'Embarked']
X = df[cols]
y = df['Survived']

In [58]:
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(), ['Sex', 'Embarked']),
     ('vectorizer', CountVectorizer(), 'Name'),
     ('imputer', SimpleImputer(), ['Age'])])

In [59]:
clf = LogisticRegression(solver='liblinear', random_state=1)

In [60]:
pipe = Pipeline ([('preprocessor', ct), ('classifier', clf)])

In [61]:
params = {}
params['preprocessor__ohe__drop'] = [None, 'first']
params['preprocessor__vectorizer__min_df'] = [1, 2, 3]
params['preprocessor__vectorizer__ngram_range'] = [(1, 1), (1, 2)]
params['classifier__C'] = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
params['classifier__penalty'] = ['l1', 'l2']

In [62]:
grid = GridSearchCV(pipe, params)
%time grid.fit(X, y)

CPU times: user 29.3 s, sys: 289 ms, total: 29.6 s
Wall time: 30.6 s


In [68]:
grid = GridSearchCV(pipe, params, n_jobs=-1)
%time grid.fit(X, y)
grid.best_params_

CPU times: user 2.13 s, sys: 101 ms, total: 2.23 s
Wall time: 16.9 s


{'classifier__C': 10,
 'classifier__penalty': 'l1',
 'preprocessor__ohe__drop': None,
 'preprocessor__vectorizer__min_df': 1,
 'preprocessor__vectorizer__ngram_range': (1, 2)}

In [70]:
params_best = {}
params['preprocessor__ohe__drop'] = [None]
params['preprocessor__vectorizer__min_df'] = [1]
params['preprocessor__vectorizer__ngram_range'] = [(1, 2)]
params['classifier__C'] = [10]
params['classifier__penalty'] = ['l1']

In [73]:
grid_tuned = GridSearchCV(pipe, params_best, n_jobs=-1)
%time grid_tuned.fit(X, y)

CPU times: user 108 ms, sys: 21.6 ms, total: 130 ms
Wall time: 350 ms


In [81]:
X_new = X.sample(5)
df.loc[X_new.index]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
427,428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louis...",female,19.0,0,0,250655,26.0,,S
110,111,0,1,"Porter, Mr. Walter Chamberlain",male,47.0,0,0,110465,52.0,C110,S
504,505,1,1,"Maioni, Miss. Roberta",female,16.0,0,0,110152,86.5,B79,S
557,558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C
607,608,1,1,"Daniel, Mr. Robert Williams",male,27.0,0,0,113804,30.5,,S


In [82]:
grid_tuned.predict(X_new)

array([1, 0, 1, 0, 0])

In [79]:
df.loc[737]

PassengerId                       738
Survived                            1
Pclass                              1
Name           Lesurer, Mr. Gustave J
Sex                              male
Age                           35.0000
SibSp                               0
Parch                               0
Ticket                       PC 17755
Fare                         512.3292
Cabin                            B101
Embarked                            C
Name: 737, dtype: object