### Modelling

Currently setting this up to play around with some of the following:
* Implementing models
* Implementing different preproccessing techniques
* Visualising validation metrics
* Analysing feature importance
* Pickling models
* etc.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

# Modelling
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Constants
test_size=0.3
random_state=101

In [5]:
df = pd.read_pickle("titanic_train_clean")

In [6]:
X = df.drop(["Survived"], axis=1)
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
def print_model_scores(model, X_test=X_test):
    predictions = model.predict(X_test)
    
    print(f"accuracy: {accuracy_score(y_test, predictions)}")
    print(f"precision: {precision_score(y_test, predictions)}")
    print(f"recall: {recall_score(y_test, predictions)}")
    print(f"f1 score: {f1_score(y_test, predictions)}")

### Pipeline model

In [8]:
model = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(scoring="f1", verbose=2)    
)

In [9]:
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='auto',
                                      n_jobs=None, penalty='l2',
                                      random_state=None, refit=True,
                                      scoring='f1', solver='lbfgs', tol=0.0001,
                                      verbose=2))],
         verbose=False)

In [12]:
print_model_scores(model)

accuracy: 0.8127340823970037
precision: 0.813953488372093
recall: 0.6730769230769231
f1 score: 0.736842105263158


### Grid search

In [13]:
model_params = {
    "Cs": [1,5,10],
    "fit_intercept": [True, False],
    "penalty": ["l1", "l2", "elasticnet"],
    "solver": ['newton-cg', 'lbfgs', 'saga']
}



grid = GridSearchCV(LogisticRegressionCV(), model_params)

In [14]:
grid.fit(X_train_scaled, y_train)

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: l1_ratios must be a list of numbers between 0 and 1; got (l1_ratios=None)


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: l1_ratios must be a list of numbers between 0 and 1; got (l1_ratios=None)


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver lbfgs 

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: l1_ratios must be a list of numbers between 0 and 1; got (l1_ratios=None)


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.


ValueError: l1_ratios must be a list of numbers between 0 and 1; got (l1_ratios=None)




GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                            dual=False, fit_intercept=True,
                                            intercept_scaling=1.0,
                                            l1_ratios=None, max_iter=100,
                                            multi_class='auto', n_jobs=None,
                                            penalty='l2', random_state=None,
                                            refit=True, scoring=None,
                                            solver='lbfgs', tol=0.0001,
                                            verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'Cs': [1, 5, 10], 'fit_intercept': [True, False],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['newton-cg', 'lbfgs', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train

In [15]:
grid.best_params_

{'Cs': 10, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'newton-cg'}

In [16]:
print_model_scores(grid, X_test_scaled)

accuracy: 0.8127340823970037
precision: 0.813953488372093
recall: 0.6730769230769231
f1 score: 0.736842105263158


### Just model - no pipeline

In [17]:
model = LogisticRegressionCV(scoring="f1", verbose=1)

model.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring='f1',
                     solver='lbfgs', tol=0.0001, verbose=1)

In [18]:
coefs = pd.DataFrame(
    model.coef_[0],
    columns=['Coefficients'], index=X_train.columns
)

coefs

Unnamed: 0,Coefficients
Pclass,-0.469852
Age,-0.324801
SibSp,-0.165894
Parch,-0.054341
Fare,0.109002
Cabin_value_present,0.281322
male,-0.991706
Q,-0.023385
S,-0.141244


### Feature importance