# Model: Logistic Regression Model

I have tried many different models with a selection of features using Logistic Regression.  The best model (simplest model with highest accuracy) includes these features

important_features = ['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2', 'pclass_3'] # This is the best score




* title_Mr 
* title_Mrs 
* family_size 
* is_child
* pclass_2
* pclass_3

The family_size has been scaled with the transform_X function.

The highest accuracy and simplest model is shown below.

# Initialization

In [1]:
%run init.ipynb

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import scale

import great_expectations as ge
from progressbar import ProgressBar

RANDOM_STATE = 42

## Extract Clean Data

**Separate data into X (features) and y (label)**

In [3]:
from data.data import (transform_X_numerical, 
                       transform_X_categorical, 
                       transform_X)

In [4]:
Xy = pd.read_csv('../data/processed/train_v4.csv', index_col='passengerid', dtype={'pclass':str, 'is_child':int, 'is_traveling_alone':int})
Xy

Unnamed: 0_level_0,survived,pclass,name,sex,sibsp,parch,ticket,embarked,title,last_name,cabin_number,family_size,fare,fare_bin,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,S,Mr,Braund,21171.0,2,7.2500,q1,22.0,student,0,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C,Mrs,Cumings,17599.0,2,71.2833,q4,38.0,adult,0,0
3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,S,Miss,Heikkinen,3101282.0,1,7.9250,q1,26.0,young_adult,0,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,S,Mrs,Futrelle,113803.0,2,53.1000,q4,35.0,young_adult,0,0
5,0,3,"Allen, Mr. William Henry",male,0,0,373450,S,Mr,Allen,373450.0,1,8.0500,q2,35.0,young_adult,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,0,0,211536,S,Mr,Montvila,211536.0,1,13.0000,q2,27.0,young_adult,0,1
888,1,1,"Graham, Miss. Margaret Edith",female,0,0,112053,S,Miss,Graham,112053.0,1,30.0000,q3,19.0,student,0,1
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,1,2,W./C. 6607,S,Miss,Johnston,6607.0,4,23.4500,q3,22.0,student,0,0
890,1,1,"Behr, Mr. Karl Howell",male,0,0,111369,C,Mr,Behr,111369.0,1,30.0000,q3,26.0,young_adult,0,1


In [5]:
Xy.title.value_counts()

Mr        537
Miss      186
Mrs       128
Master     40
Name: title, dtype: int64

## Train Test Split Data

In [252]:
all_features = ['age', 'fare', 'family_size', 'is_child', 'is_traveling_alone',
                'sex_male', 'embarked_Q', 'embarked_S', 'title_Miss', 'title_Mr',
                'title_Mrs', 'age_bin_baby', 'age_bin_child', 'age_bin_senior',
                'age_bin_student', 'age_bin_teen', 'age_bin_young_adult', 'fare_bin_q2',
                'fare_bin_q3', 'fare_bin_q4', 'pclass_2', 'pclass_3']

important_features = ['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2', 'pclass_3'] # This is the best score and model with the fewest features


In [253]:
X_all = transform_X(Xy.drop(['name'], axis=1))
X = X_all[important_features]
y = Xy['survived']
X.shape

X

(891, 6)

Unnamed: 0_level_0,title_Mr,title_Mrs,family_size,is_child,pclass_2,pclass_3
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0,0.059160,0,0,1
2,0,1,0.059160,0,0,0
3,0,0,-0.560975,0,0,1
4,0,1,0.059160,0,0,0
5,1,0,-0.560975,0,0,1
...,...,...,...,...,...,...
887,1,0,-0.560975,0,1,0
888,0,0,-0.560975,0,0,0
889,0,0,1.299429,0,0,1
890,1,0,-0.560975,0,0,0


### Split data into train and test. 

In [254]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
y_test = y_test.to_frame()

print(f'Number of sample in training data = {len(X_train)}')
print(f'Number of sample in test data = {len(X_test)}')

Number of sample in training data = 712
Number of sample in test data = 179


### Logistic Regression with Age

In [255]:
X.columns

model = LogisticRegression(random_state=RANDOM_STATE, max_iter=500, fit_intercept=True,
                          penalty='l2', l1_ratio=1)

model.fit(X_train, y_train) 

y_pred, predicted_accuracy_score, cv_scores = pm.calc_model_rst_table_metrics(model, X_train, y_train, X_test, y_test, 
                                                      model_name='logreg_model_3b', cv=5, verbose=True)
    

Index(['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2',
       'pclass_3'],
      dtype='object')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=1, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


Accuracy Score on X_test,y_test:  0.8268


Cross Validation Scores:
	Accuracy 	: 0.8384 (+/- 0.0352)
	Recall		: 0.7162 (+/- 0.0677)
	Precision	: 0.8306 (+/- 0.0424)
	F1		: 0.7691 (+/- 0.0571)


11/22/19, logreg_model_3b,  <kaggle_accuracy>, 0.8384, 0.7162,0.8306,0.7691


# Optimize HyperParameters

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html 

**max_iter** : int, optional (default=100)
Maximum number of iterations taken for the solvers to converge.

**penalty** : str, ‘l1’, ‘l2’, ‘elasticnet’ or ‘none’, optional (default=’l2’)

    Used to specify the norm used in the penalization. The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties. 
    ‘elasticnet’ is only supported by the ‘saga’ solver. If ‘none’ (not supported by the liblinear solver), no regularization is applied.
    New in version 0.19: l1 penalty with SAGA solver (allowing ‘multinomial’ + L1)
    
**C** : float, optional (default=1.0)
     Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
     
**l1_ratio**: float or None, optional (default=None)
The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'`. Setting ``l1_ratio=0 is equivalent to using penalty='l2', while setting l1_ratio=1 is equivalent to using penalty='l1'. For 0 < l1_ratio <1, the penalty is a combination of L1 and L2.

In [256]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [257]:
def objective(hyperopt_space):
    
    params = {'max_iter':int(hyperopt_space['max_iter']),
              'C': float(hyperopt_space['C']),
              'penalty':str(hyperopt_space['penalty']),
              'l1_ratio':float(hyperopt_space['l1_ratio']),
              'solver':str(hyperopt_space['solver'])
             }
    
    model = LogisticRegression(class_weight=None, 
                               dual=False, 
                               fit_intercept=True,
                               intercept_scaling=1, 
                               multi_class='warn', 
                               n_jobs=2, 
                               random_state=42, 
                               tol=0.0001, 
                               verbose=1,
                               warm_start=False,
                               **params)
    
    best_score = (cross_val_score(model,
                                 X_train,
                                 y_train,
                                 scoring='accuracy', 
                                 cv=10, n_jobs=4)
                  .mean()
                 )
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK, 'best_score': best_score}

In [258]:
SOLVERS = ['saga']
MAX_ITER_LIST = [500]

In [259]:
space = {'max_iter': hp.choice('max_iter', [500]),
         'C': hp.uniform('C', 0, 1),
         'penalty': hp.choice('penalty', ['elasticnet']),
         'solver': hp.choice('solver', SOLVERS),
         'l1_ratio':hp.uniform('l1_ratio', 0, 1),
        }

In [260]:
best_result = fmin(fn=objective,
                   space=space,
                   max_evals=100,
                   rstate=np.random.RandomState(42),
                   algo=tpe.suggest
                  )

best_result                   

100%|██████████| 100/100 [00:46<00:00,  2.14it/s, best loss: 0.16163257321708036]


{'C': 0.8326320323149597,
 'l1_ratio': 0.45470105222137214,
 'max_iter': 0,
 'penalty': 0,
 'solver': 0}

### Rerun Model & Check Feature Selection

In [261]:
model = LogisticRegression(class_weight=None, 
                               dual=False, 
                               fit_intercept=True,
                               intercept_scaling=1, 
                               multi_class='warn', 
                               n_jobs=2, 
                               random_state=42, 
                               tol=0.0001, 
                               verbose=1,
                               warm_start=False,
                               C=best_result['C'], 
                               l1_ratio= best_result['l1_ratio'],
                               solver=SOLVERS[best_result['solver']], 
                               max_iter= MAX_ITER_LIST[best_result['max_iter']],
                               )

model

LogisticRegression(C=0.8326320323149597, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1,
                   l1_ratio=0.45470105222137214, max_iter=500,
                   multi_class='warn', n_jobs=2, penalty='l2', random_state=42,
                   solver='saga', tol=0.0001, verbose=1, warm_start=False)

In [262]:
model.fit(X_train, y_train) 

y_pred, predicted_accuracy_score, cv_scores = pm.calc_model_rst_table_metrics(model, X_train, y_train, X_test, y_test, 
                                                      model_name='logreg_model_3b', cv=5, verbose=True)

cv_scores

convergence after 28 epochs took 0 seconds


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(C=0.8326320323149597, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1,
                   l1_ratio=0.45470105222137214, max_iter=500,
                   multi_class='warn', n_jobs=2, penalty='l2', random_state=42,
                   solver='saga', tol=0.0001, verbose=1, warm_start=False)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.



Accuracy Score on X_test,y_test:  0.8268


Cross Validation Scores:
	Accuracy 	: 0.8384 (+/- 0.0352)
	Recall		: 0.7238 (+/- 0.0744)
	Precision	: 0.8254 (+/- 0.0492)
	F1		: 0.7709 (+/- 0.0574)


11/22/19, logreg_model_3b,  <kaggle_accuracy>, 0.8384, 0.7238,0.8254,0.7709


[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished


{'fit_time': array([0.19197798, 0.01017118, 0.00626993, 0.00675511, 0.01964521]),
 'score_time': array([0.00596786, 0.00677776, 0.00570703, 0.0052979 , 0.00702786]),
 'test_accuracy': array([0.85314685, 0.86013986, 0.83216783, 0.80985915, 0.83687943]),
 'test_recall': array([0.74074074, 0.75925926, 0.7037037 , 0.66037736, 0.75471698]),
 'test_precision': array([0.85106383, 0.85416667, 0.82608696, 0.79545455, 0.8       ]),
 'test_f1': array([0.79207921, 0.80392157, 0.76      , 0.72164948, 0.77669903])}

In [263]:
abs_coefficients_ranked = pd.DataFrame(data = abs(model.coef_), columns = X_train.columns).T.sort_values(by=0, ascending=False)
with pd.option_context('display.max_rows', 30):
	abs_coefficients_ranked.reset_index()#.head(10)['index'].values

Unnamed: 0,index,0
0,title_Mr,2.747685
1,pclass_3,1.755936
2,title_Mrs,0.963517
3,pclass_2,0.744827
4,is_child,0.71444
5,family_size,0.644649


# Prepare Submission

In [267]:
from models import kaggle

filename = 'logres_original_without_age_data_v4.csv'

X_holdout = pd.read_csv('../data/processed/holdout_v4.csv', 
                        index_col='passengerid', 
                        dtype={'pclass':str, 'is_child':int, 'is_traveling_alone':int})

X_test_kaggle_public = transform_X(X_holdout).reindex(X_test.columns, axis=1)

X_test_kaggle_public.describe()

y_pred = (pd.Series(model.predict(X_test_kaggle_public), 
                   index=X_test_kaggle_public.index, name='Survived').to_frame().sort_index()
         )

y_pred.index.names = ['PassengerId']

y_pred.to_csv(filename)

message = (f'{filename} \n\n'
           f'This is a submission test via the Kaggle API. \n\n'
           f'{model} \n\n'
           f'{X_test_kaggle_public.columns} \n\n'
          )

print(message)

In [268]:
kaggle.submit_to_kaggle_titanic_competition(filename, message, upload=False);

logres_original_without_age_data_v4.csv
logres_original_without_age_data_v4.csv 

This is a submission test via the Kaggle API. 

LogisticRegression(C=0.8326320323149597, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1,
                   l1_ratio=0.45470105222137214, max_iter=500,
                   multi_class='warn', n_jobs=2, penalty='l2', random_state=42,
                   solver='saga', tol=0.0001, verbose=1, warm_start=False) 

Index(['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2',
       'pclass_3'],
      dtype='object') 


None
None


In [None]:
filename