# Model: Gender Only

https://www.kaggle.com/c/titanic/overview

Features included in this model are:
    
* age 
* sibsp 
* parch 
* fare


* pclass
* sex
* ticket
* embarked

The numerical features are scaled.  

# Initialization

In [6]:
%run init.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import scale

import great_expectations as ge
from progressbar import ProgressBar

RANDOM_STATE = 42

## Extract Clean Data

**Separate data into X (features) and y (label)**

In [8]:
from data.data import (transform_X_numerical, 
                       transform_X_categorical, 
                       transform_X)

In [9]:
Xy = pd.read_csv('../data/processed/train_v4.csv', index_col='passengerid', dtype={'pclass':str, 'is_child':int, 'is_traveling_alone':int})
Xy

Unnamed: 0_level_0,survived,pclass,name,sex,sibsp,parch,ticket,embarked,title,last_name,cabin_number,family_size,fare,fare_bin,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,S,Mr,Braund,21171.0,2,7.2500,q1,22.0,student,0,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C,Mrs,Cumings,17599.0,2,71.2833,q4,38.0,adult,0,0
3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,S,Miss,Heikkinen,3101282.0,1,7.9250,q1,26.0,young_adult,0,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,S,Mrs,Futrelle,113803.0,2,53.1000,q4,35.0,young_adult,0,0
5,0,3,"Allen, Mr. William Henry",male,0,0,373450,S,Mr,Allen,373450.0,1,8.0500,q2,35.0,young_adult,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,0,0,211536,S,Mr,Montvila,211536.0,1,13.0000,q2,27.0,young_adult,0,1
888,1,1,"Graham, Miss. Margaret Edith",female,0,0,112053,S,Miss,Graham,112053.0,1,30.0000,q3,19.0,student,0,1
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,1,2,W./C. 6607,S,Miss,Johnston,6607.0,4,23.4500,q3,22.0,student,0,0
890,1,1,"Behr, Mr. Karl Howell",male,0,0,111369,C,Mr,Behr,111369.0,1,30.0000,q3,26.0,young_adult,0,1


In [10]:
Xy.title.value_counts()

Mr        537
Miss      186
Mrs       128
Master     40
Name: title, dtype: int64

## Train Test Split Data

In [12]:
important_features = ['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2', 'pclass_3']

In [13]:
X_all = transform_X(Xy.drop(['name'], axis=1))
X = X_all[important_features]
y = Xy['survived']
X.shape

X

(891, 6)

Unnamed: 0_level_0,title_Mr,title_Mrs,family_size,is_child,pclass_2,pclass_3
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0,0.059160,0,0,1
2,0,1,0.059160,0,0,0
3,0,0,-0.560975,0,0,1
4,0,1,0.059160,0,0,0
5,1,0,-0.560975,0,0,1
...,...,...,...,...,...,...
887,1,0,-0.560975,0,1,0
888,0,0,-0.560975,0,0,0
889,0,0,1.299429,0,0,1
890,1,0,-0.560975,0,0,0


### Split data into train and test. 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
y_test = y_test.to_frame()

print(f'Number of sample in training data = {len(X_train)}')
print(f'Number of sample in test data = {len(X_test)}')

Number of sample in training data = 712
Number of sample in test data = 179


### Logistic Regression with Age

In [15]:
X.columns

model = LogisticRegression(random_state=RANDOM_STATE, max_iter=500, fit_intercept=True,
                          penalty='l2', l1_ratio=1)

model.fit(X_train, y_train) 

y_pred, predicted_accuracy_score, cv_scores = pm.calc_model_rst_table_metrics(model, X_train, y_train, X_test, y_test, 
                                                      model_name='logreg_model_3b', cv=5, verbose=True)
    

Index(['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2',
       'pclass_3'],
      dtype='object')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=1, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


Accuracy Score on X_test,y_test:  0.8268


Cross Validation Scores:
	Accuracy 	: 0.8384 (+/- 0.0352)
	Recall		: 0.7162 (+/- 0.0677)
	Precision	: 0.8306 (+/- 0.0424)
	F1		: 0.7691 (+/- 0.0571)


11/18/19, logreg_model_3b,  <kaggle_accuracy>, 0.8384, 0.7162,0.8306,0.7691


# Prepare Submission

In [19]:
from models import kaggle

filename = 'logres_model_3_data_v4.csv'

In [20]:
X_holdout = pd.read_csv('../data/processed/holdout_v4.csv', 
                        index_col='passengerid', 
                        dtype={'pclass':str, 'is_child':int, 'is_traveling_alone':int})

X_test_kaggle_public = transform_X(X_holdout).reindex(X_test.columns, axis=1)

X_test_kaggle_public.describe()

Unnamed: 0,title_Mr,title_Mrs,family_size,is_child,pclass_2,pclass_3
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.586124,0.177033,-2.9747600000000003e-17,0.069378,0.222488,0.521531
std,0.493117,0.382154,1.001198,0.2544,0.416416,0.500135
min,0.0,0.0,-0.5534426,0.0,0.0,0.0
25%,0.0,0.0,-0.5534426,0.0,0.0,0.0
50%,1.0,0.0,-0.5534426,0.0,0.0,1.0
75%,1.0,0.0,0.1056429,0.0,0.0,1.0
max,1.0,1.0,6.037412,1.0,1.0,1.0


In [21]:
y_pred = (pd.Series(model.predict(X_test_kaggle_public), 
                   index=X_test_kaggle_public.index, name='Survived').to_frame().sort_index()
         )

y_pred.index.names = ['PassengerId']

y_pred.to_csv(filename)

message = (f'{filename} \n\n'
           f'This is a submission test via the Kaggle API. \n\n'
           f'{model} \n\n'
           f'{X_test_kaggle_public.columns} \n\n'
          )

print(message)

logres_model_3_data_v4.csv 

This is a submission test via the Kaggle API. 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=1, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False) 

Index(['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2',
       'pclass_3'],
      dtype='object') 




In [24]:
kaggle.submit_to_kaggle_titanic_competition(filename, message);

logres_model_3_data_v4.csv
logres_model_3_data_v4.csv 

This is a submission test via the Kaggle API. 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=1, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False) 

Index(['title_Mr', 'title_Mrs', 'family_size', 'is_child', 'pclass_2',
       'pclass_3'],
      dtype='object') 


b'Successfully submitted to Titanic: Machine Learning from Disaster'
b'\r  0%|          | 0.00/2.77k [00:00<?, ?B/s]\r100%|\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88| 2.77k/2.77k [00:00<00:00, 12.1kB/s]\r100%|\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88\xe2\x96\x88| 2.77k/2.77k [00:02<00:00, 1.15kB/s]\n'


In [None]:
filename