# Model: Gender Only

https://www.kaggle.com/c/titanic/overview

Features included in this model are:
    
* age 
* sibsp 
* parch 
* fare


* pclass
* sex
* ticket
* embarked

The numerical features are scaled.  

# Initialization

In [1]:
%run init.ipynb

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import scale

import great_expectations as ge

RANDOM_STATE = 42

# Define

In [19]:

def transform_X_numerical(Xy,
                          columns=['age', 'fare', 'family_size']
                         ):

    # Scale the numerical columns.
    return pd.DataFrame(scale(Xy[columns]), index=Xy.index, columns=columns )
    

def transform_X_categorical(Xy,
                columns=['sex', 'embarked', 'title', 'age_bin', 'is_child', 'is_travelling_alone']):
    
    # Encode the categorical features. The first category will be dropped.
     return pd.get_dummies(Xy[columns], drop_first=True)
    

def transform_X(Xy,
                numerical_columns=['age', 'fare', 'family_size'],
                categorical_columns=['sex', 'embarked', 'title', 'age_bin', 'is_child', 'is_travelling_alone']):

    # Scale the numerical columns.
    X_numerical = transform_X_numerical(Xy, numerical_columns)
    
    # Encode the categorical features. The first category will be dropped.
    X_cat_encoded =  transform_X_categorical(Xy, categorical_columns)
    
    return X_numerical.join(X_cat_encoded)

## Extract Clean Data

**Separate data into X (features) and y (label)**

In [43]:
Xy = pd.read_csv('../data/processed/train.csv', index_col='passengerid')
Xy

Unnamed: 0_level_0,survived,pclass,name,sex,sibsp,parch,ticket,fare,embarked,title,last_name,cabin_number,family_size,age_estimate,age,age_bin,is_child,is_travelling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.2500,S,Mr,Braund,21171.0,2,33.0,22.0,"(20.0, 30.0]",False,False
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C,Mrs,Cumings,17599.0,2,35.9,38.0,"(30.0, 40.0]",False,False
3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.9250,S,Miss,Heikkinen,3101282.0,1,22.0,26.0,"(20.0, 30.0]",False,True
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1000,S,Mrs,Futrelle,113803.0,2,35.9,35.0,"(30.0, 40.0]",False,False
5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.0500,S,Mr,Allen,373450.0,1,33.0,35.0,"(30.0, 40.0]",False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,0,0,211536,13.0000,S,Mr,Montvila,211536.0,1,33.0,27.0,"(20.0, 30.0]",False,True
888,1,1,"Graham, Miss. Margaret Edith",female,0,0,112053,30.0000,S,Miss,Graham,112053.0,1,22.0,19.0,"(10.0, 20.0]",False,True
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,1,2,W./C. 6607,23.4500,S,Miss,Johnston,6607.0,4,22.0,22.0,"(20.0, 30.0]",False,False
890,1,1,"Behr, Mr. Karl Howell",male,0,0,111369,30.0000,C,Mr,Behr,111369.0,1,33.0,26.0,"(20.0, 30.0]",False,True


## Train Test Split Data

In [44]:
Xy_input = Xy.drop(['name'], axis=1)
Xy_input

Unnamed: 0_level_0,survived,pclass,sex,sibsp,parch,ticket,fare,embarked,title,last_name,cabin_number,family_size,age_estimate,age,age_bin,is_child,is_travelling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3,male,1,0,A/5 21171,7.2500,S,Mr,Braund,21171.0,2,33.0,22.0,"(20.0, 30.0]",False,False
2,1,1,female,1,0,PC 17599,71.2833,C,Mrs,Cumings,17599.0,2,35.9,38.0,"(30.0, 40.0]",False,False
3,1,3,female,0,0,STON/O2. 3101282,7.9250,S,Miss,Heikkinen,3101282.0,1,22.0,26.0,"(20.0, 30.0]",False,True
4,1,1,female,1,0,113803,53.1000,S,Mrs,Futrelle,113803.0,2,35.9,35.0,"(30.0, 40.0]",False,False
5,0,3,male,0,0,373450,8.0500,S,Mr,Allen,373450.0,1,33.0,35.0,"(30.0, 40.0]",False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,male,0,0,211536,13.0000,S,Mr,Montvila,211536.0,1,33.0,27.0,"(20.0, 30.0]",False,True
888,1,1,female,0,0,112053,30.0000,S,Miss,Graham,112053.0,1,22.0,19.0,"(10.0, 20.0]",False,True
889,0,3,female,1,2,W./C. 6607,23.4500,S,Miss,Johnston,6607.0,4,22.0,22.0,"(20.0, 30.0]",False,False
890,1,1,male,0,0,111369,30.0000,C,Mr,Behr,111369.0,1,33.0,26.0,"(20.0, 30.0]",False,True


In [26]:
X = transform_X(Xy.drop(['name'], axis=1))
y = Xy['survived']
X.shape
X

(891, 17)

Unnamed: 0_level_0,age,fare,family_size,is_child,is_travelling_alone,sex_male,embarked_Q,embarked_S,title_Miss,title_Mr,title_Mrs,"age_bin_(10.0, 20.0]","age_bin_(20.0, 30.0]","age_bin_(30.0, 40.0]","age_bin_(40.0, 50.0]","age_bin_(50.0, 60.0]","age_bin_(60.0, inf]"
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,-0.590381,-0.502445,0.059160,False,False,1,0,1,0,1,0,0,1,0,0,0,0
2,0.614749,0.786845,0.059160,False,False,0,0,0,0,0,1,0,0,1,0,0,0
3,-0.289098,-0.488854,-0.560975,False,True,0,0,1,1,0,0,0,1,0,0,0,0
4,0.388787,0.420730,0.059160,False,False,0,0,1,0,0,1,0,0,1,0,0,0
5,0.388787,-0.486337,-0.560975,False,True,1,0,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,-0.213778,-0.386671,-0.560975,False,True,1,0,1,0,1,0,0,1,0,0,0,0
888,-0.816343,-0.044381,-0.560975,False,True,0,0,1,1,0,0,1,0,0,0,0,0
889,-0.590381,-0.176263,1.299429,False,False,0,0,1,1,0,0,0,1,0,0,0,0
890,-0.289098,-0.044381,-0.560975,False,True,1,0,0,0,1,0,0,1,0,0,0,0


### Split data into train and test. 

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
y_test = y_test.to_frame()

print(f'Number of sample in training data = {len(X_train)}')
print(f'Number of sample in test data = {len(X_test)}')

Number of sample in training data = 712
Number of sample in test data = 179


### Logistic Regression with Age

In [31]:
X.columns

model = LogisticRegression()
model.fit(X_train, y_train) 

y_pred = pd.Series(model.predict(X_test), 
                   index=y_test.index, name='survived_pred').to_frame()
y_pred

Index(['age', 'fare', 'family_size', 'is_child', 'is_travelling_alone',
       'sex_male', 'embarked_Q', 'embarked_S', 'title_Miss', 'title_Mr',
       'title_Mrs', 'age_bin_(10.0, 20.0]', 'age_bin_(20.0, 30.0]',
       'age_bin_(30.0, 40.0]', 'age_bin_(40.0, 50.0]', 'age_bin_(50.0, 60.0]',
       'age_bin_(60.0, inf]'],
      dtype='object')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Unnamed: 0_level_0,survived_pred
passengerid,Unnamed: 1_level_1
710,1
440,0
841,0
721,1
40,1
...,...
434,0
774,0
26,0
85,1


## Calculate Metrics

In [32]:
pm.calc_model_rst_table_metrics(model, X_train, y_train,)



Accuracy: 0.8286 (+/- 0.0443)
Recall: 0.7238 (+/- 0.0887)
Precision: 0.8013 (+/- 0.0607)
F1: 0.7602 (+/- 0.0710)


10/28/19, <model>, 0.8286, 0.7238,0.8013,0.7602, NS


# Prepare Submission

In [61]:
X = pd.read_csv('../data/processed/holdout.csv', index_col='passengerid')
X

Unnamed: 0_level_0,pclass,name,sex,sibsp,parch,ticket,fare,embarked,title,last_name,cabin_number,family_size,age_estimate,age,age_bin,is_child,is_travelling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
892,3,"Kelly, Mr. James",male,0,0,330911,7.8292,Q,Mr,Kelly,330911,1,33.0,34.5,"(30.0, 40.0]",False,True
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,1,0,363272,7.0000,S,Mrs,Wilkes,363272,2,35.9,47.0,"(40.0, 50.0]",False,False
894,2,"Myles, Mr. Thomas Francis",male,0,0,240276,9.6875,Q,Mr,Myles,240276,1,33.0,62.0,"(60.0, inf]",False,True
895,3,"Wirz, Mr. Albert",male,0,0,315154,8.6625,S,Mr,Wirz,315154,1,33.0,27.0,"(20.0, 30.0]",False,True
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1,1,3101298,12.2875,S,Mrs,Hirvonen,3101298,3,35.9,22.0,"(20.0, 30.0]",False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,0,0,A.5. 3236,8.0500,S,Mr,Spector,3236,1,33.0,33.0,"(30.0, 40.0]",False,True
1306,1,"Oliva y Ocana, Dona. Fermina",female,0,0,PC 17758,108.9000,C,Mrs,Oliva y Ocana,17758,1,35.9,39.0,"(30.0, 40.0]",False,True
1307,3,"Saether, Mr. Simon Sivertsen",male,0,0,SOTON/O.Q. 3101262,7.2500,S,Mr,Saether,3101262,1,33.0,38.5,"(30.0, 40.0]",False,True
1308,3,"Ware, Mr. Frederick",male,0,0,359309,8.0500,S,Mr,Ware,359309,1,33.0,33.0,"(30.0, 40.0]",False,True


In [62]:
X_test_kaggle_public = transform_X(X).reindex(X_test.columns, axis=1).fillna(0)
X_test_kaggle_public

Unnamed: 0_level_0,age,fare,family_size,is_child,is_travelling_alone,sex_male,embarked_Q,embarked_S,title_Miss,title_Mr,title_Mrs,"age_bin_(10.0, 20.0]","age_bin_(20.0, 30.0]","age_bin_(30.0, 40.0]","age_bin_(40.0, 50.0]","age_bin_(50.0, 60.0]","age_bin_(60.0, inf]"
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
892,0.324727,-0.497079,-0.553443,False,True,1,1,0,0,1,0,0,0,1,0,0,0
893,1.284291,-0.511942,0.105643,False,False,0,0,1,0,0,1,0,0,0,1,0,0
894,2.435767,-0.463770,-0.553443,False,True,1,1,0,0,1,0,0,0,0,0,0,1
895,-0.251011,-0.482143,-0.553443,False,True,1,0,1,0,1,0,0,1,0,0,0,0
896,-0.634836,-0.417167,0.764728,False,False,0,0,1,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,0.209580,-0.493121,-0.553443,False,True,1,0,1,0,1,0,0,0,1,0,0,0
1306,0.670170,1.314552,-0.553443,False,True,0,0,0,0,0,1,0,0,1,0,0,0
1307,0.631788,-0.507461,-0.553443,False,True,1,0,1,0,1,0,0,0,1,0,0,0
1308,0.209580,-0.493121,-0.553443,False,True,1,0,1,0,1,0,0,0,1,0,0,0


In [63]:
y_pred = (pd.Series(model.predict(X_test_kaggle_public), 
                   index=X.index, name='Survived').to_frame().sort_index()
         )

y_pred.index.names = ['PassengerId']
y_pred

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [64]:
y_submission = (pd.read_csv('../data/raw/gender_submission.csv')
                .set_index('PassengerId')
               )
y_submission

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [66]:
(y_pred.index == y_submission.index).all()
y_pred.index.names == y_submission.index.names
(y_pred.columns == y_submission.columns).all()

True

True

True

In [69]:
filename = 'logreg_model_1.csv'
y_pred.to_csv(filename)

In [70]:
y_pred_file = (pd.read_csv(filename)
                .set_index('PassengerId')
               )

In [71]:
(y_pred_file.index == y_submission.index).all()
y_pred_file.index.names == y_submission.index.names
(y_pred_file.columns == y_submission.columns).all()

True

True

True