# Logistic Regression with basis expansions

In [33]:
import os
# NOTE(review): hard-coded, machine-specific working directory - presumably needed so
# that the project-local modules imported below can be found; confirm before sharing.
# Raw string so every backslash is taken literally (without the r prefix Python 3
# rejects '\U' as an invalid unicode escape).
os.chdir(r'C:\Users\Lundi\Documents\Programming\Python\Kaggle\Titanic - 2015')

import TitanicPreprocessor as tp
import TitanicPredictor as tpred
import sklearn.ensemble as skl_ensemble
import sklearn.cross_validation as skl_cv
import sklearn.preprocessing as skl_pre
import sklearn.linear_model as skl_lm
import sklearn.grid_search as skl_gs
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data through the project-local TitanicPreprocessor helper. X and y are used
# below as the training features and labels; X_test / X_test_ids are presumably the
# held-out test features and their passenger ids - TODO confirm against the helper.
X, y, X_test, X_test_ids = tp.getData()

## Scaling data

In [18]:
# Preview the first two rows of the feature frame before any scaling is applied.
X.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S
0,3,22,1,0,7.25,1,0,1
1,1,38,1,0,71.2833,0,0,0


In [17]:
# Standardise the two continuous columns; every other column is kept as loaded.
numeric_cols = ['Age', 'Fare']
scaler = skl_pre.StandardScaler()
scaler.fit(X[numeric_cols])

# Work on a copy so the raw features in X stay available for comparison.
X_scaled = X.copy()
# Re-wrap the transformed array with X's own index: assigning a DataFrame aligns on
# index labels, so a default 0..n-1 index would yield NaN for any non-matching row.
X_scaled[numeric_cols] = pd.DataFrame(
    scaler.transform(X[numeric_cols]), columns=numeric_cols, index=X.index)
X_scaled.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S
0,3,-0.549963,1,0,-0.502445,1,0,1
1,1,0.63296,1,0,0.786845,0,0,0


#### Fitting the model

In [31]:
# Mean 7-fold cross-validated score of a default logistic regression on the raw
# features versus the standardised ones.
# NOTE(review): X_scaled was standardised with statistics fitted on all rows of X,
# i.e. before the cross-validation split.
lr_clf = skl_lm.LogisticRegression()
unscaled_score = skl_cv.cross_val_score(lr_clf, X, y, cv=7)
scaled_score = skl_cv.cross_val_score(lr_clf, X_scaled, y, cv=7)
print('Unscaled Scores: %s' % np.mean(unscaled_score))
print('Scaled Scores: %s' % np.mean(scaled_score))

Unscaled Scores: 0.800237721401
Scaled Scores: 0.804719722535


Scaling `Age` and `Fare` improves the mean 7-fold cross-validation score slightly (0.8047 vs. 0.8002).

In [32]:
# Fit the Age/Fare standardiser on the training features; scaleData (defined next)
# reads this module-level object.
scaler = skl_pre.StandardScaler().fit(X[['Age','Fare']])

def scaleData(X_to_scale):
    """Return a copy of ``X_to_scale`` with 'Age' and 'Fare' standardised.

    Uses the module-level ``scaler`` (which must already be fitted), so every frame
    passed in is scaled with the same statistics.

    Parameters
    ----------
    X_to_scale : pandas.DataFrame
        Frame containing at least the 'Age' and 'Fare' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the two columns replaced by their scaled values.
    """
    numeric_cols = ['Age', 'Fare']
    X_scaled = X_to_scale.copy()
    # Transform the frame that was passed in (not the global training frame X) and
    # keep its index so the column assignment lines up row for row.
    X_scaled[numeric_cols] = pd.DataFrame(
        scaler.transform(X_to_scale[numeric_cols]),
        columns=numeric_cols,
        index=X_to_scale.index)
    return X_scaled

## Polynomial basis expansion

In [75]:
# Degree-2 expansion of all features (original columns, squares and pairwise
# products) with the constant column switched off.
poly_transform = skl_pre.PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_transform.fit(X).transform(X)
pd.DataFrame(X_poly)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,3,22.000000,1,0,7.2500,1,0,1,9,66.000000,...,52.562500,7.2500,0.0000,7.2500,1,0,1,0,0,1
1,1,38.000000,1,0,71.2833,0,0,0,1,38.000000,...,5081.308859,0.0000,0.0000,0.0000,0,0,0,0,0,0
2,3,26.000000,0,0,7.9250,0,0,1,9,78.000000,...,62.805625,0.0000,0.0000,7.9250,0,0,0,0,0,1
3,1,35.000000,1,0,53.1000,0,0,1,1,35.000000,...,2819.610000,0.0000,0.0000,53.1000,0,0,0,0,0,1
4,3,35.000000,0,0,8.0500,1,0,1,9,105.000000,...,64.802500,8.0500,0.0000,8.0500,1,0,1,0,0,1
5,3,31.275746,0,0,8.4583,1,1,0,9,93.827239,...,71.542839,8.4583,8.4583,0.0000,1,1,0,1,0,0
6,1,54.000000,0,0,51.8625,1,0,1,1,54.000000,...,2689.718906,51.8625,0.0000,51.8625,1,0,1,0,0,1
7,3,2.000000,3,1,21.0750,1,0,1,9,6.000000,...,444.155625,21.0750,0.0000,21.0750,1,0,1,0,0,1
8,3,27.000000,0,2,11.1333,0,0,1,9,81.000000,...,123.950369,0.0000,0.0000,11.1333,0,0,0,0,0,1
9,2,14.000000,1,0,30.0708,0,0,0,4,28.000000,...,904.253013,0.0000,0.0000,0.0000,0,0,0,0,0,0


In [76]:
# Exponent matrix of the expansion: one row per generated column, one entry per
# input feature (the exponent that feature carries in that column).
poly_transform.powers_

array([[1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [2, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1],
       [0, 2, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 2, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 2, 0, 0, 0, 0],
       [0,

#### Filtering out interaction terms

Keep only the pure squared terms (a single feature raised to the power 2), then select `Age^2` and `Fare^2`:

In [78]:
# Each row of powers_ holds the exponents of one generated column. In a degree-2
# expansion a pure square is exactly a row that contains a 2; interaction columns
# and the original columns only contain 0s and 1s. Testing for the 2 directly works
# for any number of input features, unlike a threshold on the row's std.
is_pure_square = poly_transform.powers_.max(axis=1) == 2

# Keep X's index so the squares stay aligned with the original rows when merged.
X_poly_2 = pd.DataFrame(X_poly, index=X.index).loc[:, is_pure_square]
# The squares come out in the same order as the input features.
X_poly_2.columns = [name + '^2' for name in X.columns]
# Only the continuous features are kept as squared terms.
X_poly_2 = X_poly_2[['Age^2', 'Fare^2']]
X_poly_2

Unnamed: 0,Age^2,Fare^2
0,484.000000,52.562500
1,1444.000000,5081.308859
2,676.000000,62.805625
3,1225.000000,2819.610000
4,1225.000000,64.802500
5,978.172318,71.542839
6,2916.000000,2689.718906
7,4.000000,444.155625
8,729.000000,123.950369
9,196.000000,904.253013


Merging these powers with the original data:

In [70]:
# Place the squared columns next to the original features (column-wise concat,
# rows matched on the index labels of the two frames).
X_poly_full = pd.concat([X, X_poly_2], axis=1)
X_poly_full.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S,Age^2,Fare^2
0,3,22,1,0,7.25,1,0,1,484,52.5625
1,1,38,1,0,71.2833,0,0,0,1444,5081.308859


#### Scaling the polynomial data

In [81]:
# Standardise the continuous columns together with their squares; the other columns
# are copied over unchanged.
poly_numeric_cols = ['Age', 'Fare', 'Age^2', 'Fare^2']
X_poly_scaled = X_poly_full.copy()
# Keep the source index on the scaled values: assigning a DataFrame aligns on index
# labels, so a default 0..n-1 index would yield NaN for any non-matching row.
X_poly_scaled[poly_numeric_cols] = pd.DataFrame(
    skl_pre.scale(X_poly_full[poly_numeric_cols]),
    columns=poly_numeric_cols,
    index=X_poly_full.index)
X_poly_scaled.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S,Age^2,Fare^2
0,3,-0.549963,1,0,-0.502445,1,0,1,-0.621561,-0.199305
1,1,0.63296,1,0,0.786845,0,0,0,0.433451,0.091101


#### Fitting the data with polynomials

In [82]:
# Same 7-fold comparison as before, now including the scaled features that carry
# the Age^2 / Fare^2 columns.
lr_clf = skl_lm.LogisticRegression()
unscaled_score = skl_cv.cross_val_score(lr_clf, X, y, cv=7)
scaled_score = skl_cv.cross_val_score(lr_clf, X_scaled, y, cv=7)
scaled_poly_score = skl_cv.cross_val_score(lr_clf, X_poly_scaled, y, cv=7)
print('Unscaled Scores: %s' % np.mean(unscaled_score))
print('Scaled Scores: %s' % np.mean(scaled_score))
print('Scaled Poly Scores: %s' % np.mean(scaled_poly_score))

Unscaled Scores: 0.800237721401
Scaled Scores: 0.804719722535
Scaled Poly Scores: 0.802496507133


## Interaction Terms

In [89]:
# Expand the raw (unscaled) features with interaction_only=True, i.e. without the
# pure power terms, and wrap the result in a DataFrame.
# NOTE(review): unlike the degree-2 expansion earlier in the notebook, include_bias
# is not set to False here, so the library default applies - confirm whether a
# constant column is intended in X_interactions.
interaction_transform = skl_pre.PolynomialFeatures(interaction_only=True)
X_interactions = pd.DataFrame(interaction_transform.fit_transform(X))

#### Fitting to logistic regression

In [90]:
# Re-run the three earlier 7-fold evaluations and add the interaction-term features,
# reusing the lr_clf estimator created in an earlier cell.
unscaled_score = skl_cv.cross_val_score(lr_clf, X, y, cv=7)
scaled_score = skl_cv.cross_val_score(lr_clf, X_scaled, y, cv=7)
scaled_poly_score = skl_cv.cross_val_score(lr_clf, X_poly_scaled, y, cv=7)
unscaled_interaction_score = skl_cv.cross_val_score(lr_clf, X_interactions, y, cv=7)

# One line per feature set: label followed by the mean of the fold scores.
print('Unscaled Scores: %s' % np.mean(unscaled_score))
print('Scaled Scores: %s' % np.mean(scaled_score))
print('Scaled Poly Scores: %s' % np.mean(scaled_poly_score))
print('Unscaled Interactions Scores: %s' % np.mean(unscaled_interaction_score))

Unscaled Scores: 0.800237721401
Scaled Scores: 0.804719722535
Scaled Poly Scores: 0.802496507133
Unscaled Interactions Scores: 0.814826160123


Adding the interaction terms gives the best mean cross-validation score so far (0.8148 vs. 0.8002 for the plain unscaled features).

## Conclusions

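1. Scaling improves prediction slightly (+0.45 percentage points: 0.8047 vs. 0.8002 unscaled)
2. Polynomial (degree = 2) terms for Age and Fare do not improve predictions over the scaled features (0.8025 vs. 0.8047)
3. Using interaction terms improves prediction (+1.5 percentage points over the unscaled baseline: 0.8148 vs. 0.8002)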