In [1]:
%load_ext autoreload
%autoreload 2

## Objective

Does standardization, and then unstandardizing it, lead to same coefs as without?

In [2]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline

warnings.filterwarnings("ignore", category=DeprecationWarning)
sns.set_style("whitegrid")
sns.set_context("poster")
rcParams['figure.figsize'] = 20, 5

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
df_dummified = pd.DataFrame(
    {'NUM_ISSUES_PER_1000_POP': [1, 4, 6, 10, 15],
    'x1': [2, 8, 5, 25, 33]}
)

## Running model

In [4]:
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer




In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df_dummified.drop(['NUM_ISSUES_PER_1000_POP'], axis=1), 
    df_dummified.NUM_ISSUES_PER_1000_POP, 
    test_size=0.2, 
    random_state=500
)

In [6]:
pipe = make_pipeline(StandardScaler(), LinearRegression())

In [7]:
cv = ShuffleSplit(X_train.shape[0], n_iter=1, test_size=0.2, random_state=300)

In [8]:
params = {}
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=cv)
model.fit(X_train, y_train);

### Coef values?

In [9]:
coef_values = pd.DataFrame({
    'name': X_train.columns,
    'coef': model.best_estimator_.steps[-1][-1].coef_
})

coef_values['abs_coef'] = pd.np.abs(coef_values.coef)

In [19]:
# I'm assuming the column names and the values line up correctly
coef_values['coef_unstandardized'] = coef_values2.coef / model.best_estimator_.steps[0][-1].var_**0.5

In [20]:
coef_values

Unnamed: 0,coef,name,abs_coef,coef_unstandardized
0,3.987,x1,3.987,0.029


## And then if I don't standardize?

In [16]:
pipe2 = make_pipeline(LinearRegression())

In [17]:
model2 = GridSearchCV(pipe2, param_grid=params, n_jobs=-1, cv=cv)
model2.fit(X_train, y_train)

coef_values2 = pd.DataFrame({
    'name': X_train.columns,
    'coef': model2.best_estimator_.steps[-1][-1].coef_
})

coef_values2['abs_coef'] = pd.np.abs(coef_values2.coef)

In [18]:
coef_values2[coef_values2.coef != 0].sort_values('abs_coef', ascending=False)

Unnamed: 0,coef,name,abs_coef
0,0.342,x1,0.342


## Conclusion

Our baseline model's performance seems pretty bad, with $R^2=0.28$.

## Testing std scaler by itself

In [62]:
l = np.array([1,2,2,9,5,20,9,100]).reshape(4,2)
l

array([[  1,   2],
       [  2,   9],
       [  5,  20],
       [  9, 100]])

In [45]:
l.mean()

4.25

In [46]:
l.std()

3.1124748994971831

In [50]:
((l - l.mean())**2).mean()**0.5

3.1124748994971831

In [55]:
aa = (l - l.mean()) / l.std()
aa

array([-1.04418513, -0.7228974 ,  0.2409658 ,  1.52611672])

In [57]:
aa * l.std() + l.mean()

array([ 1.,  2.,  5.,  9.])

In [53]:
s = StandardScaler()

In [63]:
s.fit_transform(l)

array([[-1.04418513, -0.78138168],
       [-0.7228974 , -0.60350617],
       [ 0.2409658 , -0.32398753],
       [ 1.52611672,  1.70887538]])

In [65]:
s.inverse_transform(_63)

array([[   1.,    2.],
       [   2.,    9.],
       [   5.,   20.],
       [   9.,  100.]])

In [70]:
s.inverse_transform(np.ones((1,2)))

array([[  7.3624749 ,  72.10336707]])

In [71]:
3.11 + 4.25

7.359999999999999