In [24]:
# Initializing

# data processing. pandas as alias pd
import pandas as pd 
# linear algebra. numpy as alias np
import numpy as np 

# If you're working with a notebook, don't forget to use Matplotlib magic! 
%matplotlib inline
# matlab-style plotting
import matplotlib.pyplot as plt 
import seaborn as sns
color = sns.color_palette()
# Set the Seaborn theme if desired
sns.set_style("darkgrid")

# ignore sklearn & seaborn warnings
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn 

#for some statistics
from scipy import stats
from scipy.stats import norm, skew  #ex. sns(fit = norm)


# Format scientific notation from pandas aggregation
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000) 

In [25]:
df_dummies = pd.read_csv('data/all_data.csv')
df_no_dummies = pd.read_csv('data/all_data_no_dummies.csv') 

In [26]:
targets = pd.read_csv('data/y_train.csv') 

In [48]:
from sklearn.model_selection import GridSearchCV, train_test_split 

from sklearn import linear_model as lm 
from sklearn.ensemble import GradientBoostingRegressor as gbr, RandomForestRegressor as rfr

from sklearn.metrics import mean_squared_error  

#rmse error: 
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

### Split all_data into train and test: 

In [52]:
ntrain = targets.shape[0] 
x_d = df_dummies[:ntrain] 
x_nd = df_no_dummies[:ntrain] 
targets_log = targets.apply(np.log)


In [44]:
from sklearn.preprocessing import StandardScaler 

scaler = StandardScaler()
scaler.fit(x_d)
x_d_std = scaler.transform(x_d)


### Split x_ and x_nd + targets into train and test using train_test_split: 

In [58]:
x_d_train, x_d_test, y_d_train, y_d_test = train_test_split(x_d_std, targets_log, test_size = 0.2, random_state = 42) 
x_nd_train, x_nd_test, y_nd_train, y_nd_test = train_test_split(x_nd, targets_log, test_size = 0.2, random_state = 42)  


## MODELLING

### 1. Multiple Linear Regression: 

In [59]:
lm_model =lm.LinearRegression()
lm_model.fit(x_d_train, y_d_train)

#RMS error 
print("RMSE train: {}".format(rmse(y_d_train, lm_model.predict(x_d_train)))) 
print("RMSE test : {}".format(rmse(y_d_test,  lm_model.predict(x_d_test)))) 
print('R^2 score: {}'.format(lm_model.score(x_d_train, y_d_train)))




RMSE train: 0.17318243900299812
RMSE test : 15562705841228.213
R^2 score: 0.8014776983376941


### 2. Ridge/Lasso/Elastic Net : Penalised LR 

In [62]:
#1. ridge: 

grid_param = [{'alpha': np.logspace(-4, 4, 20)}]

gs = GridSearchCV(estimator=lm.Ridge(random_state=42), param_grid=grid_param, cv=5)

gs.fit(x_d_std, targets_log)

#cv_results_, grid_scores_ (to obsolete), best_estimator_, best_params_, best_score_
print('Best params: {}'.format(gs.best_params_))
print('Best score : {}'.format(gs.best_score_))
#print('')
model = gs.best_estimator_
print("RMSE train: {}".format(rmse(y_d_train, model.predict(x_d_train))))
print("RMSE test : {}".format(rmse(y_d_test,  model.predict(x_d_test))))
print('R^2 score: {}'.format(model.score(x_d_train, y_d_train))) 

Best params: {'alpha': 206.913808111479}
Best score : 0.7509540568316588
RMSE train: 0.1742420492610361
RMSE test : 0.18741398863428388
R^2 score: 0.7990409640247749


In [None]:
#2. Lasso: 

