### Importing libraries

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

%matplotlib inline

### Loading the training data

In [9]:
# We only want to work with the training data in this notebook.
train = np.load('../Models/train.npy')
# Every column except the last goes into X; the last column becomes y.
X, y = train[:, :-1], train[:, -1]

In [10]:
# Sanity-check the dimensions of the feature matrix and the target vector.
# A single pre-formatted argument is used so the line parses on Python 3
# and still prints exactly the same text on Python 2 (where
# `print "X: ", X.shape` emitted two spaces before the shape).
print("X:  {0}".format(X.shape))
print("y:  {0}".format(y.shape))

X:  (3184, 49)
y:  (3184,)


### Train-Test Split (loaded from previously saved arrays)

In [31]:
# Read the four split arrays from ../Models; features first, then targets.
X_train, X_test = np.load("../Models/X_train.npy"), np.load("../Models/X_test.npy")
y_train, y_test = np.load("../Models/y_train.npy"), np.load("../Models/y_test.npy")

In [33]:
# Report the shape of each split array. Calling print(...) with exactly one
# argument behaves identically on Python 2 and Python 3, whereas the bare
# print statement is a SyntaxError on Python 3.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2547, 49)
(637, 49)
(2547,)
(637,)


In [34]:
def root_mean_squared(act_y, pred_y):
    """Return the root mean squared error (RMSE) of pred_y against act_y.

    Computed as the square root of mean_squared_error(act_y, pred_y).
    """
    return np.sqrt(mean_squared_error(act_y, pred_y))

def absolute_error(act_y, pred_y):
    """Return the mean absolute error (MAE) of pred_y against act_y.

    Thin wrapper that delegates to mean_absolute_error(act_y, pred_y).
    """
    return mean_absolute_error(act_y, pred_y)

In [35]:
# Wrap both error metrics as scorer objects. They are errors (lower is
# better), hence greater_is_better=False for each of them.
rmse_score_function, mae_score_function = (
    make_scorer(metric, greater_is_better=False)
    for metric in (root_mean_squared, absolute_error)
)

## Build Baseline Model 

In [36]:
# Baseline estimator: LinearRegression constructed with no arguments, i.e.
# whatever defaults the installed scikit-learn version provides.
baseline_linreg = LinearRegression()

###  Cross Validation with Linear Regression

In [37]:
# 10-fold splitter. No shuffle/random_state argument is passed, so KFold's
# default fold ordering applies.
kf = KFold(n_splits=10)

In [39]:
# Manual cross-validation of the baseline model on the training split.
# Despite their names, these two lists collect one score per fold; the
# averaging only happens in the final report below.
rmse_average = []
mae_average = []
for train_index, test_index in kf.split(X_train):
    # Carve this fold's fitting subset and held-out subset out of X_train/y_train.
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    Y_train_cv, Y_test_cv = y_train[train_index], y_train[test_index]

    # The same estimator object is refit on every iteration.
    baseline_linreg.fit(X_train_cv, Y_train_cv)

    # Predict on the held-out part of the fold and score those predictions.
    pred = baseline_linreg.predict(X_test_cv)

    rmse = root_mean_squared(Y_test_cv, pred)
    mae = absolute_error(Y_test_cv, pred)

    rmse_average.append(rmse)
    mae_average.append(mae)

# Mean and standard deviation of the per-fold scores. print(...) is called
# with a single pre-formatted string so the line is valid on Python 3 and
# prints the same text as the former Python 2 print statement.
print(
    "Average RMSE: {0} +/- {1}, Average MAE: {2} +/- {3}".format(
        np.mean(rmse_average),
        np.std(rmse_average),
        np.mean(mae_average),
        np.std(mae_average)
    )
)

Average RMSE: 4.6584594516 +/- 0.208704544705, Average MAE: 3.62468549038 +/- 0.195646252535


##  Baseline Model Scores
 * Average CV RMSE: 4.658 (+/- 0.209 across the 10 folds)
 * Average CV MAE: 3.625 (+/- 0.196 across the 10 folds)
 