# Linear Regression OLS

In [5]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

df = pd.read_csv(r"data/df_final.csv") 
df.head()

X = df.drop("SalePrice", axis = 1)
y = df.SalePrice

XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state = 0, test_size = 0.2)

reg = LinearRegression()
reg.fit(XTrain, yTrain)

score = reg.score(XTest, yTest)
print('r2 Score:', score)
yPred = reg.predict(XTest)
rmse = mean_squared_error(yTest, yPred, squared=False)
print('Root Mean Squared Error:',  rmse)

r2 Score: -1.6072620141061802e+19
Root Mean Squared Error: 269348542045818.62


## Running Grid Search

In [6]:
from sklearn.model_selection import GridSearchCV

reg_gridSearch = LinearRegression()

fit_intercept = [True, False]
normalize = [True, False]
positive = [True, False]

parameter_space = {
    'fit_intercept' : fit_intercept,
    'normalize' : normalize,
    'positive' : positive
}

gs = GridSearchCV(reg_gridSearch, param_grid=parameter_space, scoring='neg_root_mean_squared_error', refit=True, 
                                  return_train_score=True, cv=5)

gs.fit(X, y)
print('Optimal parameters:', gs.best_params_)
print('neg_root_mean_squared_error with above optimal parameters:', gs.best_score_)

Optimal parameters: {'fit_intercept': False, 'normalize': True, 'positive': False}
neg_root_mean_squared_error with above optimal parameters: -334195701450967.06


## Running with Optimal Parameters

In [7]:
reg_optimal = LinearRegression(fit_intercept = True, normalize = False, positive = False)
reg_optimal.fit(XTrain, yTrain)

scoreOptimal = reg_optimal.score(XTest, yTest)
print("r2 Score(test): ", str(scoreOptimal))
yPredOptimal = reg_optimal.predict(XTest)
rmse = mean_squared_error(yTest, yPredOptimal, squared=False)
print('Root Mean Squared Error(test):', rmse)

r2 Score(test):  -1.6072620141061802e+19
Root Mean Squared Error(test): 269348542045818.62


## Cross Validation

In [8]:
import numpy as np
from sklearn.model_selection import cross_validate

scoring= ['r2', 'neg_root_mean_squared_error',]

scores = cross_validate(reg_optimal, X, y, scoring=scoring, 
                                        cv=10, return_train_score=True)

print('Mean of r2(train):', np.mean(scores['train_r2']))
print('Mean of r2(test):', np.mean(scores['test_r2']))

print('Mean of RMSE(train):', -(np.mean(scores['train_neg_root_mean_squared_error'])))
print('Mean of RMSE(test):', -(np.mean(scores['test_neg_root_mean_squared_error'])))

Mean of r2(train): 0.9549061513871878
Mean of r2(test): -2.400254881161357e+22
Mean of RMSE(train): 14704.540107790366
Mean of RMSE(test): 3983610033102375.0
