# First Model 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder

### I will use a LinearReggresion model because our target variable is numerical

In [2]:
eda_train = pd.read_csv('../datasets/feat_train.csv')
eda_test = pd.read_csv('../datasets/feat_test.csv')

In [3]:
eda_train.shape

(2051, 106)

In [4]:
eda_test.shape

(878, 105)

### Create our features matrix (X) and target vector (y)

In [5]:
features = ['lot_area', 'overall_qual', 'overall_cond', 'year_built', 'year_remod/add', 
            'exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'heating_qc', 'kitchen_qual', 'garage_area', 
           'totrms_abvgrd', 'full_bath', '1st_flr_sf', 'gr_liv_area', 'garage_cars', 'land_contour', 
            'bldg_type', 'house_style']

In [6]:
X = eda_train[features]
y = eda_train['saleprice']

### Train/test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Baseline saleprice for target variable

In [8]:
print(np.exp(y_train.mean()))

166512.566580161


In [9]:
print(f'Train Shape: {X_train.shape, y_train.shape}')
print(f'Test shape: {X_test.shape, y_test.shape}')

Train Shape: ((1538, 20), (1538,))
Test shape: ((513, 20), (513,))


In [10]:
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train) 

X_test_sc = sc.transform(X_test)

In [11]:
lr = LinearRegression()

In [12]:
cross_val_score(lr, X_train, y_train).mean()

0.8732708437957738

In [13]:
lr.fit(X_train_sc, y_train)

### Based on the close relationship bewtween our R2 scores we have a well built model

In [14]:
# Our Test data only underperformed by 2% meaning our model is not overfit. 

In [15]:
print(f'Training R2: {lr.score(X_train_sc, y_train)}')
print(f'Test R2: {lr.score(X_test_sc, y_test)}')

Training R2: 0.8790322348655626
Test R2: 0.8688589741575223


In [16]:
preds = lr.predict(X_test_sc)
preds = np.exp(preds)
print(f'MSE: {mean_squared_error(y_test, preds)}')
print(f'RMSE: {mean_squared_error(y_test, preds, squared = False)}')

MSE: 38059805298.22502
RMSE: 195089.2239418288


In [17]:
test_sc = sc.transform(eda_test[features])
new_preds = np.exp(lr.predict(test_sc))
new_preds

array([ 134173.36769054,  176510.70921474,  209806.59518478,
        113513.18926429,  159922.92533462,   98101.59965548,
        115529.79790211,  163521.7609649 ,  191043.68991417,
        157946.93579486,  161834.6063349 ,  129901.84357137,
        155748.98352656,  279721.86644681,  158144.6127224 ,
        121701.40740683,  133167.28670726,  115157.0728376 ,
        190807.07320922,  206378.39893015,  151411.37243675,
        120662.71736283,  200824.05023862,  160963.54488701,
        209545.92382111,  119502.91421412,  119716.38374101,
        118160.48325668,  159901.08973858,   67829.05270143,
        106085.61669419,   99144.40655688,  228116.88365362,
        155607.42537559,  221218.18658878,  189499.15198167,
        108180.36542032,   96089.26718427,  134150.59148853,
        199779.73278643,  153662.74983416,  211685.97271081,
        148912.01745753,  162940.94463555,  202438.74129091,
        104810.69498814,  216545.65121583,  112221.54313501,
        122613.78374667,

In [18]:
eda_test['saleprice'] = new_preds 

In [19]:
preds = preds.round(4)

In [20]:
coefs = pd.DataFrame(data = lr.coef_, index = X.columns, columns = ['Coefs'])

In [21]:
coefs.sort_values(by = 'Coefs', ascending=False)


Unnamed: 0,Coefs
gr_liv_area,0.122022
overall_qual,0.102136
year_built,0.08833
1st_flr_sf,0.074259
overall_cond,0.057246
garage_cars,0.033304
lot_area,0.030575
bsmt_qual,0.028556
kitchen_qual,0.023362
heating_qc,0.01814


In [22]:
coefs.to_csv('../datasets/coefs_lr.csv', index=False)