In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import sklearn
%matplotlib inline

In [19]:
# Read the training CSV into `df`; the training-side cells below all build on this frame.
df = pd.read_csv(filepath_or_buffer='./datasets/train.csv')

In [20]:
# Engineered feature: row-wise total of eight area columns.
# skipna=False keeps the semantics of chained `+`: a missing value in any of the
# eight columns leaves 'Total SF' missing for that row.
# NOTE(review): 'Gr Liv Area' and 'Total Bsmt SF' may already contain some of the
# other terms, which would double-count them -- confirm against the data dictionary.
area_columns = [
    '1st Flr SF', 'BsmtFin SF 1', '2nd Flr SF', 'Gr Liv Area',
    'Garage Area', 'Total Bsmt SF', 'Mas Vnr Area', 'Bsmt Unf SF',
]
df['Total SF'] = df[area_columns].sum(axis=1, skipna=False)

In [21]:
# One indicator column per distinct value of the training 'Neighborhood' column.
dummies = pd.get_dummies(data=df['Neighborhood'])

In [22]:
# Place the neighborhood indicator columns to the right of the original columns.
df_all = pd.concat(objs=[df, dummies], axis='columns')

In [23]:
# Sanity check: (rows, columns) of the frame after adding 'Total SF' and the indicator columns.
df_all.shape

(2051, 110)

In [24]:
# Raise the row display limit so the long correlation listing below is shown in full
# (up to 4000 rows). This is a global pandas option and affects every later cell.
pd.set_option('display.max_rows', 4000)

In [25]:
# Pairwise correlations between the numeric columns and the boolean/integer
# neighborhood indicators.
# `df_all` still contains text columns (e.g. 'Neighborhood'); recent pandas versions
# raise on those inside corr() instead of silently dropping them, so restrict the
# frame to numeric/bool dtypes explicitly. This works on old and new pandas alike.
correl = df_all.select_dtypes(include=['number', 'bool']).corr()

# Rank every column by its correlation with the target, strongest positive first.
correl['SalePrice'].sort_values(ascending=False)

SalePrice          1.000000
Total SF           0.810161
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
Garage Yr Blt      0.533922
Mas Vnr Area       0.512230
TotRms AbvGrd      0.504014
Fireplaces         0.471093
NridgHt            0.448647
BsmtFin SF 1       0.423519
Lot Frontage       0.341842
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Area           0.296566
Bsmt Full Bath     0.283662
Half Bath          0.283001
NoRidge            0.263395
StoneBr            0.256977
2nd Flr SF         0.248452
Bsmt Unf SF        0.190210
Somerst            0.150078
Bedroom AbvGr      0.137067
Screen Porch       0.134581
Timber             0.116400
Veenker            0.083186
CollgCr            0.082309
Crawfor            0.058386
ClearCr            0.052503
3Ssn Porch         0

In [69]:
# Hand-picked model features, chosen from the correlation ranking above.
# The order here is the column order of every feature matrix built below.
finalpoke = [
    'Overall Qual',
    'Total SF',
    'Year Built',
    'Year Remod/Add',
    'Garage Yr Blt',
    'Full Bath',
    'TotRms AbvGrd',
    'Fireplaces',
    'NridgHt',
    'Lot Frontage',
    'Open Porch SF',
    'Lot Area',
]

In [70]:
# Training feature matrix: only the selected columns, in `finalpoke` order.
finalize = df_all.loc[:, finalpoke]

In [71]:
# Replace every missing value with the mean of its own column.
# NOTE(review): the means are taken over all training rows, before the
# train/validation split further down.
column_means = finalize.mean()
finalize = finalize.fillna(column_means)

In [72]:
# Regression target: the 'SalePrice' column of the training frame.
target = df_all.loc[:, 'SalePrice']

In [73]:
# Linear regression estimator constructed with no arguments, i.e. library defaults.
lr = LinearRegression()

In [74]:
# Fit on every training row (no hold-out yet); used below for an in-sample RMSE.
lr.fit(X=finalize, y=target)

LinearRegression()

In [75]:
# In-sample predictions: the same rows the model was just fitted on.
preds = lr.predict(X=finalize)

In [76]:
# Root-mean-squared error of the in-sample predictions (same units as 'SalePrice').
# Computed on the rows used for fitting, so it is not an estimate of held-out error.
mse = mean_squared_error(y_true=target, y_pred=preds)
rmse = np.sqrt(mse)
rmse

34968.423303859716

In [77]:
# Conventional aliases for the feature matrix and the target.
X, y = finalize, target

In [78]:
# Hold out part of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [79]:
# Second model, fitted only on the training portion of the split.
# fit() returns the estimator itself, so binding the result keeps the cell output empty.
newlr = LinearRegression().fit(X_train, y_train)

In [80]:
# Model score (R^2 for scikit-learn regressors) on the rows it was trained on.
newlr.score(X=X_train, y=y_train)

0.7920848664249863

In [81]:
# Same score on the held-out rows, for comparison with the training score above.
newlr.score(X=X_test, y=y_test)

0.834526016264211

In [82]:
# 5-fold cross-validation scores computed on the training portion only.
# NOTE(review): this passes `lr` (already fitted on all rows above) rather than `newlr`;
# presumably cross_val_score refits a fresh copy per fold, so the scores are unaffected -- confirm.
cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5)

array([0.80818825, 0.80542045, 0.82211936, 0.77554294, 0.60292245])

In [83]:
# Fitted coefficients of the split-trained model, one per feature,
# in the column order of `finalpoke`.
newlr.coef_

array([ 1.99357830e+04,  1.22826102e+01,  2.38468291e+02,  3.43944453e+02,
        7.35356505e+01,  1.41539040e+02,  3.53915305e+03,  1.12093246e+04,
        4.30122185e+04, -2.90752978e+01,  3.68743437e+00,  1.56266393e+00])

In [84]:
# Read the Kaggle test CSV; these rows have no target and are only scored.
test = pd.read_csv(filepath_or_buffer='./datasets/test.csv')

In [85]:
# Same engineered feature as on the training frame: row-wise total of eight area columns.
# skipna=False keeps the semantics of chained `+`: a missing value in any of the
# eight columns leaves 'Total SF' missing for that row.
test_area_columns = [
    '1st Flr SF', 'BsmtFin SF 1', '2nd Flr SF', 'Gr Liv Area',
    'Garage Area', 'Total Bsmt SF', 'Mas Vnr Area', 'Bsmt Unf SF',
]
test['Total SF'] = test[test_area_columns].sum(axis=1, skipna=False)

In [86]:
# Indicator columns for the test file's 'Neighborhood' values.
# This rebinds `dummies`, which previously held the training indicators.
dummies = pd.get_dummies(data=test['Neighborhood'])

In [87]:
# Place the test neighborhood indicators to the right of the original test columns.
test_df = pd.concat(objs=[test, dummies], axis='columns')

In [88]:
# Keep only the model's features, in the same order that was used for training.
# NOTE(review): the indicator columns come from the test file alone; if a neighborhood
# named in `finalpoke` never occurs in it, this selection raises KeyError.
test_df = test_df.loc[:, finalpoke]

In [90]:
# Impute missing test values with the TRAINING column means.
# The model was fitted on features imputed with training-set means, so the same
# fill values must be used at prediction time. Means computed from the test file
# would shift with that file's composition and would be NaN for any column that
# is entirely missing there.
# `finalize` has already been mean-imputed, which leaves its column means unchanged,
# so `finalize.mean()` is the training mean of each feature.
train_means = finalize.mean()
test_df = test_df.fillna(train_means)

In [91]:
# Score the Kaggle rows with the split-trained model (`newlr`).
X_kaggle = test_df
kaggle_predictions = newlr.predict(X=X_kaggle)

In [92]:
# Two-column submission frame: the test file's 'Id' next to the predicted price.
# The row index comes from test['Id'], and the predictions are matched by position.
kaggle_submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': kaggle_predictions,
})

In [93]:
# Preview the first five submission rows.
kaggle_submission.head(5)

Unnamed: 0,Id,SalePrice
0,2658,155273.832467
1,2718,195584.304777
2,2414,221407.910019
3,1989,119939.022558
4,625,188555.811458


In [94]:
# Row counts of the submission and the test file should match (one prediction per test row).
(kaggle_submission.shape, test.shape)

((878, 2), (878, 81))

In [95]:
# Write the submission file; index=False keeps the row index out of the CSV.
kaggle_submission.to_csv(
    path_or_buf='./datasets/kaggle_submission.csv',
    index=False,
)