In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import time



import re

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR

import pickle

In [2]:
def dummify(df, non_dummies, dummies):
    for dummified in dummies:
        for original in non_dummies:
            if original in dummified:
                orig_name = f'{original}_'
                value = dummified.replace(orig_name, '')
                df[dummified] = df[original].map(lambda x: 1 if x == value else 0)
    df=df.drop(non_dummies,axis=1)
    return df

In [3]:
pd.set_option('display.max_rows', 500)

In [4]:
df = pd.read_csv('Ames_HousePrice.csv', index_col=0)

In [5]:
# drop outliers
df = df[(df['PID'] != 902207130) & (df['PID'] != 908154205)]
# we do not have address/location data for these houses, excluding
pid_drop_list = [
    905450020,
    902477120,
    531477050,
    916403040,
    916252170,
    916253320,
    902401130,
    902205020,
    907230240,
    916477060,
    912251110,
    902103150,
    911175360,
    908154040,
    909129100,
    904101170,
    923125030,
    902205010,
    902401120,
    535300120,
    535426150
]

mask = df['PID'].map(lambda x: False if x in pid_drop_list else True)
df = df[mask]
# excluding abnormal sale conditions like sale between family members or foreclosures
df = df[(df['SaleCondition'] == 'Normal') | (df['SaleCondition'] == 'Partial')]
# excluding houses with bedrooms above ground = 0, we suspect these houses are basements being sold as separate units
# they all have roughly equal total living area
df = df[df['BedroomAbvGr'] != 0]
# excluding properties with nonresidential zoning, we want to focus on residential properties
df = df[df['MSZoning_com'] != 'Nonresidential']
df.reset_index(drop=True, inplace = True)

In [6]:
#separate price from table
price=df['SalePrice']
price_log = np.log10(price)
pid = df['PID']


In [7]:
pd.isnull(df).sum()

PID                            0
GrLivArea                      0
SalePrice                      0
MSSubClass                     0
maybe_MSZoning                 0
LotFrontage                  444
LotArea                        0
Street_paved                   0
Alley                          0
LotShape                       0
LandContour                    0
Utilities                      0
LandSlope                      0
Neighborhood                   0
BldgType                       0
HouseStyle                     0
OverallQual                    0
OverallCond                    0
YearBuilt                      0
YearRemodAdd                   0
RoofStyle                      0
RoofMatl                       0
Exterior1st                    0
Exterior2nd                    0
MasVnrType                     0
MasVnrArea                    14
ExterQual                      0
ExterCond                      0
Foundation                     0
BsmtExposure                   0
BsmtFinTyp

In [8]:
'''Looking at the average sale price by month and year sold'''
avgsaleprice = df.groupby(['MoSold', 'YrSold']).mean()['SalePrice']
print(avgsaleprice)

MoSold  YrSold
1       2006      193051.666667
        2007      207887.576923
        2008      181573.736842
        2009      188005.812500
        2010      206802.333333
2       2006      186408.250000
        2007      166785.347826
        2008      195405.666667
        2009      165239.291667
        2010      173094.000000
3       2006      177161.846154
        2007      175899.288889
        2008      167686.516129
        2009      184511.538462
        2010      180846.822222
4       2006      166571.804878
        2007      160935.121951
        2008      161730.150943
        2009      188629.069767
        2010      169053.550000
5       2006      169070.000000
        2007      172710.237500
        2008      173673.733333
        2009      177906.423729
        2010      177211.041096
6       2006      174147.500000
        2007      183765.393617
        2008      184502.836735
        2009      190600.531915
        2010      175968.178082
7       2006      173504.

In [9]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

newdf = df.select_dtypes(include=numerics)

In [10]:
newdf['LotFrontage'] = newdf['LotFrontage'].fillna(0)
newdf['BsmtFullBath'] = newdf['BsmtFullBath'].fillna(0)
newdf['MasVnrArea'] = newdf['MasVnrArea'].fillna(0)
newdf['BsmtHalfBath'] = newdf['BsmtHalfBath'].fillna(0)
newdf['GarageYrBlt'] = newdf['GarageYrBlt'].fillna(0)
newdf['LotFrontage_log'] = newdf['LotFrontage_log'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['LotFrontage'] = newdf['LotFrontage'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['BsmtFullBath'] = newdf['BsmtFullBath'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['MasVnrArea'] = newdf['MasVnrArea'].fillna(0)
A value is trying to be set on a copy of a 

In [11]:
pd.isnull(newdf
         ).sum()

PID                         0
GrLivArea                   0
SalePrice                   0
MSSubClass                  0
LotFrontage                 0
LotArea                     0
OverallQual                 0
OverallCond                 0
YearBuilt                   0
YearRemodAdd                0
Exterior1st                 0
Exterior2nd                 0
MasVnrArea                  0
ExterQual                   0
ExterCond                   0
BsmtExposure                0
BsmtFinSF1                  0
BsmtFinSF2                  0
BsmtUnfSF                   0
TotalBsmtSF                 0
1stFlrSF                    0
2ndFlrSF                    0
LowQualFinSF                0
BsmtFullBath                0
BsmtHalfBath                0
FullBath                    0
HalfBath                    0
BedroomAbvGr                0
KitchenAbvGr                0
KitchenQual                 0
TotRmsAbvGrd                0
Fireplaces                  0
FireplaceQu                 0
GarageYrBl

In [12]:
y.shape

NameError: name 'y' is not defined

In [None]:

y=df['SalePrice']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

In [None]:
lm=LinearRegression()
lm.fit(x_train,y_train)
lm.fit(x_test, y_test)
print('The R2 for this training model is :',lm.score(x_train,y_train))
print('The R2 for this test model is :',lm.score(x_test, y_test))
print('The intercept for this model is:', lm.intercept_)
print('The coefficients for this model are:', lm.coef_)
print('The prediction with linear model:', lm.predict(x))