In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [2]:
# Load the two competition files from the local ./datasets folder.
# `train` includes the 'SalePrice' target used below; `test` is the frame we predict on.
train = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

In [3]:
train.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [4]:
# Average 'Lot Frontage' per neighborhood, as a plain {neighborhood: mean} dict.
# Neighborhoods with no recorded frontage at all come out as NaN.
lot_frontage_dict = (
    train['Lot Frontage']
    .groupby(train['Neighborhood'])
    .mean()
    .to_dict()
)
lot_frontage_dict

{'Blmngtn': 47.25,
 'Blueste': 27.666666666666668,
 'BrDale': 21.31578947368421,
 'BrkSide': 56.357142857142854,
 'ClearCr': 92.5,
 'CollgCr': 70.75324675324676,
 'Crawfor': 69.27272727272727,
 'Edwards': 66.93023255813954,
 'Gilbert': 72.02564102564102,
 'Greens': 40.0,
 'GrnHill': nan,
 'IDOTRR': 61.890625,
 'Landmrk': nan,
 'MeadowV': 26.80952380952381,
 'Mitchel': 75.47692307692307,
 'NAmes': 75.19245283018869,
 'NPkVill': 27.625,
 'NWAmes': 80.96551724137932,
 'NoRidge': 88.30555555555556,
 'NridgHt': 86.29411764705883,
 'OldTown': 62.802547770700635,
 'SWISU': 58.38709677419355,
 'Sawyer': 72.77464788732394,
 'SawyerW': 70.49333333333334,
 'Somerst': 63.98290598290598,
 'StoneBr': 60.05882352941177,
 'Timber': 78.5945945945946,
 'Veenker': 67.84615384615384}

In [6]:
train.shape

(2051, 81)

In [7]:
test.shape

(878, 80)

In [8]:
# pd.set_option controls how DataFrames are rendered in the notebook.
# Use the fully qualified option names: in newer pandas the bare patterns
# 'max_columns' / 'max_rows' also match the 'styler.render.*' options, so
# set_option raises OptionError ("Pattern matched multiple keys").

pd.set_option('display.max_columns', 99)  # show up to 99 columns before truncating
pd.set_option('display.max_rows', 30)     # show up to 30 rows before truncating

In [9]:
train.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,1Fam,2Story,6,8,1976,2005,Gable,CompShg,HdBoard,Plywood,BrkFace,289.0,Gd,TA,CBlock,TA,TA,No,GLQ,533.0,Unf,0.0,192.0,725.0,GasA,Ex,Y,SBrkr,725,754,0,1479,0.0,0.0,2,1,3,1,Gd,6,Typ,0,,Attchd,1976.0,RFn,2.0,475.0,TA,TA,Y,0,44,0,0,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,SawyerW,Norm,Norm,1Fam,2Story,7,5,1996,1997,Gable,CompShg,VinylSd,VinylSd,BrkFace,132.0,Gd,TA,PConc,Gd,TA,No,GLQ,637.0,Unf,0.0,276.0,913.0,GasA,Ex,Y,SBrkr,913,1209,0,2122,1.0,0.0,2,1,4,1,Gd,8,Typ,1,TA,Attchd,1997.0,RFn,2.0,559.0,TA,TA,Y,0,74,0,0,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,7,1953,2007,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,Gd,CBlock,TA,TA,No,GLQ,731.0,Unf,0.0,326.0,1057.0,GasA,TA,Y,SBrkr,1057,0,0,1057,1.0,0.0,1,0,3,1,Gd,5,Typ,0,,Detchd,1953.0,Unf,1.0,246.0,TA,TA,Y,0,52,0,0,0,0,,,,0,1,2010,WD,109000


In [10]:
# Predictor columns; the same list is used to slice both `train` and `test`,
# so the model sees identical columns at fit time and at predict time.
features = ['Lot Area', 'Gr Liv Area']

In [11]:
# Feature matrix (columns listed in `features`) and the target column.
X, y = train[features], train['SalePrice']

In [12]:
# Hold out part of the labelled data for an honest score.
# random_state pins the shuffle so the split -- and every number computed from it
# below -- is identical on each Restart & Run All.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state=42)

In [13]:
X_train.shape

(1538, 2)

In [14]:
X

Unnamed: 0,Lot Area,Gr Liv Area
0,13517,1479
1,11492,2122
2,7922,1057
3,9802,1444
4,14235,1445
...,...,...
2046,11449,1728
2047,12342,861
2048,7558,1913
2049,10400,1200


In [35]:

# TODO: run cross-validation scores on X_train, y_train

# TODO: try different feature combinations

# TODO: try interaction terms or any other modeling techniques

# After all of that, instantiate the estimator ...
lr = LinearRegression()

# ... and fit it on the training split only; the holdout rows stay unseen for scoring.
lr.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [36]:
# Did our model perform well? For LinearRegression, score() reports R^2,
# measured here on the holdout split that the model was not fit on.

lr.score(X_holdout, y_holdout)

0.49289282370108617

In [37]:
## optional: combine ALL data and refit model


In [40]:
# Slice the test frame with the same `features` list used to build X,
# so its columns line up with what `lr` was fit on.
X_test = test[features]

In [41]:
X_test

Unnamed: 0,Lot Area,Gr Liv Area
0,9142,1928
1,9662,1967
2,17104,1496
3,8520,968
4,9500,1394
...,...,...
873,8000,1877
874,14670,1988
875,8250,1211
876,9000,864


In [43]:
lr.coef_

# One coefficient (slope) per column of X, in the order given by `features`:
# the left number belongs to 'Lot Area', the right number to 'Gr Liv Area'.

array([  0.95884801, 106.1625428 ])

In [44]:
# Predict a sale price for each row of the test feature matrix with the fitted model.
predictions = lr.predict(X_test)

In [46]:
predictions # this is a NumPy array (not a pandas Series) of predicted sale prices

array([226045.1749541 , 230684.11508933, 187817.30433296, 123532.73040292,
       169697.64468708, 130339.72339314, 136799.21286087, 156115.9708198 ,
       194321.42657838, 158088.23227234, 148677.55298449, 129910.75410919,
       169416.07127489, 316120.78243772, 217810.96872143, 163542.97425077,
       175402.97389977, 135687.54665712, 211179.75082335, 214476.95668433,
       133036.95998533, 112471.69014348, 191945.12589019, 138056.76259307,
       151113.71619551, 111206.96961617, 209106.56846924, 207012.08591806,
       132931.18652697,  87082.87563025, 110723.41004123, 146825.56720844,
       304085.63864136, 147245.95944809, 181699.01466882, 180043.62046128,
       193802.17835726, 127177.85373249, 117731.99438084, 163786.46332898,
       146329.36479466, 223573.08123655, 160559.66731896, 171257.41795102,
       175835.15049544, 156750.37543858, 212991.34548498, 122523.87220616,
       117592.66410653, 130198.40851267, 154139.42370958, 262004.28308843,
       193010.94520407, 1

In [48]:
# Last step is to put the predictions in a form we can submit.

# NOTE: this adds a 'SalePrice' column to `test` in place. The column names have to
# match the expected submission format exactly (presumably 'Id' and 'SalePrice' --
# confirm against the competition's sample submission).
test['SalePrice'] = predictions

In [51]:
# Alternative approach: pull just the two submission columns out of `test`.

submission = test.loc[:, ['Id', 'SalePrice']]

In [52]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv(path_or_buf='./submissions01_08.csv', index=False)