In [101]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [154]:
ames = pd.read_csv('data/ames_housing.csv')

In [103]:
ames.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [104]:
ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

## This is a huge dataset, and I'm assuming you aren't asking us to make the best possible model, so below, I've pulled in enough features that I think are relevant and useful to a linear model.  I've also decided to simply drop nulls here since I want to see the overall improvement in the model by filling certain nulls with appropriate fills.

In [128]:
ames_orig_X = ames[['LotFrontage', 'LotArea', 'OverallQual', 'TotalBsmtSF', 'SalePrice',
                    'GrLivArea', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'PoolArea', 'YrSold'
                   ]].dropna()
ames_orig_y = np.log(ames_orig_X.SalePrice)
ames_orig_X = ames_orig_X.drop('SalePrice', axis=1)

In [129]:
X_train, X_test, y_train, y_test = train_test_split(ames_orig_X, ames_orig_y)
lr.fit(X_train, y_train)
orig_pred = lr.predict(X_test)
print('RMSE is {:.4f}'.format(np.sqrt(mean_squared_error(y_test, orig_pred))))

RMSE is 0.2217


##  Even without any feature engineering or filling, the model seems to be pretty good.

In [113]:
ames['PoolArea'].value_counts()

0      1453
738       1
648       1
576       1
555       1
519       1
512       1
480       1
Name: PoolArea, dtype: int64

In [114]:
ames[ames['PoolArea']!=0]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
197,198,75,RL,174.0,25419,Pave,,Reg,Lvl,AllPub,...,512,Ex,GdPrv,,0,3,2006,WD,Abnorml,235000
810,811,20,RL,78.0,10140,Pave,,Reg,Lvl,AllPub,...,648,Fa,GdPrv,,0,1,2006,WD,Normal,181000
1170,1171,80,RL,76.0,9880,Pave,,Reg,Lvl,AllPub,...,576,Gd,GdPrv,,0,7,2008,WD,Normal,171000
1182,1183,60,RL,160.0,15623,Pave,,IR1,Lvl,AllPub,...,555,Ex,MnPrv,,0,7,2007,WD,Abnorml,745000
1298,1299,60,RL,313.0,63887,Pave,,IR3,Bnk,AllPub,...,480,Gd,,,0,1,2008,New,Partial,160000
1386,1387,60,RL,80.0,16692,Pave,,IR1,Lvl,AllPub,...,519,Fa,MnPrv,TenC,2000,7,2006,WD,Normal,250000
1423,1424,80,RL,,19690,Pave,,IR1,Lvl,AllPub,...,738,Gd,GdPrv,,0,8,2006,WD,Alloca,274970


In [119]:
ames['PoolQC'] = ames['PoolQC'].fillna('None')

In [121]:
ames.PoolQC.value_counts()

None    1453
Gd         3
Fa         2
Ex         2
Name: PoolQC, dtype: int64

In [8]:
ames['Alley'] = ames['Alley'].fillna("None")

In [9]:
ames['Alley'].value_counts()

None    1369
Grvl      50
Pave      41
Name: Alley, dtype: int64

In [10]:
ames['FireplaceQu'].value_counts(dropna=False)

NaN    690
Gd     380
TA     313
Fa      33
Ex      24
Po      20
Name: FireplaceQu, dtype: int64

In [140]:
ames['FireplaceQu'] = ames['FireplaceQu'].fillna("None")

In [141]:
ames['MiscFeature'].value_counts()

Shed    49
Othr     2
Gar2     2
TenC     1
Name: MiscFeature, dtype: int64

In [142]:
ames['MiscFeature'] = ames['MiscFeature'].fillna("None")

In [166]:
ames = pd.read_csv('data/ames_housing.csv')

In [167]:
ames = ames.replace({"BsmtCond": {None:0, "No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}})
ames['BsmtCond'].value_counts(dropna=False)

3    1311
4      65
2      45
0      37
1       2
Name: BsmtCond, dtype: int64

In [168]:
ames = ames.replace({"BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5, None: 0}})
ames['BsmtQual'].value_counts(dropna=False)

3    649
4    618
5    121
0     37
2     35
Name: BsmtQual, dtype: int64

In [169]:
ames = ames.replace({"GarageQual" : {"None": 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5}})
ames['GarageQual'].value_counts()

3.0    1311
2.0      48
4.0      14
1.0       3
5.0       3
Name: GarageQual, dtype: int64

In [170]:
ames = ames.replace({"GarageCond" : {"None" : 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5}})
ames.GarageCond.value_counts()

3.0    1326
2.0      35
4.0       9
1.0       7
5.0       2
Name: GarageCond, dtype: int64

In [171]:
ames['PoolQC'] = ames.replace({"PoolQC" : {"None" : 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5}})
ames.PoolQC.value_counts()

1460    1
479     1
481     1
482     1
483     1
484     1
485     1
486     1
487     1
488     1
489     1
490     1
491     1
492     1
493     1
494     1
495     1
496     1
497     1
498     1
499     1
480     1
478     1
501     1
477     1
458     1
459     1
460     1
461     1
462     1
       ..
996     1
997     1
998     1
999     1
1000    1
1001    1
1002    1
983     1
982     1
981     1
970     1
962     1
963     1
964     1
965     1
966     1
967     1
968     1
969     1
971     1
980     1
972     1
973     1
974     1
975     1
976     1
977     1
978     1
979     1
1       1
Name: PoolQC, Length: 1460, dtype: int64

In [174]:
ames.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,1,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,2,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,3,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,4,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,5,,,0,12,2008,WD,Normal,250000


# Note to self -- life will be a lot easier if you create a subset of the ames DF containing all of the categoricals for which you want to create dummies and then just use pd.get_dummies on the whole DF, otherwise you'll have to mess around with merging.

# 

# Delete this cell before turning in assignment!

In [173]:
subclass = pd.get_dummies(ames.MSSubClass, drop_first=True)
subclass

Unnamed: 0,30,40,45,50,60,70,75,80,85,90,120,160,180,190
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [27]:
ames['BasementOverall'] = ames['BsmtCond'] * ames['BsmtQual']

In [28]:
ames['PoolOverall'] = ames['PoolArea'] * ames['PoolQC']

In [30]:
ames['GarageOverall'] = ames['GarageCond'] * ames['GarageArea'] * ames['GarageCars']

**PROBLEMS**


Continue to add additional features that combine other existing ones in a sensible way.  Here are a few additional ideas:

```python
ames['OverallGrade'] = ames['OverallQual'] * ames['OverallCond']
ames['GarageOverall'] = ames['GarageQual'] * ames['GarageCond']
ames['PoolOverall'] = ames['PoolArea'] * ames['PoolQC']
```

Be sure you've coded these as numeric vectors before creating columns based on arithmetic involving them.  Finally, make sure you can succinctly carry out and communicate these transformations.  Take all of your numerical columns and investigate their correlation with our target sales.

In [37]:
ames_orig = pd.read_csv('data/ames_housing.csv')
ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 84 columns):
Id                 1460 non-null int64
MSSubClass         1460 non-null int64
MSZoning           1460 non-null object
LotFrontage        1201 non-null float64
LotArea            1460 non-null int64
Street             1460 non-null object
Alley              1460 non-null object
LotShape           1460 non-null object
LandContour        1460 non-null object
Utilities          1460 non-null object
LotConfig          1460 non-null object
LandSlope          1460 non-null object
Neighborhood       1460 non-null object
Condition1         1460 non-null object
Condition2         1460 non-null object
BldgType           1460 non-null object
HouseStyle         1460 non-null object
OverallQual        1460 non-null int64
OverallCond        1460 non-null int64
YearBuilt          1460 non-null int64
YearRemodAdd       1460 non-null int64
RoofStyle          1460 non-null object
RoofMatl           

In [34]:
lr = LinearRegression()

**PROBLEM**

Using the `sklearn` implementation of `LinearRegression()`, create a test and train set from your housing data.  To begin, fit a linear model on the **Logarithm** of the sales column with the `GrLivArea` feature.  Use this as your baseline to compare your transformations to.  

Include the transformations from above into a second linear model and try it out on the test set. Did the performance improve with your adjustments and transformations? 

Add polynomial features into the mix and see if you can get better improvement still.

In [None]:
lr = 