# Intro to Machine Learning
[Kaggle Micro-Course](https://www.kaggle.com/learn/intro-to-machine-learning)

#### Acquire and Peruse the Data

In [10]:
import pandas as pd

In [14]:
# Load the Iowa house-price training file and preview the first rows.
train_csv = "../assets/iowa-house-prices/train.csv"
home_df = pd.read_csv(train_csv)
home_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
home_df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


#### Identify NaNs
*Let's just avoid them in this example...*

In [16]:
# Count missing values per column, then show only the columns that have any.
na_df = home_df.isna().sum().to_frame(name="Number of NaNs")
na_df.loc[na_df["Number of NaNs"].ne(0)]

Unnamed: 0,Number of NaNs
LotFrontage,259
Alley,1369
MasVnrType,8
MasVnrArea,8
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1


#### Set up Independent & Dependent Variables

In [17]:
# Predictor columns fed to the model.
features = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]
# X holds the predictors, y the sale price to be predicted.
X, y = home_df[features], home_df["SalePrice"]

#### Split the Training Set

In [18]:
from sklearn.model_selection import train_test_split

# Hold out a validation set; random_state=0 keeps the split reproducible.
# No test_size/train_size is passed, so scikit-learn's default proportions apply.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

#### ...and Train the Model

In [19]:
from sklearn.tree import DecisionTreeRegressor

# Fit a tree with no size cap on the training split, then predict the held-out rows.
home_model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
home_model_ = home_model.predict(X_val)

#### Sanity Check That We're Getting Results

In [20]:
# First five actual sale prices in the validation split (positional slice).
y_val.iloc[:5]

529    200624
491    133000
459    110000
279    192000
655     88000
Name: SalePrice, dtype: int64

In [21]:
# First five predicted prices for those same validation rows.
home_model_[:5]

array([335000., 205000., 124000., 207500.,  91500.])

#### Assess the Accuracy of the Model
*Thou shalt not fall prey to in-sample pitfalls!*

In [22]:
from sklearn.metrics import mean_absolute_error

# scikit-learn metrics take (y_true, y_pred); use the same order get_mae uses below.
# In-sample error: MAE on the rows the tree was fitted to.
train_mae = mean_absolute_error(y_train, home_model.predict(X_train))
# Out-of-sample error: MAE on the held-out validation rows.
val_mae = mean_absolute_error(y_val, home_model_)

print("Training MAE: {:0.2f} | Validation MAE: {:0.2f}".format(train_mae, val_mae))

Training MAE: 72.79 | Validation MAE: 32410.82


#### Tune the Model
*aka the fine line between underfitting and overfitting*

In [23]:
def get_mae(max_leaf_nodes, X_train, X_val, y_train, y_val):
    """Return the validation MAE of a tree capped at ``max_leaf_nodes`` leaves.

    A fresh ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``X_train``/``y_train``; its predictions for ``X_val`` are scored against
    ``y_val`` with ``mean_absolute_error``.
    """
    capped_tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    val_predictions = capped_tree.fit(X_train, y_train).predict(X_val)
    return mean_absolute_error(y_val, val_predictions)

In [24]:
# Sweep every leaf-node cap from 5 through 999 and record its validation MAE.
scores = {
    n_leaves: get_mae(n_leaves, X_train, X_val, y_train, y_val)
    for n_leaves in range(5, 1000)
}
# Pick the cap with the smallest validation MAE (the first one wins on ties).
optimal_leaf_nodes = min(scores, key=lambda n_leaves: scores[n_leaves])
print("Minimal MAE for max_leaf_nodes = {}".format(optimal_leaf_nodes))

Minimal MAE for max_leaf_nodes = 82


In [25]:
# Build the (leaf count, MAE) table straight from the dict items. This avoids
# creating a throwaway empty DataFrame just to reach the from_dict classmethod,
# and names the columns in one step instead of patching them afterwards.
mae_df = pd.DataFrame(list(scores.items()), columns=["# Leaf Nodes", "MAE"])
mae_df.head()

Unnamed: 0,# Leaf Nodes,MAE
0,5,35190.336708
1,6,33967.190367
2,7,33636.877992
3,8,31908.715163
4,9,31416.589415


In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

# One wide axes: every candidate as a scatter point, the best one marked in red.
fig, ax = plt.subplots(figsize=(15, 5))
sns.scatterplot(data=mae_df, x="# Leaf Nodes", y="MAE", ax=ax)
best_mae = scores[optimal_leaf_nodes]
ax.plot(optimal_leaf_nodes, best_mae, "ro")
# Offset the label to the right so it does not sit on top of the marker.
ax.text(optimal_leaf_nodes + 10, best_mae, "Optimal # Leaf Nodes")
ax.set_title("Optimizing # of Leaf Nodes")
plt.show()

<Figure size 1500x500 with 1 Axes>

#### Create Model Using Optimal Parameters & Fit to Data

In [27]:
# Rebuild the tree with the leaf-node cap that minimised validation MAE.
home_model = DecisionTreeRegressor(max_leaf_nodes=optimal_leaf_nodes, random_state=0)
# Bare expression so the notebook displays the estimator's parameters.
home_model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=82, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [29]:
# Refit the tuned tree on the full dataset and predict those same rows.
home_model_ = home_model.fit(X, y).predict(X)
home_model_[:5]

array([209133.65384615, 146415.0075188 , 209133.65384615, 143297.46666667,
       280931.25      ])

In [31]:
# NOTE: home_model was fitted on these same X/y rows, so this is an in-sample
# error, not an estimate of how the model performs on unseen houses.
# Arguments follow scikit-learn's (y_true, y_pred) order, as in get_mae.
mae = mean_absolute_error(y, home_model_)
print("MAE: {:0.2f}".format(mae))

MAE: 17776.10
