In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Read the data
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

In [2]:
X_full.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [3]:
# Obtain target and predictors
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

In [4]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [5]:
X_train.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


In [6]:
from sklearn.ensemble import RandomForestRegressor

# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

In [7]:
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

model_results = {}

for i in range(0, len(models)):
    mae = score_model(models[i])
    model_results['model_' + str(i+1)] = mae
    print("Model %d MAE: %d" % (i+1, mae))




print(min(model_results, key = model_results.get))

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706
model_3


In [8]:
best_model = model_3

In [9]:
my_model = RandomForestRegressor(n_estimators=190, criterion='mae', min_samples_split=3, random_state=0)

my_model.fit(X_train, y_train)
my_pred = my_model.predict(X_valid)
my_mae  = score_model(my_model)

my_results = []
my_mae

23383.727956020186

In [35]:
# To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [36]:
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,90.0,11694,9,5,2007,2007,452.0,48,0,...,0,108,0,0,260,0,0,7,2007,314813
871,20,60.0,6600,5,5,1962,1962,0.0,0,0,...,0,0,0,0,0,0,0,8,2009,109500
93,30,80.0,13360,5,7,1921,2006,0.0,713,0,...,0,0,44,0,0,0,0,8,2009,163500
818,20,,13265,8,5,2002,2002,148.0,1218,0,...,150,59,0,0,0,0,0,7,2008,271000
303,20,118.0,13704,7,5,2001,2002,150.0,0,0,...,468,81,0,0,0,0,0,1,2006,205000


In [13]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(1168, 37)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [14]:
missing_val_count_by_column[missing_val_count_by_column > 0].sum()

276

In [15]:
# Fill in the line below: get names of columns with missing values
missing_val_series = missing_val_count_by_column[missing_val_count_by_column > 0]
missing_val_cols = list(missing_val_count_by_column[missing_val_count_by_column > 0].index.values)

# Fill in the lines below: drop columns in training and validation data



reduced_X_train = X_train.drop(missing_val_cols, axis=1)
reduced_X_valid = X_valid.drop(missing_val_cols, axis=1)


In [16]:
missing_val_mean_by_column = (X_full[missing_val_cols].mean())
missing_val_mean_by_column = 


SyntaxError: invalid syntax (<ipython-input-16-f0ab62dc18b8>, line 2)

In [39]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

In [40]:
imputed_X_train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,20.0,90.000000,11694.0,9.0,5.0,2007.0,2007.0,452.0,48.0,0.0,...,0.0,108.0,0.0,0.0,260.0,0.0,0.0,7.0,2007.0,314813.0
1,20.0,60.000000,6600.0,5.0,5.0,1962.0,1962.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,109500.0
2,30.0,80.000000,13360.0,5.0,7.0,1921.0,2006.0,0.0,713.0,0.0,...,0.0,0.0,44.0,0.0,0.0,0.0,0.0,8.0,2009.0,163500.0
3,20.0,69.614017,13265.0,8.0,5.0,2002.0,2002.0,148.0,1218.0,0.0,...,150.0,59.0,0.0,0.0,0.0,0.0,0.0,7.0,2008.0,271000.0
4,20.0,118.000000,13704.0,7.0,5.0,2001.0,2002.0,150.0,0.0,0.0,...,468.0,81.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,205000.0
5,20.0,62.000000,7500.0,7.0,5.0,2004.0,2005.0,0.0,410.0,0.0,...,0.0,113.0,0.0,0.0,0.0,0.0,0.0,10.0,2009.0,185000.0
6,20.0,84.000000,8658.0,6.0,5.0,1965.0,1965.0,101.0,643.0,0.0,...,0.0,138.0,0.0,0.0,0.0,0.0,0.0,12.0,2006.0,160000.0
7,160.0,24.000000,2572.0,7.0,5.0,1999.0,1999.0,0.0,604.0,0.0,...,0.0,44.0,0.0,0.0,0.0,0.0,0.0,5.0,2010.0,155000.0
8,180.0,21.000000,1596.0,4.0,5.0,1973.0,1973.0,0.0,462.0,0.0,...,120.0,101.0,0.0,0.0,0.0,0.0,0.0,11.0,2009.0,91000.0
9,50.0,60.000000,6000.0,6.0,6.0,1941.0,1950.0,0.0,375.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2007.0,131000.0


In [50]:
my_imputer = SimpleImputer(strategy = 'median')
final_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Define and fit model
model = RandomForestRegressor(n_estimators=60,min_samples_split=3,criterion='mae', random_state=0)
model.fit(final_X_train, y_train)

# Get validation predictions and MAE
preds_valid = model.predict(final_X_valid)
print("MAE (Your approach):")
print(mean_absolute_error(y_valid, preds_valid))

MAE (Your approach):
876.1362728310503


In [56]:
final_X_test = pd.DataFrame(my_imputer.fit_transform(X_test_full))
model = RandomForestRegressor(n_estimators=60,min_samples_split=3,criterion='mae', random_state=0)
model.fit(final_X_train, y_train)

# Fill in the line below: get test predictions
preds_test = model.predict(final_X_test)


AttributeError: 'DataFrame' object has no attribute 'dtype'

In [57]:
print(X_train.shape)
print(final_X_train.shape)
print(X_test_full.shape)
print(final_X_test.shape)

(1168, 37)
(1168, 37)
(1459, 79)
(1459, 36)


In [22]:
preds_valid = model.predict(final_X_valid)
print("MAE (Your approach):")
print(mean_absolute_error(y_valid, preds_valid))

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,730.0,140,0,0,0,120,0,0,6,2010
1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,312.0,393,36,0,0,0,0,12500,6,2010
1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,482.0,212,34,0,0,0,0,0,3,2010
1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,470.0,360,36,0,0,0,0,0,6,2010
1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,506.0,0,82,0,0,144,0,0,1,2010
1466,60,75.0,10000,6,5,1993,1994,0.0,0.0,0.0,...,440.0,157,84,0,0,0,0,0,4,2010
1467,20,,7980,6,7,1992,2007,0.0,935.0,0.0,...,420.0,483,21,0,0,0,0,500,3,2010
1468,60,63.0,8402,6,5,1998,1998,0.0,0.0,0.0,...,393.0,0,75,0,0,0,0,0,5,2010
1469,20,85.0,10176,7,5,1990,1990,0.0,637.0,0.0,...,506.0,192,0,0,0,0,0,0,2,2010
1470,20,70.0,8400,4,5,1970,1970,0.0,804.0,78.0,...,525.0,240,0,0,0,0,0,0,4,2010
