# Reading the data

In [5]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
import math
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


  from numpy.core.umath_tests import inner1d


In [6]:
PATH = "data/housing/"

In [8]:
df_raw = pd.read_csv(f'{PATH}house_train.csv', low_memory = False,
                    )

In [12]:
df_raw.head(4)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000


In [20]:
df_raw.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [42]:
testing_raw = pd.read_csv(f'{PATH}house_test.csv', low_memory = False)

In [43]:
testing_raw.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [49]:
testing_raw.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
1458,2919,60,RL,74.0,9627,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,11,2006,WD,Normal


## To prevent any unseen strings where it is normally ints, create a function that checks each cell in a column to see if it is a string and viceversa

In [30]:
def check_strings(fld):
    for n in fld:
        if type(n) is str:
            print(n)

In [33]:
def check_ints(fld):
    for n in fld:
        if type(n) is int or type(n) is float:
            print(n)

# Feature Engineering
## The most important column to predict would be SalePrice

### This one for the most part is already sanitized, we can just create a category for all bits of df_raw.

In [39]:
def train_cats(df):
    for n, c in df.items():#row, object
        if is_string_dtype(c):#checks if string.
            df[n] = c.astype('category').cat.as_ordered()#sets the categorical value for ordering.

In [40]:
train_cats(df_raw)

In [50]:
def proc_df(df, y_fld, skip_flds =None, do_scale = False,
           preproc_fn = None, max_n_cat = None, subset = None):
    if not skip_flds:#copies df, grabs y values, drops the original, and then does fix_missing
        skip_flds = []
    if subset:
        df= get_sample(df, subset)
    df = df.copy()
    if preproc_fn:
        preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds+[y_fld], axis=1, inplace =True)
    
    for n, c in df.items():
        fix_missing(df, c, n)
    if do_scale:
        mapper = scale_vars(df)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    res = [pd.get_dummies(df, dummy_na = True), y]#discuss later
    if not do_scale: return res
    return res + [mapper]

def fix_missing(df, col, name):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum():
            df[name+'_na'] = pd.isnull(col)
        df[name] = col.fillna(col.median())# replaced with median, and replaces with new column telling what's missing
        
def numericalize(df, col, name, max_n_cat):
    if not is_numeric_dtype(col) and (max_n_cat is None or col.nunique()>max_n_cat):
        df[name] = col.cat.codes+1# if it isn't numeric, we replace with its codes.
    

In [51]:
df, y =proc_df(df_raw, 'SalePrice')

# RMSE creation

In [45]:
def rmse(x,y):
    return math.sqrt(((x-y)**2).mean())
def print_score(m):
    res = [rmse(m.predict(X_train), Y_train), rmse(m.predict(X_valid), Y_valid),
          m.score(X_train, Y_train), m.score(X_valid, Y_valid)]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [46]:
def split_vals(a, n):
    return a[:n].copy(), a[n: ].copy()

In [52]:
n_valid = 1450 #same as Kaggle's test size
n_trn = len(df)- n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
Y_train, Y_valid = split_vals(y, n_trn)

X_train.shape, Y_train.shape, X_valid.shape

((10, 83), (10,), (1450, 83))

In [53]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, Y_train)
print_score(m)

Wall time: 117 ms
[21923.5558247288, 61059.442807571, 0.851793419318887, 0.4107693198295206]


# Outputting into CSV for submission