### 1) Import libraries 

In [12]:
# Data preprocessing
import pandas as pd

# Efficient linear algebra
import numpy as np

# ML modelling
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.svm import LinearSVR
# NOTE(review): `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20.
# `KFold` now lives in `sklearn.model_selection` (already used below for `train_test_split`), but its
# constructor signature differs -- confirm how `KFold` is called later in the notebook before switching.
from sklearn.cross_validation import KFold;

# Train-Dev Set Split
from sklearn.model_selection import train_test_split

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [13]:
# Ignore warnings: install a blanket 'ignore' filter (no category/module restriction is given),
# so warnings raised by the cells that run after this one are not displayed.
# NOTE(review): this also hides deprecation and pandas chained-assignment warnings -- consider
# narrowing the filter to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

### 2) Load data (Train and Test sets from assignment 1) 

In [17]:
# Read the merged 2016 train and test CSV files.
# `open_df` is the data that gets split into training and dev sets further down;
# `donttouch_df` holds the test file (presumably kept aside for final evaluation -- confirm).
open_df = pd.read_csv("../datasets_stanford/merged_train_2016.csv")
donttouch_df = pd.read_csv("../datasets_stanford/merged_test_2016.csv")

# Store the parcel IDs for easy access.
# `train_df` does not exist yet at this point (it is only created by the train/dev split in a
# later cell), so the column is read from the frame that was just loaded.
parcelIDs = open_df['parcelid']

In [18]:
# Preview the first three rows of the frame loaded from the train CSV
open_df.head(3)

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11054876,-0.0274,2016-12-09,,,,2.0,3.0,,7.0,...,,,177000.0,500000.0,2015.0,323000.0,6327.16,,,60371170000000.0
1,17180948,-0.0274,2016-05-23,,,,2.0,4.0,,,...,1.0,,192776.0,321293.0,2015.0,128517.0,3842.78,,,61110040000000.0
2,11712463,-0.007,2016-01-05,,,,2.0,3.0,,4.0,...,,,55871.0,87872.0,2015.0,32001.0,3449.38,,,60372360000000.0


In [19]:
# (number of rows, number of columns) of the frame loaded from the train CSV
open_df.shape

(72220, 60)

Let's split *open_df* into two parts: the training set (80% of *open_df*) and the dev set (20% of *open_df*).

In [20]:
# Hold out 20% of the rows of `open_df` as the dev set and keep the remaining 80% for training.
# The fixed random_state makes the shuffle reproducible between runs.
train_df, dev_df = train_test_split(
    open_df,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Report the dimensions and the container type of the training split
for label, value in (
    ("Shape of training dataset", train_df.shape),
    ("Type of train_df", type(train_df)),
):
    print(label, value)

Shape of training dataset (57776, 60)
Type of train_df <class 'pandas.core.frame.DataFrame'>


In [23]:
# Report the dimensions and the container type of the dev split.
# The labels used to read "training dataset" / "train_df" even though the values come from `dev_df`.
print("Shape of dev dataset", dev_df.shape)
print("Type of dev_df", type(dev_df))

Shape of training dataset (14444, 60)
Type of train_df <class 'pandas.core.frame.DataFrame'>


### 3) Data preprocessing 

In [25]:
# Create indicator features for covariates that have lots of missing data:
# 1.0 when the source column holds a value, 0.0 when it is missing.
# The training frame produced by the train/dev split is named `train_df`; the bare name `train`
# is not defined in any earlier cell, so every read and write goes through `train_df`.
# `notnull()` is the vectorized equivalent of the per-element NaN test and also works on
# non-float (object) columns, where `np.isnan` would raise.
train_df['has_pool'] = train_df['poolcnt'].notnull().astype(float)
train_df['has_airconditioning'] = train_df['airconditioningtypeid'].notnull().astype(float)
train_df['has_basement'] = train_df['basementsqft'].notnull().astype(float)
train_df['has_hottuborspa'] = train_df['hashottuborspa'].notnull().astype(float)

15071   NaN
12154   NaN
27687   NaN
41103   NaN
62131   NaN
Name: basementsqft, dtype: float64

In [None]:
# Missing-value indicator features (1.0 = value present, 0.0 = missing).
# These are presence flags rather than a one-hot encoding, and they recompute the same four
# columns as the preceding cell. All access goes through `train_df`, the training split;
# the bare name `train` is not defined in any earlier cell.
train_df['has_basement'] = train_df['basementsqft'].notnull().astype(float)
train_df['has_hottuborspa'] = train_df['hashottuborspa'].notnull().astype(float)
train_df['has_pool'] = train_df['poolcnt'].notnull().astype(float)
train_df['has_airconditioning'] = train_df['airconditioningtypeid'].notnull().astype(float)

# Columns to be consolidated: a missing yard-building area is treated as 0 square feet, then the
# two area columns are added into a single feature.
# Both operands are kept as floats; the earlier int cast on only one of them would have silently
# truncated any fractional area in that column while leaving the other untouched.
train_df['yardbuildingsqft17'] = train_df['yardbuildingsqft17'].fillna(0).astype(float)
train_df['yardbuildingsqft26'] = train_df['yardbuildingsqft26'].fillna(0).astype(float)
train_df['yard_building_square_feet'] = train_df['yardbuildingsqft17'] + train_df['yardbuildingsqft26']

# Assume some more friendly feature names.
# A single mapping replaces the one-call-per-column renames, and it is applied to `train_df`
# (the training split); the bare name `train` is not defined in any earlier cell.
# Keys that are absent from the frame are ignored by `rename`, so re-running this is harmless.
train_df = train_df.rename(columns={
    'fireplacecnt': 'fireplace_count',
    'bedroomcnt': 'bedroom_count',
    'bathroomcnt': 'bathroom_count',
    'calculatedfinishedsquarefeet': 'square_feet',
    'garagecarcnt': 'garage_car_count',
    'garagetotalsqft': 'garage_square_feet',
    'hashottuborspa': 'has_hottub_or_spa',
    'landtaxvaluedollarcnt': 'land_tax',
    'lotsizesquarefeet': 'lot_size_square_feet',
    'taxvaluedollarcnt': 'tax_value',
    'taxamount': 'tax_amount',
    'structuretaxvaluedollarcnt': 'structure_tax_value',
    'yearbuilt': 'year_built',
    'roomcnt': 'room_count',
})