# Kaggle || House Prices: Advanced Regression Techniques

## Step 1: Import Packages

In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import cross_val_score
#sns.set_style('whitegrid')

# IPython magics (not plain Python): request high-DPI ("retina") figure
# output from the inline backend and render matplotlib figures inside the
# notebook.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Step 2: Import Dataset

In [None]:
# Load the training data, then split the target column off so that df_train
# holds only the feature columns.
df_train = pd.read_csv('/Users/austinwhaley/github_repos/DSI-SF-4-austinmwhaley/other_datasets/kaggle_housing_train.csv')
labels = df_train['SalePrice']
# Pass the axis by keyword: the positional form drop(labels, 1) was deprecated
# in pandas 1.3 and removed in pandas 2.0.
df_train = df_train.drop(['SalePrice'], axis=1)
df_train.shape  # notebook display of (rows, columns) after removing the target

In [None]:
# Read the test split and keep its 'Id' column aside as a one-column DataFrame.
df_test = pd.read_csv(
    '/Users/austinwhaley/github_repos/DSI-SF-4-austinmwhaley/other_datasets/kaggle_housing_test.csv'
)
ids = df_test.loc[:, ['Id']]
df_test.shape  # notebook display of (rows, columns)

## Step 3: Clean Dataset

In [None]:
# Mean imputation has to run before the blanket zero-fill of the numeric
# columns, otherwise the missing values would already be gone.
for frame in (df_train, df_test):
    for col in ['GarageYrBlt']:  # columns imputed with their rounded mean
        # Series.mean() skips NaN by default; each frame uses its own mean.
        col_mean = round(frame[col].mean(), 0)
        frame[col] = frame[col].fillna(col_mean)

In [None]:
# Replace missing values with 0 in every numeric (float64 / int64) column.
for frame in (df_train, df_test):
    numeric_cols = frame.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        # Vectorised fill instead of a per-element Python lambda.
        frame[col] = frame[col].fillna(0)

In [None]:
# Replace missing values with the string 'None' in every object-typed column,
# so that "missing" becomes its own category.
for frame in (df_train, df_test):
    object_cols = frame.select_dtypes(include=['object']).columns
    for col in object_cols:
        # Vectorised fill instead of a per-element Python lambda.
        frame[col] = frame[col].fillna('None')

In [None]:
# Stack the training rows on top of the test rows (row-wise, original index
# labels kept) so both sets are processed together.
df_concat = pd.concat([df_train, df_test], axis=0)

In [None]:
# One-hot encode the categorical columns of the combined train + test frame.
h_dum = pd.get_dummies(df_concat)
# Single-argument parenthesised print is valid on both Python 2 and Python 3.
print(h_dum.shape)

In [None]:
# Split the encoded frame back into its train and test parts. df_concat put
# the training rows first, so the first len(labels) rows are the training set
# (derived from the data instead of a hard-coded row count).
y_train = labels
n_train = len(y_train)
# 'Id' is dropped from the model inputs; axis is passed by keyword because the
# positional form was removed in pandas 2.0.
features = h_dum.drop(['Id'], axis=1)
X_train = features.iloc[:n_train]
X_test = features.iloc[n_train:]

## Step 4: Exploratory Data Analysis (EDA)

## Step 5: Modeling

### 5.1: Generalized Linear Models

#### 5.1.1: Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
start_time = time.time()
# Fit on the full training set. cross_val_score clones the estimator and
# refits it on each fold, so the scores do not depend on this fit.
LR = LinearRegression(n_jobs=-1).fit(X_train, y_train)
# 10-fold cross-validation; the default score for a regressor is R^2.
scores = cross_val_score(LR, X_train, y_train, cv=10, n_jobs=-1)
# Formatted single-argument print: same output on Python 2 and Python 3.
print('%s = avg_r2' % round(np.mean(scores), 3))
print('Runtime = %s seconds' % round(time.time() - start_time, 2))

#### 5.1.2: Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV
start_time = time.time()
# Fit on the full training set. cross_val_score clones the estimator and
# refits it on each fold, so the scores do not depend on this fit.
R = RidgeCV().fit(X_train, y_train)
# NOTE(review): only 2 folds here while the other models use cv=10 -- confirm
# this is intentional before comparing scores across models.
scores = cross_val_score(R, X_train, y_train, cv=2, n_jobs=-1)
# Formatted single-argument print: same output on Python 2 and Python 3.
print('%s = avg_r2' % round(np.mean(scores), 3))
print('Runtime = %s seconds' % round(time.time() - start_time, 2))

#### 5.1.3: Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV
start_time = time.time()
# Fit on the full training set. cross_val_score clones the estimator and
# refits it on each fold, so the scores do not depend on this fit.
L = LassoCV().fit(X_train, y_train)
# 10-fold cross-validation; the default score for a regressor is R^2.
scores = cross_val_score(L, X_train, y_train, cv=10, n_jobs=-1)
# Report in the same format as the other model cells (score rounded to three
# decimals, labelled runtime); single-argument print works on Python 2 and 3.
print('%s = avg_r2' % round(np.mean(scores), 3))
print('Runtime = %s seconds' % round(time.time() - start_time, 2))

#### 5.1.4: Elastic Net Regression

In [None]:
from sklearn.linear_model import ElasticNet
start_time = time.time()
# Fit on the full training set. cross_val_score clones the estimator and
# refits it on each fold, so the scores do not depend on this fit.
EN = ElasticNet().fit(X_train, y_train)
# 10-fold cross-validation; the default score for a regressor is R^2.
scores = cross_val_score(EN, X_train, y_train, cv=10, n_jobs=-1)
# Formatted single-argument print: same output on Python 2 and Python 3.
print('%s = avg_r2' % round(np.mean(scores), 3))
print('Runtime = %s seconds' % round(time.time() - start_time, 2))

### 5.2: Support Vector Machines 

#### 5.2.1: Support Vector Regression

In [None]:
from sklearn.svm import NuSVR
start_time = time.time()
# NOTE: this rebinds the name NuSVR from the imported class to the fitted
# estimator instance. The binding is kept so that any code relying on
# `NuSVR` being the fitted model keeps working; the import above restores the
# class each time the cell is re-run.
NuSVR = NuSVR().fit(X_train, y_train)
# 10-fold cross-validation; the default score for a regressor is R^2.
scores = cross_val_score(NuSVR, X_train, y_train, cv=10, n_jobs=-1)
# Formatted single-argument print: same output on Python 2 and Python 3.
print('%s = avg_r2' % round(np.mean(scores), 3))
print('Runtime = %s seconds' % round(time.time() - start_time, 2))

### 5.3: Decision Trees

#### 5.3.1: Decision Tree Regressor

### 5.4: Ensemble Methods

#### 5.4.1: Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
start_time = time.time()
# Timing note from the author: 38 with n_jobs=-1 vs 75 with n_jobs=1 at
# n_estimators=200 (presumably seconds of runtime).
# Fit on the full training set; this fitted model is the one used for the
# final predictions. cross_val_score clones and refits per fold.
RFR = RandomForestRegressor(n_estimators=200, verbose=0, n_jobs=-1).fit(X_train, y_train)
# 10-fold cross-validation; the default score for a regressor is R^2.
scores = cross_val_score(RFR, X_train, y_train, cv=10)
# Formatted single-argument print: same output on Python 2 and Python 3.
print('%s = avg_r2' % round(np.mean(scores), 3))
print('Runtime = %s seconds' % round(time.time() - start_time, 2))

#### 5.4.2: AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
start_time = time.time()
# Fit on the full training set. cross_val_score clones the estimator and
# refits it on each fold, so the scores do not depend on this fit.
ADA = AdaBoostRegressor().fit(X_train, y_train)
# 10-fold cross-validation (R^2 by default); verbose=1 prints progress.
scores = cross_val_score(ADA, X_train, y_train, cv=10, verbose=1)
# Formatted single-argument print: same output on Python 2 and Python 3.
print('%s = avg_r2' % round(np.mean(scores), 3))
print('Runtime = %s seconds' % round(time.time() - start_time, 2))

### 5.5: Neural-Networks

#### 5.5.1: 3-Layer Fully-Connected

#### 5.5.2: 5-Layer Fully-Connected

#### 5.5.3: 10-Layer Fully-Connected

## Step 6: Visualizations

## Step 7: Conclusion / Submission

In [None]:
# Build the submission table from the random-forest predictions on the test
# features: an 'Id' column followed by the predicted 'SalePrice'.
sale_price_pred = RFR.predict(X_test)
predictions = pd.DataFrame({'SalePrice': sale_price_pred})
# The Id values are aligned to the predictions by index label.
predictions.insert(0, 'Id', ids['Id'])
predictions.head()

In [None]:
# Write the submission file with a header row and without the index column.
# NOTE(review): the inputs are read from .../github_repos/... but this writes
# under .../Desktop/... -- confirm the target directory is the intended one.
predictions.to_csv(
    '/Users/austinwhaley/Desktop/DSI-SF-4-austinmwhaley/other_datasets/kaggle_house_submission.csv',
    index=False,
    header=True,
)