# New test adapted from Kaggle
Source kernel: https://www.kaggle.com/erick5/predicting-house-prices-with-machine-learning

In [None]:
# libraries from kaggle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import skew
from scipy import stats
from scipy.stats.stats import pearsonr
from scipy.stats import norm
from collections import Counter
from sklearn.linear_model import LinearRegression,LassoCV, Ridge, LassoLarsCV,ElasticNetCV
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Global seaborn theme used by the plots in this notebook.
sns.set(style='white', context='notebook', palette='deep')
# Inline figure format and inline rendering of matplotlib figures.
%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [None]:
import sys
# Never abbreviate printed NumPy arrays, and show pandas floats with two decimals.
np.set_printoptions(threshold=float('inf'))
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the pre-built inputs.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that were produced by a trusted step of this project.
house_df = pd.read_pickle('houseAll.pkl')
jobs_df = pd.read_pickle('jobFeatures.pkl')
yearTotals = pd.read_pickle('yearTotals.pkl')

# Keep only the house attributes used in this analysis.
df = house_df[['Parcel','Year','Age','Area','Neighborhood','TAV','Bedroom_Count','BATHROOMS',
               'Total_Sq_Ft']]

# Attach the per-year job features and the yearly totals to every house row.
df = pd.merge(df, jobs_df, on = 'Year', how = 'left')
df = pd.merge(df, yearTotals, on = 'Year', how = 'left')

df = df.drop(columns = ['Job_Manual_Count', 'All Occupations'])

# Restrict to the Boise Bench area. The explicit copy makes df an independent
# frame, so the column assignment below does not write into a slice of the
# merged frame.
df = df[df['Area'] == 'BOISE BENCH'].copy()

# Convert Year to a timestamp on January 1st of that year. pd.to_datetime is used
# instead of astype('datetime64[Y]') because pandas 2.x only accepts the
# s/ms/us/ns resolutions and raises on the year unit.
df['Year'] = pd.to_datetime(df['Year'].astype(str), format = '%Y')

# Snapshot of the prepared frame; it is read back before the IQR step.
df.to_pickle('temp.pkl')
#df.info()
#df.head()

In [None]:
# print(house_df.info())
# Show the Python type of the first row's second column, then the first five rows.
print(type(df.iat[0, 1]))
df.head()

In [None]:
# Distribution of the unfiltered TAV values with a fitted normal curve overlaid.
# Do not re-run after the filtering cells below: they reassign df, so this
# plot would no longer show the raw data.
sns.distplot(df['TAV'], fit = norm)

$z\text{-score} = \frac{x-\mu}{\sigma}$, where $\mu$ is the mean and $\sigma$ the standard deviation of TAV.

In [None]:
# Z-score outlier filter: standardise TAV and keep only the rows whose
# z-score lies strictly between -3 and 3.
dfMean, dfStd = df['TAV'].mean(), df['TAV'].std()

df['Z-score'] = df['TAV'].sub(dfMean).div(dfStd)

df = df[df['Z-score'].abs() < 3]
# Narrow view for inspecting the surviving rows.
temp = df[['Parcel', 'Year', 'TAV', 'Z-score']]
#temp.tail(50)

In [None]:
# TAV distribution after the z-score filter, with a fitted normal curve overlaid.
# Do not re-run once the IQR cell has reloaded df from temp.pkl: the plot
# would then show the unfiltered snapshot instead of the z-score result.
sns.distplot(df['TAV'], fit = norm)

## Figure out IQR
$$IQR = Q3-Q1$$
Outliers are the values $x$ that fall outside the fences:

$$x < Q1 - 1.5 \times IQR \quad \text{or} \quad x > Q3 + 1.5 \times IQR$$

In [None]:
# Start again from the saved snapshot so the z-score filtering above is discarded.
df = pd.read_pickle('temp.pkl')

# Quartiles of TAV and the interquartile range.
dfq1 = df['TAV'].quantile(0.25)
dfq3 = df['TAV'].quantile(0.75)
dfiqr = dfq3 - dfq1

# Outlier fences: 1.5 * IQR beyond each quartile.
iqrMax = dfq3 + (1.5 * dfiqr)
iqrMin = dfq1 - (1.5 * dfiqr)

print(dfq1, dfq3, dfiqr, iqrMin, iqrMax, sep='\n')

In [None]:
# Keep only rows whose TAV lies strictly inside the IQR fences, then plot the
# remaining distribution. This reassigns df, which the cells below build on.
inside_fences = df['TAV'].gt(iqrMin) & df['TAV'].lt(iqrMax)
df = df[inside_fences]
sns.distplot(df['TAV'], fit = norm)

In [None]:
#df = pd.read_pickle('temp.pkl')
# Reproducible 75/25 split: sample the training rows with a fixed seed, use the
# remaining rows as the test set, then remove the Parcel column from both.
train = df.sample(frac=0.75, random_state=0)
test = df.drop(index=train.index)
train = train.drop(columns='Parcel')
test = test.drop(columns='Parcel')

In [None]:
# Summary statistics of TAV in the training split.
train['TAV'].describe()

In [None]:
# (rows, columns) of the filtered frame, then of the test and training splits.
for frame in (df, test, train):
    print(frame.shape)

In [None]:
# Histogram of the training TAV with a fitted normal curve overlaid.
sns.distplot(train['TAV'], fit = norm);

# Parameters of the normal distribution fitted to the training TAV.
(mu, sigma) = norm.fit(train['TAV'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: '\m' and '\s' are not valid escape sequences, so a plain literal
# triggers an invalid-escape warning. The text passed to the legend is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('TAV distribution')

# Probability plot of TAV against the normal distribution, on its own figure.
fig = plt.figure()
res = stats.probplot(train['TAV'], plot=plt)
plt.show()

print("Skewness: %f" % train['TAV'].skew())
print("Kurtosis: %f" % train['TAV'].kurt())

Skewness and kurtosis values between -2 and +2 are considered acceptable evidence of a normal univariate distribution (George & Mallery, 2010).

Reference: George, D., & Mallery, M. (2010). SPSS for Windows Step by Step: A Simple Guide and Reference, 17.0 update (10a ed.). Boston: Pearson.

In [None]:
# Column names of the training split, grouped by dtype:
# object, then category, then int64/float64.
for dtype_group in (['object'], ['category'], ['int64', 'float64']):
    print(train.select_dtypes(include=dtype_group).columns)

In [None]:
# Count the category-dtype columns and the int64/float64 columns of the training split.
# Columns of any other dtype are not included in either count.
cat = train.select_dtypes(include=['category']).shape[1]
num = train.select_dtypes(include=['int64', 'float64']).shape[1]
print('Total Features: ', cat, 'categorical', '+', num, 'numerical', '=', cat + num, 'features')

In [None]:
# Correlation matrix heatmap.
# Only numeric and boolean columns are correlated: pandas 2.x no longer drops
# non-numeric columns automatically and raises when corr() meets the string
# or datetime columns still present in the training split.
corrmat = train.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Colour scale is capped at 0.8.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Annotated heatmap of the k rows of the correlation matrix with the largest
# values in the TAV column.
k = 10  # number of variables shown in the heatmap
cols = corrmat.nlargest(k, 'TAV').index
cm = np.corrcoef(train[cols].to_numpy(), rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

In [None]:
# One-column table listing the column names selected for the top-k heatmap.
most_corr = pd.DataFrame({'Most Correlated Features': cols})
most_corr

In [None]:
# TAV in the training split, one box per distinct value of
# 'Sales And Related Occupations'.
var = 'Sales And Related Occupations'
data = train[['TAV', var]]
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(data=data, x=var, y="TAV", ax=ax)
# Fixed y-range from 0 to 800000.
fig.set_ylim(0, 800000);

In [None]:
# Total_Sq_Ft against TAV in the training split, with a regression fit.
sns.jointplot(data=train, x='Total_Sq_Ft', y='TAV', kind='reg')

In [None]:
# TAV in the training split, one box per Year.
var = 'Year'
data = train[['TAV', var]]
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(data=data, x=var, y="TAV", ax=ax)
# Fixed y-range from 0 to 800000; x tick labels are rotated to stay readable.
fig.set_ylim(0, 800000);
plt.xticks(rotation=90);

In [None]:
# TAV per Year for the 'CENTRAL BOISE BENCH' neighborhood only.
# This uses the filtered df rather than the training split.
tempDF = df[df['Neighborhood'] == 'CENTRAL BOISE BENCH']
var = 'Year'
data = tempDF[['TAV', var]]
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(data=data, x=var, y="TAV", ax=ax)
# Fixed y-range from 0 to 800000; x tick labels are rotated to stay readable.
fig.set_ylim(0, 800000);
plt.xticks(rotation=90);