<h1> CSCI 567 - Machine Learning - Spring 2021 </h1>
<h2> Project: Chekcpoint 1 - Data Preprocessing (03/10/21) </h2>
<h3> Team <u>StochasticResults</u>: </h3>
 - Abel Salinas [8793999216] [abelsali@usc.edu] <br>
 - Angel Nieto [2211798052] [nietogar@usc.edu] <br>
 - Misael Morales [5832732058] [misaelmo@usc.edu]

***

The "Housing Price" dataset consists of 79 predictors for the house prices in Ames, Iowa. The training and testing set are already pre-split for us from the Kaggle version (https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview), and we will focus on different regression techniques to predict the housing SalePrice based on the given features.

This notebook focuses on the data pre-processing, wrangling, visualization, and statistical analysis. It is a crucial step in any machine-learning/data-analytics application to ensure proper data formatting in order to optimize the techniques implemented. 

# Table of Contents:
1. Load required packages <br>
2. Data wrangling <br>
3. Data visualization <br>
4. Exploratory data analysis

***

# 1. Load required libraries

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
# Basic data management packages
import os
import numpy as np
import pandas as pd

# Visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
import pandas.plotting as pd_plot
%matplotlib inline

In [7]:
# Exploratory Data Analysis packages
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA  
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import LocalOutlierFactor

In [8]:
# Regression and Modeling packages
import tensorflow as tf
import keras
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score

# Verify GPU compatibility
print("Tensorflow Version:", tf.__version__)
print("Tensorflow built with CUDA?", tf.test.is_built_with_cuda())
print(tf.config.list_physical_devices('CPU'))
print(tf.config.list_physical_devices('GPU'))
print("Num GPU Available:", len(tf.config.list_physical_devices('GPU')))

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Preprocessing choice
# preprocessing_method = 'HousePreprocessor'
preprocessing_method = 'Mappings'

***

In [None]:
# Read CSV files for Train/Test datasets
train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_df  = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
print('Train Shape: {} | types: {} \nTest Shape:  {} | types: {}'.format(train_df.shape, pd.unique(train_df.dtypes), 
                                                                       test_df.shape, pd.unique(test_df.dtypes)))
print('Set difference train-vs-test: {}'.format(set(train_df.columns).difference(set(test_df.columns))))

In [None]:
x_train = train_df.iloc[:,1:-1]  #79 train features
y_train = train_df.iloc[:,-1]    #SalePrice training target
x_test  = test_df.iloc[:,1:]     #79 test features
print('x_train {} | y_train {} \nx_test  {}'.format(x_train.shape, y_train.shape, x_test.shape))

In [None]:
#preview the training data set
x_train.head()

In [None]:
#preview the testing data set
x_test.head()

In [None]:
numerical_feats = x_train.dtypes[x_train.dtypes != "object"].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = x_train.dtypes[x_train.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_feats))

In [None]:
total = x_train.isnull().sum().sort_values(ascending=False)
percent = (x_train.isnull().sum()/x_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
# Non-numeric variables that require attention
non_num_vars = x_train.dtypes[x_train.dtypes=='object'].index
print(non_num_vars)

In [None]:
# Replace object/string values with categorial values from the "data_description" file from Kaggle
mappings = dict(
MSZoning_mapping      = {'nan':0, 'A':1, 'C (all)':2, 'FV':3, 'I':4, 'RH':5, 'RL':6, 'RP':7, 'RM':8},
Street_mapping        = {'nan':0, 'Grvl':1, 'Pave':2},
Alley_mapping         = {'nan':0, 'Grvl':1, 'Pave':2, 'NaN':0},
LotShape_mapping      = {'nan':0, 'Reg':1, 'IR1':2, 'IR2':3, 'IR3':4 },
LandContour_mapping   = {'nan':0, 'Lvl':1, 'Bnk':2, 'HLS':3, 'Low':4},
Utilities_mapping     = {'nan':0, 'AllPub':1, 'NoSewr':2, 'NoSeWa':3, 'ELO':4},
LotConfig_mapping     = {'nan':0, 'Inside':1, 'Corner':2, 'CulDSac':3, 'FR2':4, 'FR3':5},
LandSlope_mapping     = {'nan':0, 'Gtl':1, 'Mod':2, 'Sev':3},
Neighborhood_mapping  = {'nan':0, 'Blmngtn':1, 'Blueste':2, 'BrDale':3, 'BrkSide':4, 'ClearCr':5, 'CollgCr':6, 'Crawfor':7,
                         'Edwards':8, 'Gilbert':9, 'IDOTRR':10, 'MeadowV':11, 'Mitchel':12, 'NAmes':13, 'NoRidge':14, 
                         'NPkVill':15, 'NridgHt':16, 'NWAmes':17, 'OldTown':18, 'SWISU':19, 'Sawyer':20, 'SawyerW':21, 
                         'Somerst':22, 'StoneBr':23, 'Timber':24, 'Veenker':25},
Condition1_mapping    = {'nan':0, 'Artery':1, 'Feedr':2, 'Norm':3, 'RRNn':4, 'RRAn':5, 'PosN':6, 'PosA':7, 'RRNe':8, 'RRAe':9},
Condition2_mapping    = {'nan':0, 'Artery':1, 'Feedr':2, 'Norm':3, 'RRNn':4, 'RRAn':5, 'PosN':6, 'PosA':7, 'RRNe':8, 'RRAe':9},
BldgType_mapping      = {'nan':0, '1Fam':1, '2fmCon':2, 'Duplex':3, 'Twnhs':4, 'TwnhsE':4, 'TwnhsI':5},
HouseStyle_mapping    = {'nan':0, '1Story':1, '1.5Fin':2, '1.5Unf':3, '2Story':4, '2.5Fin':5, '2.5Unf':6, 'SFoyer':7, 'SLvl':8},
RoofStyle_mapping     = {'nan':0, 'Flat':1, 'Gable':2, 'Gambrel':3, 'Hip':4, 'Mansard':5, 'Shed':6},
RoofMatl_mapping      = {'nan':0, 'ClyTile':1, 'CompShg':2, 'Membran':3, 'Metal':4, 
                         'Roll':5, 'Tar&Grv':6, 'WdShake':7, 'WdShngl':8},
Exterior1st_mapping   = {'nan':0, 'AsbShng':1, 'AsphShn':2, 'BrkComm':3, 'BrkFace':4, 'CBlock':5, 'CemntBd':6, 
                         'HdBoard':7, 'ImStucc':8, 'MetalSd':9, 'Other':10, 'Plywood':11, 'PreCast':12, 'Stone':13, 
                         'Stucco':14, 'VinylSd':15,'Wd Sdng':16, 'WdShing':17},
Exterior2nd_mapping   = {'nan':0, 'AsbShng':1, 'AsphShn':2, 'Brk Cmn':3, 'BrkFace':4, 'CBlock':5, 'CmentBd':6, 
                         'HdBoard':7, 'ImStucc':8, 'MetalSd':9, 'Other':10, 'Plywood':11, 'PreCast':12, 'Stone':13, 
                         'Stucco':14, 'VinylSd':15, 'Wd Shng':16, 'Wd Sdng':16, 'WdShing':17},
MasVnrType_mapping    = {'nan':0, 'BrkCmn':1, 'BrkFace':2, 'CBlock':3, 'None':4, 'Stone':5},
ExterQual_mapping     = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5},
ExterCond_mapping     = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5},
Foundation_mapping    = {'nan':0, 'BrkTil':1, 'CBlock':2, 'PConc':3, 'Slab':4, 'Stone':5, 'Wood':6},
BsmtQual_mapping      = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5, 'NA':0},
BsmtCond_mapping      = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5, 'NA':0},
BsmtExposure_mapping  = {'nan':0, 'Gd':1, 'Av':2, 'Mn':3, 'No':4, 'NA':5},
BsmtFinType1_mapping  = {'nan':0, 'GLQ':1, 'ALQ':2, 'BLQ':3, 'Rec':4, 'LwQ':5, 'Unf':6, 'NA':0},
BsmtFinType2_mapping  = {'nan':0, 'GLQ':1, 'ALQ':2, 'BLQ':3, 'Rec':4, 'LwQ':5, 'Unf':6, 'NA':0},
Heating_mapping       = {'nan':0, 'Floor':1, 'GasA':2, 'GasW':3, 'Grav':4, 'OthW':5, 'Wall':6},
HeatingQC_mapping     = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5},
CentralAir_mapping    = {'nan':0, 'N':1, 'Y':2},
Electrical_mapping    = {'nan':0, 'SBrkr':1, 'FuseA':2, 'FuseF':3, 'FuseP':4, 'Mix':5},
KitchenQual_mapping   = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5},
Functional_mapping    = {'nan':0, 'Typ':1, 'Min1':2, 'Min2':3, 'Mod':4, 'Maj1':5, 'Maj2':6, 'Sev':7, 'Sal':8},
FireplaceQu_mapping   = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5, 'NA':0},
GarageType_mapping    = {'nan':0, '2Types':1, 'Attchd':2, 'Basment':3, 'BuiltIn':4, 'CarPort':5, 'Detchd':6, 'NA':0},
GarageFinish_mapping  = {'nan':0, 'Fin':1, 'RFn':2, 'Unf':3, 'NA':0},
GarageQual_mapping    = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5, 'NA':0},
GarageCond_mapping    = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'Po':5, 'NA':0},
PavedDrive_mapping    = {'nan':0, 'Y':1, 'P':2, 'N':3},
PoolQC_mapping        = {'nan':0, 'Ex':1, 'Gd':2, 'TA':3, 'Fa':4, 'NA':0},
Fence_mapping         = {'nan':0, 'GdPrv':1, 'MnPrv':2, 'GdWo':3, 'MnWw':4, 'NA':0},
MiscFeature_mapping   = {'nan':0, 'Elev':1, 'Gar2':2, 'Othr':3, 'Shed':4, 'TenC':5, 'NA':0},
SaleType_mapping      = {'nan':0, 'WD':1, 'CWD':2, 'VWD':3, 'New':4, 'COD':5, 'Con':6, 'ConLw':7, 'ConLI':8,
                         'ConLD':9, 'Oth':10},
SaleCondition_mapping = {'nan':0, 'Normal':1, 'Abnorml':2, 'AdjLand':3, 'Alloca':4, 'Family':5, 'Partial':6})

In [None]:
x_train = x_train.replace({list(non_num_vars)[k] : list(mappings.values())[k] 
                           for k in np.arange(len(list(non_num_vars)))}).fillna(0)
x_test  = x_test.replace({list(non_num_vars)[k] : list(mappings.values())[k] 
                           for k in np.arange(len(list(non_num_vars)))}).fillna(0)

print('x_train {} | y_train {} \nx_test  {}'.format(x_train.shape, y_train.shape, x_test.shape))

In [None]:
numerical_feats = x_train.dtypes[x_train.dtypes != "object"].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = x_train.dtypes[x_train.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_feats))

***

In [None]:
corr_mat = np.corrcoef(x_train.join(y_train),rowvar = False)
corr_vec = corr_mat[:-1,-1]   #correlation of features to target only
print(corr_mat.round(2))            

In [None]:
#View features that have a +/- correlation greater than the user-specified corrrelation bound (corr_bound)
corr_bound = 0.6
corr_dat = (x_train.iloc[:, corr_vec<-corr_bound]).join(x_train.iloc[:, corr_vec>corr_bound])
corr_dat

In [None]:
#full training set basic statistics
x_train.describe()

In [None]:
#main features (based on correlation coefficient) basic statistics
corr_dat.describe()

In [None]:
corr_dat.join(y_train).corr()['SalePrice']

In [None]:
target = np.log10(y_train)
print('       SalePrice: Mean {:.3e} | Std. Dev {:.3e}'.format(y_train.mean(), y_train.std()))
print('Log10(SalePrice): Mean {:.3f}     | Std. Dev {:.3f}'.format(target.mean(), target.std()))

In [None]:
target_name = 'SalePrice'

corr_abs = train_df.corr().abs()
ser_corr = corr_abs.nlargest(len(numerical_feats), target_name)[target_name]

cols_abv_corr_limit = list(ser_corr[ser_corr.values >= corr_bound].index)
cols_bel_corr_limit = list(ser_corr[ser_corr.values < corr_bound].index)

nr_feats = len(cols_abv_corr_limit)

In [None]:
scaler = StandardScaler()
x_train_s = scaler.fit_transform(corr_dat)

In [None]:
n_components = 4
pca = PCA(n_components=n_components)
pca.fit(x_train_s)

#variance explained by each PC
print('PC Variance Explained: {}'.format(pca.explained_variance_ratio_))
print('Cumulative Variance Explained: {}'.format(np.sum(pca.explained_variance_ratio_).round(4)))

In [None]:
x_trans = pca.transform(x_train_s)  

In [None]:
n_clusters = 3
kmeans = KMeans(n_clusters = n_clusters).fit(X=x_train_s, y=target).labels_
hclust = AgglomerativeClustering(n_clusters = n_clusters).fit(X=x_train_s, y=target).labels_
gmm    = GaussianMixture(n_components=n_clusters).fit_predict(X=x_train_s, y=target)

In [None]:
df_all = pd.concat([x_train, x_test])

len_train = x_train.shape[0]

In [None]:
# StandardScaler from Scikit-Learn
from sklearn.preprocessing import StandardScaler

# Initialize instance of StandardScaler
scaler = StandardScaler()

# Fit and transform item_data
data_scaled = scaler.fit_transform(df_all)

# Display first 5 rows of item_data_scaled
data_scaled[:5]

In [None]:
pca = PCA(n_components=n_components)
pca.fit(data_scaled)

PC_items = pca.transform(data_scaled)

In [None]:
# Put PC_items into a dataframe
items_pca = pd.DataFrame(PC_items)

# Name the columns
items_pca.columns = ['PC{}'.format(i + 1) for i in range(PC_items.shape[1])]

# Update its index
items_pca.index = df_all.index

# Display first 5 rows
items_pca.head()

In [None]:
# from sklearn.linear_model import LinearRegression

# lr = LinearRegression()
# lr.fit(x_train, y_train)
# pred = lr.predict(x_test)
# pred_train = lr.predict(x_train)
# r2 = r2_score(y_train,pred_train)
# rmse = np.sqrt(mean_squared_error(y_train,pred_train))

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
print( 'R^2:', r2_score(y_train, pred_train ))
print( 'MAE:', mean_absolute_error(y_train, pred_train))

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# rf = RandomForestRegressor() #default features
# rf_fit = rf.fit(x_train, y_train)
# pred = rf_fit.predict(x_test)
# pred_train = rf.predict(x_train)
# r2 = r2_score(y_train,pred_train)
# rmse = np.sqrt(mean_squared_error(y_train,pred_train))
# print( 'R^2:', r2_score(y_train, pred_train ))
# print( 'MAE:', mean_absolute_error(y_train, pred_train))

In [None]:
# import xgboost
# reg = xgboost.XGBRegressor().fit(x_train, y_train)
# pred = reg.predict(x_test)
# pred_train = reg.predict(x_train)
# r2 = r2_score(y_train,pred_train)
# rmse = np.sqrt(mean_squared_error(y_train,pred_train))
# print( 'R^2:', r2_score(y_train, pred_train ))
# print( 'MAE:', mean_absolute_error(y_train, pred_train))

In [None]:
x_train

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf") 
error_going_up = 0
size=len(x_train)
x_val, y_val = x_train[size-400:], y_train[size-400:]
x_train_new, y_train_new = x_train[:-400], y_train[:-400]

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators 
    gbrt.fit(x_train_new, y_train_new)
    y_pred = gbrt.predict(x_val)
    val_error = mean_squared_error(y_val, y_pred) 
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping
            
pred = gbrt.predict(x_test)
pred_train = gbrt.predict(x_train)
r2 = r2_score(y_train,pred_train)
rmse = np.sqrt(mean_squared_error(y_train,pred_train))
print( 'R^2:', r2_score(y_train, pred_train ))
print( 'MAE:', mean_absolute_error(y_train, pred_train))

In [None]:
id_test = test_df['Id']

pred_pd = pd.DataFrame()
pred_pd['Id'] = id_test
pred_pd['SalePrice'] = pred

pred_pd.head
pred_pd.to_csv('submission_StochasticResults_RF.csv',index=False)

***

***

# End of Notebook