In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

# Read the Kaggle train / test CSVs from the working directory.
my_data, my_test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# Preview the first rows of the training data.
my_data.head()

In [None]:
# Column dtypes and non-null counts of the training data.
my_data.info()

In [None]:
# Missing-value count per column; only columns with at least one gap are shown
nullseries = my_data.isna().sum()
print(nullseries.loc[nullseries.gt(0)])

In [None]:
# Per the Kaggle data description, NA in these columns means "feature absent"
# (no pool, fence, basement, garage, fireplace, masonry veneer, ...), so it is
# recorded as the explicit category 'None'.
# NOTE: MasVnrArea and GarageYrBlt are numeric, but they receive the same
# 'None' marker here because later cells (age, N2zero, NoneExpeller) look for it.

cols = ["MiscFeature", "PoolQC", "Fence", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", 
        "BsmtFinType2", "MasVnrType", "MasVnrArea", "Alley",
        "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"]
for col in cols:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `frame[col]`: the in-place form acts on a temporary Series and is not
    # guaranteed to reach the DataFrame (it is a no-op under Copy-on-Write).
    # astype(object) makes the float -> mixed-type conversion explicit.
    my_data[col] = my_data[col].astype(object).fillna("None")
    my_test_data[col] = my_test_data[col].astype(object).fillna("None")

In [None]:
# See how the null values have improved
nullseries = my_data.isna().sum()
print(nullseries.loc[nullseries.gt(0)])

In [None]:
# Re-inspect dtypes and non-null counts now that the 'None' fill has run.
my_data.info()

# All the variables that are not numeric - "Neighborhood", "MSZoning", "Street", "LotShape", 
#  "LandContour", "Utilities", "LotConfig", "LandSlope", "Condition1", 
#  "Condition2", "BldgType", "HouseStyle", "Exterior1st", "RoofMatl",
#  "KitchenQual", "Functional", "FireplaceQu", "GarageType",
#  "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond",
#  "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", 
#  "BsmtFinType2", "Heating", "HeatingQC", "CentralAir", "Electrical",
#  "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", "PoolQC", 
#  "Fence", "MiscFeature", "SaleType", "SaleCondition"

In [None]:
# Independent copy of the training frame, reserved for data visualisations.
my_data_exp = my_data.copy()

In [None]:
# Scatter of LotFrontage per BldgType (plotting pattern copied from the Hands-On ML book)
import matplotlib.pyplot as plt
my_data.plot(kind="scatter", x="BldgType", y="LotFrontage", alpha=0.1)

# After doing some correlation analysis on the data, I found that houses with
# the same BldgType tend to have a similar LotFrontage size

In [None]:
# First, fill the missing values of LotFrontage with the mean of LotFrontage,
# so we can check whether the model performs better when they are filled using BldgType instead

In [None]:
from sklearn.model_selection import train_test_split

# Features / target. NOTE: `X` is not what gets split below -- the full frame
# (SalePrice included) is split, because later cells read SalePrice straight
# from X_train / X_test. SalePrice must be dropped before fitting a real model
# on these frames, otherwise the target leaks into the features.
X = my_data.drop('SalePrice', axis=1)
y = my_data['SalePrice']

# Fixed seed: without it every run produces a different split, so none of the
# scores computed further down can be reproduced or compared.
X_train, X_test, y_train, y_test = train_test_split(my_data, y, test_size=0.1, random_state=42)

In [None]:
# Get the error of baseline model

from sklearn.metrics import mean_absolute_error

# Baseline: always predict the mean sale price. The mean comes from the
# training targets only, so the held-out rows do not leak into the baseline.
average_saleprice = y_train.mean()
baseline_test_predictions = [average_saleprice]*len(X_test)

# scikit-learn's signature is (y_true, y_pred); MAE is symmetric, but the
# conventional order avoids surprises if the metric is swapped later.
mean_absolute_error(y_test, baseline_test_predictions)

In [None]:
# Get the first implementation of Random Forest with only LotFrontage as the predictor
# Compare performance of the model when missing values were only replaced by mean

LF_mean_train = X_train["LotFrontage"].copy()
LF_mean_test = X_test["LotFrontage"].copy()

# Impute BOTH splits with the training mean: filling the test split with its
# own mean would use information from the held-out rows.
LF_train_mean = LF_mean_train.mean()
LF_mean_train = LF_mean_train.fillna(LF_train_mean)
LF_mean_test = LF_mean_test.fillna(LF_train_mean)

# SalePrice is continuous, so this is a regression task; a classifier would
# treat every distinct price as its own class. The variable keeps the name
# `rfc` because a later cell re-fits the same object.
from sklearn.ensemble import RandomForestRegressor
rfc = RandomForestRegressor(random_state=42)

rfc.fit(LF_mean_train.to_numpy().reshape(-1, 1), y_train)
rfc_predictions1 = rfc.predict(LF_mean_test.to_numpy().reshape(-1, 1))

mean_absolute_error(y_test, rfc_predictions1)

In [None]:
# Get the mean LotFrontage per bldg type
# Use that to populate missing Lot Frontage values based on their building type

def FrontageFromBldgType(data):
    """Fill missing ``LotFrontage`` with the mean ``LotFrontage`` of the row's ``BldgType``.

    The per-type mean is stored in a ``Pred_LF`` column (inserted at position 2
    the first time, overwritten on later calls) and then used to fill the gaps
    in ``LotFrontage``. ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``BldgType`` and a numeric ``LotFrontage``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.

    Notes
    -----
    Any building type present in ``data`` is handled, not only the five types
    known from the training file. A type whose ``LotFrontage`` values are all
    missing has no mean, so its rows stay missing.
    """
    # Group on the whitespace-trimmed label; one grouped mean replaces a
    # full-frame query per row.
    bldg_type = data["BldgType"].astype(str).str.strip()
    pred_lf = data["LotFrontage"].groupby(bldg_type).transform("mean")

    # insert() raises if the column already exists, which used to make a second
    # call (or a cell re-run) fail.
    if "Pred_LF" in data.columns:
        data["Pred_LF"] = pred_lf
    else:
        data.insert(min(2, data.shape[1]), "Pred_LF", pred_lf)

    # Assign back rather than fillna(inplace=True) on a column selection.
    data["LotFrontage"] = data["LotFrontage"].fillna(data["Pred_LF"])
    return data

In [None]:
# 2nd implem of RFC, with LotFrontage NA filled from BldgType

# Work on copies so X_train / X_test are not modified by this experiment.
LF_BldgType_train = X_train.copy()
LF_BldgType_test = X_test.copy()

FrontageFromBldgType(LF_BldgType_train) # NOTE(review): author's marker "the error starts here" -- confirm whether it still reproduces
FrontageFromBldgType(LF_BldgType_test)

In [None]:
# Keep the imputed frames intact and pull the feature into separate names:
# rebinding LF_BldgType_train to its own column made this cell fail with a
# KeyError when it was run a second time.
LF_BldgType_train_feature = LF_BldgType_train["LotFrontage"]
LF_BldgType_test_feature = LF_BldgType_test["LotFrontage"]

rfc.fit(LF_BldgType_train_feature.to_numpy().reshape(-1, 1), y_train)
rfc_predictions2 = rfc.predict(LF_BldgType_test_feature.to_numpy().reshape(-1, 1))

# (y_true, y_pred) is the documented argument order.
mean_absolute_error(y_test, rfc_predictions2)

# This investigation has confirmed that LotFrontage becomes a better predictor
# When the missing values are filled using BldgType

In [None]:
# For now, we just use BldgType first
    # We can change this in the future, I can test again the Random Forest performance
    # including the other variables

# Applies the fill to the real splits; the function modifies them in place.
FrontageFromBldgType(X_train)
FrontageFromBldgType(X_test)

In [None]:
nullseries = X_train.isna().sum()
print(nullseries.loc[nullseries.gt(0)])

# Only one missing value should remain: a single row of Electrical

In [None]:
# Removes every row that still has a missing value (per the check above, that
# is the row with no Electrical entry in the training split).
# Rebinding is used instead of dropna(inplace=True): X_train / X_test come out
# of train_test_split, and mutating them in place can raise SettingWithCopy
# warnings. The targets are re-indexed so they stay aligned with the kept rows.
X_train = X_train.dropna()
X_test = X_test.dropna()
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

In [None]:
# Working copies, one set per purpose:
#   my_data_exp             -> data visualizations
#   X_train_exp, X_test_exp -> experimental encoding
#   X_train_OH, X_test_OH   -> one hot encoding
X_train_exp, X_test_exp = X_train.copy(), X_test.copy()
X_train_OH, X_test_OH = X_train.copy(), X_test.copy()

In [None]:
# Stack the cleaned train and test splits back into one frame with a fresh 0..n-1 index.
data_explore = pd.concat([X_train, X_test], ignore_index=True)

In [None]:
# Persist the recombined frame for exploration outside the notebook.
data_explore.to_csv('data_explore.csv')

## **START OF CATEGORICAL ENCODING**

In [None]:
# For features with these categories: None, Po, Fa, TA, Gd, Ex

def categ_seq1(column):
    """Ordinal-encode a quality column in place (None < Po < Fa < TA < Gd < Ex).

    Labels are matched case-insensitively after trimming whitespace and are
    replaced by their position on that scale (0-5).

    Parameters
    ----------
    column : pandas.Series
        Quality labels; the Series is overwritten with integer codes.

    Returns
    -------
    pandas.Series
        The same ``column`` object, so callers may also write
        ``df[name] = categ_seq1(df[name])``.

    Raises
    ------
    ValueError
        If a value is not one of the six labels (missing values included).
        The check runs before anything is written, so a failed call leaves
        ``column`` untouched.
    """
    Types = ['none', 'po', 'fa', 'ta', 'gd', 'ex']
    rank = {label: position for position, label in enumerate(Types)}

    normalized = column.astype(str).str.strip().str.lower()
    codes = normalized.map(rank)
    unknown = sorted(set(normalized[codes.isna()]))
    if unknown:
        raise ValueError(f"categ_seq1: unexpected labels {unknown}; expected one of {Types}")

    # One whole-column write instead of a Python loop of per-row .iloc writes.
    column.iloc[:] = codes.astype(int).tolist()
    return column

In [None]:
# Encode every None/Po/Fa/TA/Gd/Ex quality column of all four working frames.
quality_columns = ["BsmtQual", "BsmtCond", "FireplaceQu", "GarageQual", "GarageCond",
                   "PoolQC", "ExterQual", "ExterCond", "HeatingQC", "KitchenQual"]

for frame in (X_train_exp, X_test_exp, X_train_OH, X_test_OH):
    for quality_col in quality_columns:
        categ_seq1(frame[quality_col])

In [None]:
def categ_seq2(Types, column):
    """Ordinal-encode ``column`` in place using the label order given by ``Types``.

    Values are matched against ``Types`` after trimming whitespace and
    lower-casing, and replaced by the index of the matching entry.

    Parameters
    ----------
    Types : list of str
        Lower-case labels, ordered from lowest to highest code.
    column : pandas.Series
        Labels to encode; the Series is overwritten with integer codes.

    Returns
    -------
    pandas.Series
        The same ``column`` object.

    Raises
    ------
    ValueError
        If a value is not listed in ``Types`` (missing values included). The
        check runs before anything is written, so ``column`` is left untouched.
    """
    rank = {}
    for position, label in enumerate(Types):
        # First occurrence wins, matching list.index() semantics.
        rank.setdefault(label, position)

    normalized = column.astype(str).str.strip().str.lower()
    codes = normalized.map(rank)
    unknown = sorted(set(normalized[codes.isna()]))
    if unknown:
        raise ValueError(f"categ_seq2: unexpected labels {unknown}; expected one of {list(Types)}")

    # One whole-column write instead of a Python loop of per-row .iloc writes.
    column.iloc[:] = codes.astype(int).tolist()
    return column

# function calls: basement finish rating, worst to best
Bsmt_arr = ['none', 'unf', 'lwq', 'rec', 'blq', 'alq', 'glq']

for frame_pair in ((X_train_exp, X_test_exp), (X_train_OH, X_test_OH)):
    for bsmt_col in ("BsmtFinType1", "BsmtFinType2"):
        for frame in frame_pair:
            categ_seq2(Bsmt_arr, frame[bsmt_col])

In [None]:
# Ordinal scales (lowest code first) for features encoded in all four frames.

Street_arr = ['none', 'grvl', 'pave']                                   # Street: Grvl, Pave
LotShape_arr = ['ir3', 'ir2', 'ir1', 'reg']                             # LotShape: IR3, IR2, IR1, Reg
Utilities_arr = ['elo', 'nosewa', 'nosewr', 'allpub']                   # Utilities: ELO, NoSeWa, NoSewr, AllPub
BsmtExp_arr = ['none', 'no', 'mn', 'av', 'gd']                          # BsmtExposure: None, No, Mn, Av, Gd
Alley_arr = ['none', 'grvl', 'pave']                                    # Alley: None, Grvl, Pave

# NOTE!!!!
# Remember to put 'None' to the null values of all the features with 'None category'
# because initially, i only filled 'None' to those with missing values in the train set

CentralAir_arr = ['n', 'y']                                             # CentralAir: N, Y
# Functional: Sal, Sev, Maj2, Maj1, Mod, Min2, Min1, Typ
Functional_arr = ['sal', 'sev', 'maj2', 'maj1', 'mod', 'min2', 'min1', 'typ']
# GarageType: None, CarPort, Detchd, BuiltIn, Basment, Attchd, 2Types
GarageType_arr = ['none', 'carport', 'detchd', 'builtin', 'basment', 'attchd', '2types']
GarageFinish_arr = ['none', 'unf', 'rfn', 'fin']                        # GarageFinish: None, Unf, RFn, Fin
PavedDrive_arr = ['n', 'p', 'y']                                        # PavedDrive: N, P, Y
Fence_arr = ['none', 'mnww', 'gdwo', 'mnprv', 'gdprv']                  # Fence: None, MnWw, GdWo, MnPrv, GdPrv

ordinal_specs_all_frames = [
    ("Street", Street_arr),
    ("LotShape", LotShape_arr),
    ("Utilities", Utilities_arr),
    ("BsmtExposure", BsmtExp_arr),
    ("Alley", Alley_arr),
    ("CentralAir", CentralAir_arr),
    ("Functional", Functional_arr),
    ("GarageType", GarageType_arr),
    ("GarageFinish", GarageFinish_arr),
    ("PavedDrive", PavedDrive_arr),
    ("Fence", Fence_arr),
]

# Same call order as writing the calls out by hand: feature by feature, and
# for each feature train_exp, test_exp, train_OH, test_OH.
for feature_name, label_order in ordinal_specs_all_frames:
    for frame in (X_train_exp, X_test_exp, X_train_OH, X_test_OH):
        categ_seq2(label_order, frame[feature_name])

#----------------------------------- unsure: made via investigation

def LotConfig_exp(data):
    """Replace ``data['LotConfig']`` with a cul-de-sac flag (1 = CulDSac, 0 = anything else).

    Matching ignores case and surrounding whitespace. ``data`` is modified in
    place and nothing is returned. Missing values are encoded as 0. A column
    that is already numeric is left alone, so running the function again does
    not change the result.
    """
    if pd.api.types.is_numeric_dtype(data['LotConfig']):
        return

    labels = data['LotConfig'].astype(str).str.strip().str.lower()
    # Assign the whole column through the DataFrame: writing through
    # data['LotConfig'].iloc[n] is chained assignment and may never reach `data`.
    data['LotConfig'] = (labels == 'culdsac').astype(int)
    
# Only the experimental-encoding frames get the cul-de-sac flag.
LotConfig_exp(X_train_exp)
LotConfig_exp(X_test_exp)

def BldgType_exp(data):
    """Replace ``data['BldgType']`` labels with hand-picked scores, in place.

    Scores (case-insensitive, whitespace ignored):
    ``2fmCon`` / ``Duplx`` / ``Duplex`` / ``TwnhsI`` / ``Twnhs`` -> 0,
    ``TwnhsE`` -> 1, ``1Fam`` -> 1.5.
    Values that match none of these labels (including already-encoded numbers)
    are left unchanged. Nothing is returned.
    """
    scores = {
        '2fmcon': 0, 'duplx': 0, 'twnhsi': 0, 'twnhs': 0, 'duplex': 0,
        'twnhse': 1,
        '1fam': 1.5,
    }
    current = data['BldgType']
    labels = current.map(lambda value: value.strip().lower() if isinstance(value, str) else value)
    encoded = labels.map(scores)

    unmapped = encoded.isna()
    if unmapped.any():
        # Keep whatever was there for values without a score.
        encoded = encoded.astype(object)
        encoded[unmapped] = current[unmapped]

    # Column-level assignment on the frame; per-row writes through
    # data['BldgType'].iloc[n] are chained assignment and may not reach `data`.
    data['BldgType'] = encoded

# Only the experimental-encoding frames get the BldgType scores.
BldgType_exp(X_train_exp)
BldgType_exp(X_test_exp)

# Label orders for features encoded only in the experimental frames.

# Condition1 & 2: PosA, PosN, Norm, RRNn, RRNe, RRAn, RRAe, Feedr, Artery
Cond_arr = ['posa', 'posn', 'norm', 'rrnn', 'rrne', 'rran', 'rrae', 'feedr', 'artery']
# LandSlope: Gtl, Mod, Sev
LandS_arr = ['gtl', 'mod', 'sev']
# HouseStyle: 1Story, 1.5Unf, 1.5Fin, 2Story, SFoyer, 2.5Unf, 2.5Fin, SLvl 
HouseS_arr = ['1story', '1.5unf', '1.5fin', '2story', 'sfoyer', '2.5unf', '2.5fin', 'slvl']
# RoofStyle: shed, flat, gable, hip, gambrel, mansard
RoofS_arr = ['shed', 'flat', 'gable', 'hip', 'gambrel', 'mansard']
# RoofMatl: Roll, CompShg, Tar&Grv, Membran, WdShngl, Metal, WdShake, Clytile
RoofM_arr = ['roll', 'compshg', 'tar&grv', 'membran', 'wdshngl', 'metal', 'wdshake', 'clytile']
# MasVnrType: None, CBlock, BrkCmn, BrkFace, Stone
MVT_arr = ['none', 'cblock', 'brkcmn', 'brkface', 'stone']
# Foundation: Slab, BrkTil, CBlock, Wood, Stone, PConc
Found_arr = ['slab', 'brktil', 'cblock', 'wood', 'stone', 'pconc']
# Electrical: Mix, FuseP, FuseF, FuseA, SBrkr
Elec_arr = ['mix', 'fusep', 'fusef', 'fusea', 'sbrkr']
# MiscFeature: None, Othr, Gar2, Shed, Elev, TenC
MiscF_arr = ['none', 'othr', 'gar2', 'shed', 'elev', 'tenc']

ordinal_specs_exp_frames = [
    ("Condition1", Cond_arr),
    ("Condition2", Cond_arr),
    ("LandSlope", LandS_arr),
    ('HouseStyle', HouseS_arr),
    ('RoofStyle', RoofS_arr),
    ('RoofMatl', RoofM_arr),
    ('MasVnrType', MVT_arr),
    ('Foundation', Found_arr),
    ('Electrical', Elec_arr),
    ('MiscFeature', MiscF_arr),
]

for feature_name, label_order in ordinal_specs_exp_frames:
    for frame in (X_train_exp, X_test_exp):
        categ_seq2(label_order, frame[feature_name])

In [None]:
# for features with unknown category order
# Create a function that automates ordinal encoding for categorical variables
# putting the highest index on one with the greatest mean for the target variable

import operator
from collections import OrderedDict

def categ_sort(data, column):
    """Encode ``column`` in place by ranking its categories on mean ``SalePrice``.

    The category with the lowest mean ``SalePrice`` gets code 0 and the one
    with the highest mean gets the largest code. String categories are
    compared after trimming whitespace and lower-casing; non-string categories
    (e.g. years) are used as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``SalePrice`` column, row-aligned with ``column``.
    column : pandas.Series
        Categorical values to encode; overwritten with integer ranks. Missing
        values cannot be ranked and are left unchanged.

    Returns
    -------
    list
        The normalised categories in ascending order of mean ``SalePrice``
        (position in the list == assigned code). The list is also printed.
    """
    def _normalize(value):
        return value.strip().lower() if isinstance(value, str) else value

    # The same normalisation is used for building the ranking and for looking
    # values up; lower-casing the ranking keys without stripping them made
    # values with surrounding whitespace impossible to find.
    keys = column.map(_normalize)

    # sort=False keeps groups in order of first appearance, so the stable sort
    # below breaks ties between equal means the same way on every run.
    mean_price = data["SalePrice"].groupby(keys, sort=False).mean()
    sorted_list = mean_price.sort_values(kind="mergesort").index.tolist()
    print(sorted_list)

    rank = {category: position for position, category in enumerate(sorted_list)}
    codes = keys.map(rank)
    ranked = codes.notna()
    # Single masked write instead of one .iloc assignment per row.
    column.iloc[ranked.to_numpy()] = codes[ranked].astype(int).tolist()
    return sorted_list

In [None]:
# Features without a natural order: rank their categories by mean SalePrice.
target_ordered_columns = ['SaleCondition', 'Exterior1st', 'Exterior2nd', 'MSZoning', 'LandContour',
                          'Heating', 'SaleType', 'Neighborhood', 'YrSold', 'MoSold']

# NOTE(review): each frame is ranked from its own SalePrice values, so the code
# given to a category in the test frame can differ from the training frame --
# confirm this is intended.
for frame in (X_train_exp, X_test_exp):
    for col_name in target_ordered_columns:
        categ_sort(frame, frame[col_name])

# For one-hot encoding: 
# Condition1 & 2, LandSlope, HouseStyle, RoofStyle, RoofMatl, MasVnrType, Foundation, Electrical, MiscFeature
# SaleCondition, Exterior1st, Exterior2nd, MSZoning, LandContour, Heating, SaleType, LotConfig, BldgType

In [None]:
# Columns that are one-hot encoded in the *_OH frames.
toOneHot = ['Condition1', 'Condition2', 'LandSlope', 'YrSold', 'MoSold',
            'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'Foundation',
            'Electrical', 'MiscFeature', 'SaleCondition', 'Exterior1st', 'Exterior2nd',
            'MSZoning', 'LandContour', 'Heating', 'SaleType', 'LotConfig', 'BldgType', 'Neighborhood']

# Independent copies holding only those columns, train first, then test.
train_OH_var_df, test_OH_var_df = (frame[toOneHot].copy() for frame in (X_train_OH, X_test_OH))

In [None]:
def name_concat(data):
    """Prefix every value with its column name, in place (``'Street'`` + ``'Pave'`` -> ``'StreetPave'``).

    Every column of ``data`` is converted to strings. Prefixing makes values
    from different columns distinguishable once they become one-hot column
    labels. Nothing is returned.
    """
    # Whole-column string concatenation. The previous loops used subscripts of
    # the containers being iterated as loop targets and wrote strings into
    # integer columns one element at a time.
    for col_name in list(data.columns):
        data[col_name] = str(col_name) + data[col_name].astype(str)

In [None]:
# Prefix the values of both one-hot input frames with their column names (done in place).
name_concat(train_OH_var_df)
name_concat(test_OH_var_df)

In [None]:
# One hot encoding

from sklearn.preprocessing import OneHotEncoder

# copied from: https://www.youtube.com/watch?v=InZ0n2knz1E

# Shared encoder instance; handle_unknown='ignore' is the setting passed for
# categories that were not seen during fit.
ohe = OneHotEncoder(handle_unknown='ignore')

# before one hot encoding, concatenate the column name to each of the rows

def OH_func(features_arr, label_arr):
    """Wrap a one-hot matrix in a DataFrame whose columns are the flattened category labels.

    Parameters
    ----------
    features_arr : array-like, shape (n_rows, n_labels)
        Encoded indicator matrix.
    label_arr : iterable of iterables
        One group of labels per encoded feature (e.g. an encoder's
        ``categories_``); NumPy arrays and plain lists are both accepted.

    Returns
    -------
    pandas.DataFrame
        ``features_arr`` with the labels of all groups, in order, as column names.
    """
    # Flatten group by group. The previous loops used elements of the lists
    # being iterated as loop targets and required ``.tolist()`` on each group.
    OH_labels = [label for group in label_arr for label in group]
    return pd.DataFrame(features_arr, columns = OH_labels)

In [None]:
# Both one-hot input frames (train and test) have the same 22 columns

In [None]:
# Fit the encoder on the training frame only; its categories define the output columns.
train_OH_features = ohe.fit_transform(train_OH_var_df).toarray() # one hot encodes the features
train_OH_labels = ohe.categories_ # the labels
train_OH_var_df = OH_func(train_OH_features, train_OH_labels)

# The test frame is transformed with the already-fitted encoder and labelled
# with the training categories, so both frames get the same columns.
test_OH_features = ohe.transform(test_OH_var_df).toarray() # one hot encodes the features
test_OH_var_df = OH_func(test_OH_features, train_OH_labels)

In [None]:
# Next Action: Drop the encoded features from X_train_OH and concat OH_features_df to it
def drop_concat(data1, data2, toDrop):
    """Return ``data1`` without the ``toDrop`` columns, joined column-wise with ``data2``.

    Rows are paired by position: the remaining ``data1`` columns take over the
    index of ``data2`` before concatenation, so both frames must have the same
    number of rows (a length mismatch raises ``ValueError``).

    Neither input is modified; callers use the returned frame.
    """
    # Non-inplace drop gives a new frame, so re-labelling its index below
    # leaves the caller's `data1` untouched.
    remaining = data1.drop(toDrop, axis=1)
    remaining.index = data2.index
    return pd.concat([remaining, data2], axis=1)

# Swap the raw categorical columns of the *_OH frames for their one-hot columns.
X_train_OH = drop_concat(X_train_OH, train_OH_var_df, toOneHot)
X_test_OH = drop_concat(X_test_OH, test_OH_var_df, toOneHot)

### Time Series Encoding

In [None]:
from datetime import date

def age(data, var, reference_year=None):
    """Turn the year column ``var`` into ages and remove the column from ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``var``; the column is dropped in place.
    var : str
        Name of the year column.
    reference_year : int, optional
        Year the ages are measured from. Defaults to the current calendar
        year; pass a fixed value to make the result reproducible.

    Returns
    -------
    list
        ``reference_year - value`` for every row, in row order. Values that do
        not support subtraction (e.g. the ``'None'`` marker) are passed
        through unchanged.
    """
    if reference_year is None:
        reference_year = date.today().year

    new_var = []
    for value in data[var]:
        try:
            new_var.append(reference_year - value)
        except TypeError:
            # Non-numeric marker such as 'None': keep it as is.
            new_var.append(value)
    data.drop([var], axis=1, inplace=True)
    return new_var

# (new age column, year column it replaces), applied to every working frame.
age_columns = (
    ('House_Age', 'YearBuilt'),
    ('Remodel_Age', 'YearRemodAdd'),
    ('Garage_Age', 'GarageYrBlt'),
)

for age_name, year_name in age_columns:
    for frame in (X_train_exp, X_test_exp, X_train_OH, X_test_OH):
        frame[age_name] = age(frame, year_name)

In [None]:
# Separate working copies holding only Garage_Age and SalePrice. These are the
# frames manipulated below to decide what should replace the 'None' values of
# Garage_Age in the train and test sets.
garage_age_cols = ['Garage_Age', 'SalePrice']

X_train_OH_NoNan = X_train_OH[garage_age_cols].copy()
X_test_OH_NoNan = X_test_OH[garage_age_cols].copy()
X_train_exp_NoNan = X_train_exp[garage_age_cols].copy()
X_test_exp_NoNan = X_test_exp[garage_age_cols].copy()

In [None]:
# This function converts the 'None' values to 1000 (a very high number);
# it does not sort the dataframe

def N2zero(data, var, fill_value=1000):
    """Replace the ``'None'`` markers in ``data[var]`` with ``fill_value``, in place.

    Despite the name, the default replacement is 1000, not zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``var``.
    var : str
        Column to clean.
    fill_value : scalar, default 1000
        Value written wherever the column holds the string ``'none'`` in any
        letter case. Non-string values are never replaced.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    is_none = data[var].map(lambda value: isinstance(value, str) and value.lower() == 'none').astype(bool)
    # One frame-level .loc write; data[var].iloc[g] = ... is chained
    # assignment and may not reach `data`.
    data.loc[is_none.to_numpy(), var] = fill_value
    return data

# Replace the 'None' garage ages in all four Garage_Age frames.
X_train_OH_NoNan = N2zero(X_train_OH_NoNan, 'Garage_Age')
X_test_OH_NoNan = N2zero(X_test_OH_NoNan, 'Garage_Age')
X_train_exp_NoNan = N2zero(X_train_exp_NoNan, 'Garage_Age')
X_test_exp_NoNan = N2zero(X_test_exp_NoNan, 'Garage_Age')

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# NOTE: the name `bin` shadows the Python builtin; it is kept so that existing
# references to it keep working.
bin = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')

# Bin edges are learned from the training split only and then applied to the
# matching test split. Calling fit_transform on the test split would learn a
# second, different set of edges, so the same bin number would not mean the
# same age range in train and test.
X_train_OH_NoNan_GA = bin.fit_transform(X_train_OH_NoNan['Garage_Age'].to_numpy().reshape(-1, 1))
X_test_OH_NoNan_GA = bin.transform(X_test_OH_NoNan['Garage_Age'].to_numpy().reshape(-1, 1))
X_train_exp_NoNan_GA = bin.fit_transform(X_train_exp_NoNan['Garage_Age'].to_numpy().reshape(-1, 1))
X_test_exp_NoNan_GA = bin.transform(X_test_exp_NoNan['Garage_Age'].to_numpy().reshape(-1, 1))

In [None]:
# Inspection only: tolist() returns a new list and the results are not stored;
# the notebook displays just the value of the last line.
X_train_OH_NoNan_GA.tolist()
X_test_OH_NoNan_GA.tolist() 
X_train_exp_NoNan_GA.tolist()
X_test_exp_NoNan_GA.tolist() 

In [None]:
# Attach each binned Garage_Age array to its frame as column "GA_binned" (position 2)
binned_pairs = (
    (X_train_OH_NoNan, X_train_OH_NoNan_GA),
    (X_test_OH_NoNan, X_test_OH_NoNan_GA),
    (X_train_exp_NoNan, X_train_exp_NoNan_GA),
    (X_test_exp_NoNan, X_test_exp_NoNan_GA),
)
for nonan_frame, binned_values in binned_pairs:
    nonan_frame.insert(2, "GA_binned", binned_values)

In [None]:
# Rows per Garage_Age bin, for each of the four frames
for nonan_frame in (X_train_OH_NoNan, X_test_OH_NoNan, X_train_exp_NoNan, X_test_exp_NoNan):
    print(nonan_frame["GA_binned"].value_counts())

In [None]:
# Stand-alone copies of the binned column, one per working frame
X_train_OH_Enc, X_test_OH_Enc = (
    X_train_OH_NoNan["GA_binned"].copy(),
    X_test_OH_NoNan["GA_binned"].copy(),
)
X_train_exp_Enc, X_test_exp_Enc = (
    X_train_exp_NoNan["GA_binned"].copy(),
    X_test_exp_NoNan["GA_binned"].copy(),
)

In [None]:
def OH_simple(data, labels, output, toDrop):
    """One-hot encode ``data`` and return ``output`` with ``toDrop`` replaced by the indicators.

    Parameters
    ----------
    data : pandas.Series or array-like
        One value per row of ``output`` (e.g. bin numbers).
    labels : list of str
        Column names for the indicator columns, one per distinct value of
        ``data``, in ascending order of those values.
    output : pandas.DataFrame
        Frame the indicator columns are appended to. It is not modified.
    toDrop : str or list of str
        Column(s) of ``output`` to leave out of the result.

    Returns
    -------
    pandas.DataFrame
        New frame: ``output`` without ``toDrop`` plus one 0.0/1.0 column per label.

    Raises
    ------
    ValueError
        If the number of distinct values in ``data`` differs from ``len(labels)``.

    Notes
    -----
    Previously the function discarded the result of ``drop``, bound the
    concatenated frame to a local name and returned nothing, so it had no
    visible effect. It also re-fitted the shared ``ohe`` encoder, overwriting
    the categories learned from the training features. The encoding is now
    computed locally; callers must use the return value.
    """
    values = np.asarray(data).ravel()
    categories = np.unique(values)  # sorted distinct values
    if len(categories) != len(labels):
        raise ValueError(
            f"OH_simple: {len(categories)} distinct values but {len(labels)} labels were given"
        )

    indicators = (values[:, None] == categories[None, :]).astype(float)
    # Re-use output's index so the column-wise concat pairs rows correctly
    # instead of aligning against a fresh 0..n-1 index.
    encoded = pd.DataFrame(indicators, columns=labels, index=output.index)
    return pd.concat([output.drop(toDrop, axis=1), encoded], axis=1)

In [None]:
# Names for the five Garage_Age bin indicator columns.
GA_bin = ['Garage_Age_bin1', 'Garage_Age_bin2', 'Garage_Age_bin3', 'Garage_Age_bin4', 'Garage_Age_bin5']
# NOTE(review): nothing is assigned from these calls, so X_train_OH / X_test_OH
# are the same after them as before.
# NOTE(review): the last two calls pair the *_exp_Enc series with the *_OH
# frames -- presumably X_train_exp / X_test_exp were intended; confirm.
OH_simple(X_train_OH_Enc, GA_bin, X_train_OH, 'Garage_Age')
OH_simple(X_test_OH_Enc, GA_bin, X_test_OH, 'Garage_Age')
OH_simple(X_train_exp_Enc, GA_bin, X_train_OH, 'Garage_Age')
OH_simple(X_test_exp_Enc, GA_bin, X_test_OH, 'Garage_Age')

def dropFeatures(data, *argv):
    """Return a copy of ``data`` without the columns named in ``argv``.

    ``data`` itself is left unchanged; with no names given the result has the
    same columns as ``data``.
    """
    return data.drop(list(argv), axis = 1)

# In every working frame, remove the raw Garage_Age column and append the
# binned column (GA_binned) in its place.
X_train_exp = pd.concat([dropFeatures(X_train_exp, 'Garage_Age'), X_train_exp_Enc], axis=1)
X_test_exp = pd.concat([dropFeatures(X_test_exp, 'Garage_Age'), X_test_exp_Enc], axis=1)
X_train_OH = pd.concat([dropFeatures(X_train_OH, 'Garage_Age'), X_train_OH_Enc], axis=1)
X_test_OH = pd.concat([dropFeatures(X_test_OH, 'Garage_Age'), X_test_OH_Enc], axis=1)

In [None]:
# to check if there are non numerics left

# One block per column: name, value counts, separator line
for col in X_train_exp.columns:
    print(col, X_train_exp[col].value_counts(), '---------------', sep='\n')

# X_train_exp: All numeric
# X_train_OH: All numeric

In [None]:
# Convert the None of MasVnrArea to 0
# MasVnrArea is the only variable where I replaced the null values with 'None' but didn't do any encoding on it

def NoneExpeller(data):
    """Replace every ``'None'`` string left in ``data`` with 0, in place.

    Only non-numeric columns are inspected (a numeric column cannot hold the
    marker). After the replacement a column is converted to a numeric dtype
    when all of its values allow it; otherwise it is left as it is.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for col in list(data.columns):
        series = data[col]
        if pd.api.types.is_numeric_dtype(series):
            continue

        is_none = (series == 'None').to_numpy(dtype=bool)
        if is_none.any():
            # Frame-level .loc write; data[col].iloc[n] = 0 is chained
            # assignment and may not reach `data`.
            data.loc[is_none, col] = 0
            data[col] = data[col].infer_objects()
    return data
                
# Clear any remaining 'None' markers from all four working frames.
X_train_exp = NoneExpeller(X_train_exp)
X_test_exp = NoneExpeller(X_test_exp)
X_train_OH = NoneExpeller(X_train_OH)
X_test_OH = NoneExpeller(X_test_OH)
# Earlier per-column variant, kept for reference (disabled):
#X_train_exp = N2zero(X_train_exp, 'MasVnrArea')
#X_test_exp = N2zero(X_test_exp, 'MasVnrArea')
#X_train_OH = N2zero(X_train_OH, 'MasVnrArea')
#X_test_OH = N2zero(X_test_OH, 'MasVnrArea')

In [None]:
# Spot check: distribution of the encoded Foundation values.
X_train_exp['Foundation'].value_counts()

In [None]:
# Write the four prepared frames to CSV files in the working directory
export_targets = (
    (X_train_exp, 'X_train_exp.csv'),
    (X_test_exp, 'X_test_exp.csv'),
    (X_train_OH, 'X_train_OH.csv'),
    (X_test_OH, 'X_test_OH.csv'),
)
for prepared_frame, csv_name in export_targets:
    prepared_frame.to_csv(csv_name)