## Importing Packages

In [286]:
import numpy as np
import pandas as pd
import random
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.model_selection import KFold, StratifiedKFold, GroupShuffleSplit, GroupKFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer


# Notebook-wide matplotlib defaults: figure size (in inches) and base font size.
plt.rcParams.update({'figure.figsize': (12, 9), 'font.size': 16})

## Loading the DataFrame

In [253]:
# Load the housing CSV; its 'Unnamed: 0' column is used as the row index.
df = pd.read_csv("../data/cook_county_housing.csv", index_col='Unnamed: 0')

#### Removing spaces from column names

In [254]:
# Remove every space character from the column labels, then show the result.
df.columns = [label.replace(" ", "") for label in df.columns]
df.columns

Index(['PIN', 'PropertyClass', 'NeighborhoodCode', 'LandSquareFeet',
       'TownCode', 'Apartments', 'WallMaterial', 'RoofMaterial', 'Basement',
       'BasementFinish', 'CentralHeating', 'OtherHeating', 'CentralAir',
       'Fireplaces', 'AtticType', 'AtticFinish', 'DesignPlan',
       'CathedralCeiling', 'ConstructionQuality', 'SiteDesirability',
       'Garage1Size', 'Garage1Material', 'Garage1Attachment', 'Garage1Area',
       'Garage2Size', 'Garage2Material', 'Garage2Attachment', 'Garage2Area',
       'Porch', 'OtherImprovements', 'BuildingSquareFeet', 'RepairCondition',
       'MultiCode', 'NumberofCommercialUnits', 'Estimate(Land)',
       'Estimate(Building)', 'DeedNo.', 'SalePrice', 'Longitude', 'Latitude',
       'CensusTract', 'MultiPropertyIndicator', 'ModelingGroup', 'Age', 'Use',
       'O'HareNoise', 'Floodplain', 'RoadProximity', 'SaleYear', 'SaleQuarter',
       'SaleHalf-Year', 'SaleQuarterofYear', 'SaleMonthofYear',
       'SaleHalfofYear', 'MostRecentSale', 'AgeDec

In [255]:
# Log-transformed sale price and building area.
df["LogSalePrice"] = np.log(df["SalePrice"])
df["LogBuildingSquareFeet"] = np.log(df["BuildingSquareFeet"])

# Bucket the log sale price into three labelled groups.
# The labels are written into a standalone object-dtype Series and attached
# to the frame in one assignment. The previous chained form
# `df[col][mask] = value` writes through an intermediate object, which pandas
# does not guarantee to propagate back to `df` (it is a silent no-op under
# Copy-on-Write), and it also forced strings into a float column.
log_price = df["LogSalePrice"]
# Rows matching none of the conditions (e.g. NaN log price) keep the 0.0
# placeholder, as before.
price_group = pd.Series(0.0, index=df.index, dtype=object)
price_group.loc[log_price < 2.5] = "lessthan2.5"
price_group.loc[(log_price >= 2.5) & (log_price < 7.5)] = "between2.5and7.5"
price_group.loc[log_price >= 7.5] = "over7.5"
df["Grp_LogSalePrice"] = price_group

In [256]:
# First and third quartiles of the log sale price, and their difference (IQR).
Q1, Q3 = (df["LogSalePrice"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
IQR

1.9319061009712968

In [257]:
# Fences at 1.5 * IQR below Q1 and above Q3.
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
log_price = df["LogSalePrice"]
# NOTE: despite its name, `outliers` is True for the rows that are KEPT,
# i.e. rows that are neither below the lower fence nor above the upper fence.
outliers = ~((log_price < lower_fence) | (log_price > upper_fence))
# df1 holds a copy of the frame as it was before filtering.
df1 = df.copy()
df = df1.loc[outliers,]
df.shape

(168757, 65)

In [258]:
# Display the current column labels of df.
df.columns

Index(['PIN', 'PropertyClass', 'NeighborhoodCode', 'LandSquareFeet',
       'TownCode', 'Apartments', 'WallMaterial', 'RoofMaterial', 'Basement',
       'BasementFinish', 'CentralHeating', 'OtherHeating', 'CentralAir',
       'Fireplaces', 'AtticType', 'AtticFinish', 'DesignPlan',
       'CathedralCeiling', 'ConstructionQuality', 'SiteDesirability',
       'Garage1Size', 'Garage1Material', 'Garage1Attachment', 'Garage1Area',
       'Garage2Size', 'Garage2Material', 'Garage2Attachment', 'Garage2Area',
       'Porch', 'OtherImprovements', 'BuildingSquareFeet', 'RepairCondition',
       'MultiCode', 'NumberofCommercialUnits', 'Estimate(Land)',
       'Estimate(Building)', 'DeedNo.', 'SalePrice', 'Longitude', 'Latitude',
       'CensusTract', 'MultiPropertyIndicator', 'ModelingGroup', 'Age', 'Use',
       'O'HareNoise', 'Floodplain', 'RoadProximity', 'SaleYear', 'SaleQuarter',
       'SaleHalf-Year', 'SaleQuarterofYear', 'SaleMonthofYear',
       'SaleHalfofYear', 'MostRecentSale', 'AgeDec

In [259]:
# Column groups consumed by the ColumnTransformer cells further down.

# Columns scaled with MinMaxScaler.
maxmin_ftr = ['Longitude','Latitude',]
# Columns scaled with StandardScaler.
std_ftr = ['Fireplaces','LandSquareFeet', 'Garage1Size', 'Garage2Size',
           "NumberofCommercialUnits", 
           'Estimate(Land)','Estimate(Building)', 'Age', 'SaleYear', 
           'SaleQuarter', 'SaleHalf-Year', 'AgeDecade', 'LotSize','LogBuildingSquareFeet']
# Columns dropped from the feature matrix X (this list includes the target
# columns 'SalePrice' and 'LogSalePrice').
no_need = ['PIN', 'DeedNo.', 'Description', 'SalePrice', 'LogSalePrice','CensusTract', 
           'SiteDesirability', 'OtherImprovements', 'ModelingGroup', "BuildingSquareFeet"]
# Columns encoded with OrdinalEncoder, using the category lists in ord_cat.
ord_ftr = ["BasementFinish", "AtticType", "ConstructionQuality", "RepairCondition", "Floodplain"]
# Columns encoded with OneHotEncoder.
onehot_ftr = ["PropertyClass","NeighborhoodCode", "TownCode", "Apartments" ,"WallMaterial", "RoofMaterial","Basement", 
              "CentralHeating", "OtherHeating", "CentralAir", "AtticFinish", "DesignPlan", "CathedralCeiling",
              "Garage1Material", "Garage1Attachment", "Garage1Area", 
              "Garage2Material", "Garage2Attachment", "Garage2Area", 
              "Porch", "MultiCode", "MultiPropertyIndicator", "Use", "O'HareNoise", "RoadProximity", 
              "SaleQuarterofYear", 'SaleMonthofYear', 'SaleHalfofYear', "MostRecentSale", "PureMarketFilter", 
              "GarageIndicator", "NeigborhoodCode(mapping)", "TownandNeighborhood"]

# One category list per entry of ord_ftr, in the same order (five lists for
# five columns); passed as OrdinalEncoder(categories=ord_cat).
ord_cat = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0], [1.0, 2.0, 3.0], [1.0, 2.0, 3.0], [0.0, 1.0]]

In [284]:
# Use stratified splitting
from collections import Counter
#y_forsplitting = df[["Grp_LogSalePrice"]]
#X_forsplitting = df.loc[:, ((df.columns != "SalePrice") & (df.columns != "LogSalePrice"))]

def stratified_fcn(X, y, train_size, val_size, test_size, strati_tar, random_seed, num_folds):
    """Hold out a stratified test set, then cut the rest into stratified K folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series or pandas.DataFrame
        Targets, row-aligned with ``X``.
    train_size, val_size :
        Unused. Kept so existing calls keep working; the train/validation
        proportions are determined by ``num_folds``.
    test_size :
        Forwarded to ``train_test_split`` as the held-out test share.
    strati_tar : pandas.Series
        Class labels used for stratification. Must carry the same index
        labels as ``X`` because it is re-aligned with ``.loc``.
    random_seed : int
        Seed for both the test split and the fold shuffling.
    num_folds : int
        Number of train/validation folds.

    Returns
    -------
    dict
        Keys ``"X_train"``, ``"y_train"``, ``"X_val"``, ``"y_val"`` each map to
        a list with one entry per fold; ``"X_test"`` and ``"y_test"`` each map
        to a one-element list holding the held-out test set.
    """
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_seed, stratify=strati_tar)

    # Stratification labels of the non-test rows, looked up by index label.
    strat_other = strati_tar.loc[X_other.index]

    second_split = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_seed)
    total = {"X_train": [], "y_train": [], "X_val": [], "y_val": [], "X_test": [], "y_test": []}

    # StratifiedKFold stratifies on the second argument of split() and ignores
    # `groups`, so the labels have to be passed as that argument. Passing a
    # continuous y there (as before) either fails or stratifies on the wrong
    # variable.
    for train_ind, val_ind in second_split.split(X_other, strat_other):
        total["X_train"].append(X_other.iloc[train_ind])
        total["y_train"].append(y_other.iloc[train_ind])
        total["X_val"].append(X_other.iloc[val_ind])
        total["y_val"].append(y_other.iloc[val_ind])

    total["X_test"].append(X_test)
    total["y_test"].append(y_test)
    return total


#splitting = stratified_fcn(X_forsplitting, y_forsplitting, 0.8, 0.1, 0.1, df["Grp_LogSalePrice"], 44, 5) 


# Feature matrix: all columns except the stratification label, minus the
# columns listed in no_need.
is_feature_col = df.columns != "Grp_LogSalePrice"
X = df.loc[:, is_feature_col].drop(columns=no_need)
# Regression target: log of the sale price.
y = df["LogSalePrice"]

def getting_split(df_set):
    """Re-slice the notebook-level ``X`` and ``y`` according to a split dict.

    Parameters
    ----------
    df_set : dict
        Must provide ``"X_train"`` and ``"X_val"`` (lists with one DataFrame
        per fold) and ``"X_test"`` (a list whose first element is the test
        DataFrame). Only the index labels of these frames are used.

    Returns
    -------
    dict
        ``{"fold_1": {...}, "fold_2": {...}, ...}`` with one entry per fold.
        Each entry has the keys ``"X_training"``, ``"y_training"``,
        ``"X_validation"``, ``"y_validation"``, ``"X_testing"`` and
        ``"y_testing"``; the testing pair is the same in every fold.

    Notes
    -----
    Rows are selected with ``.loc`` because the split frames carry index
    *labels*. The earlier ``.iloc`` lookup treated those labels as positions,
    which picks the wrong rows (or raises) as soon as the index is not
    ``0..n-1``, e.g. after rows were filtered out.
    """
    test_labels = df_set["X_test"][0].index
    folds = {}
    for fold_no, (train_part, val_part) in enumerate(zip(df_set["X_train"], df_set["X_val"])):
        train_labels = train_part.index
        val_labels = val_part.index
        fold = {"X_training": X.loc[train_labels], "y_training": y.loc[train_labels],
                "X_validation": X.loc[val_labels], "y_validation": y.loc[val_labels],
                "X_testing": X.loc[test_labels], "y_testing": y.loc[test_labels]}
        # Still published as module-level fold_0, fold_1, ... so that any cell
        # reading those names keeps working.
        globals()["fold_" + str(fold_no)] = fold
        # Returned keys are 1-based: fold_1 corresponds to the first fold.
        folds["fold_" + str(fold_no + 1)] = fold
    return folds


#folds = getting_split(splitting)

In [321]:
# Show the TownCode and Apartments columns of X (both are listed in onehot_ftr).
X[["TownCode", "Apartments"]]

Unnamed: 0,TownCode,Apartments
1,71,0.0
2,70,0.0
3,17,0.0
4,32,0.0
6,37,0.0
...,...,...
204787,72,0.0
204788,23,0.0
204789,15,0.0
204790,22,0.0


In [351]:
# Show the columns at positions 1, 2 and 3 of X.
X.iloc[:, [1, 2, 3]]

Unnamed: 0,NeighborhoodCode,LandSquareFeet,TownCode
1,120,3780.0,71
2,210,4375.0,70
3,220,4375.0,17
4,120,8400.0,32
6,181,10890.0,37
...,...,...,...
204787,321,4375.0,72
204788,21,16509.0,23
204789,90,3810.0,15
204790,80,6650.0,22


In [372]:
# One (name, transformer, columns) entry per column group:
#   onehot -> one-hot encoding (categories unseen during fit are ignored),
#   minmax -> min-max scaling,
#   std    -> standardisation,
#   ord    -> ordinal encoding with the explicit category lists in ord_cat.
column_steps = [
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), onehot_ftr),
    ('minmax', MinMaxScaler(), maxmin_ftr),
    ('std', StandardScaler(), std_ftr),
    ('ord', OrdinalEncoder(categories=ord_cat), ord_ftr),
]
preprocessor = ColumnTransformer(transformers=column_steps)

def MLpipe_strKFold_RMSE(X, y, preprocessor, ML_algo, parameter_grid, strati_tar, num_folds,
                         n_states=5, test_size=0.20, random_seed=42):
    """Repeat a stratified hold-out plus K-fold grid search and collect test RMSEs.

    For each of ``n_states`` random states the data is split into a test set
    (stratified on ``strati_tar``) and a remainder. The remainder is
    grid-searched with shuffled ``KFold`` cross-validation, and the refitted
    best pipeline is scored on the test set.

    Note that only the test split is stratified; the inner cross-validation
    uses plain ``KFold``.

    Parameters
    ----------
    X, y :
        Features and target, row-aligned.
    preprocessor :
        Transformer used as the first pipeline step.
    ML_algo :
        Estimator used as the final ``'regressor'`` step.
    parameter_grid : dict
        Grid for ``GridSearchCV``; keys address pipeline steps, e.g.
        ``"regressor__alpha"``.
    strati_tar :
        Labels passed to ``train_test_split(stratify=...)``; row-aligned with ``X``.
    num_folds : int
        Number of ``KFold`` splits.
    n_states : int, default 5
        How many random states (repetitions) to run.
    test_size : float, default 0.20
        Share of rows held out for testing in each repetition.
    random_seed : int or None, default 42
        Seed for drawing the per-repetition random states, so that repeated
        calls (and different models) are evaluated on the same splits. Pass
        ``None`` to draw fresh, non-reproducible states on every call.

    Returns
    -------
    (list of dict, list of float)
        Best parameter set and test-set RMSE for each repetition.
    """
    # A private generator keeps the draw reproducible without touching the
    # global `random` state.
    seed_source = random.Random(random_seed)
    state_lst = [seed_source.randint(0, 1000) for _ in range(n_states)]

    # Negated RMSE (GridSearchCV maximises the score); built once, reused in every repetition.
    rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

    best_models = []
    test_scores = []

    for state in state_lst:
        # NOTE(review): `keep_columns` is not defined in the visible part of
        # this notebook; it must already exist in the kernel when this runs.
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               ('colselect', FunctionTransformer(keep_columns)),
                               ('regressor', ML_algo)])
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=state, stratify=strati_tar)
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=state)
        grid = GridSearchCV(pipe, param_grid=parameter_grid, scoring=rmse_scorer,
                            cv=kf, return_train_score=True, n_jobs=-1, verbose=False)
        grid.fit(X_other, y_other)

        # predict() uses the best pipeline refitted on all of X_other.
        predictions = grid.predict(X_test)
        best_models.append(grid.best_params_)
        test_scores.append(mean_squared_error(y_test, predictions, squared=False))

    return best_models, test_scores

#### Lasso

In [373]:
from sklearn.linear_model import Lasso

# Lasso grid search over alpha on the rows at positions 1..499, with 4 CV folds.
lasso_rows = slice(1, 500)
regressor = Lasso(max_iter=100000)
alpha = [1e-2, 1e-1, 1e0, 1e1, 1e2]
param_grid = {"regressor__alpha": alpha}
models, scores = MLpipe_strKFold_RMSE(
    X.iloc[lasso_rows], y.iloc[lasso_rows], preprocessor, regressor, param_grid,
    df["Grp_LogSalePrice"].iloc[lasso_rows], 4)


Traceback (most recent call last):
  File "/opt/anaconda3/envs/data1030/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/data1030/lib/python3.9/site-packages/sklearn/pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/opt/anaconda3/envs/data1030/lib/python3.9/site-packages/sklearn/pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/anaconda3/envs/data1030/lib/python3.9/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/opt/anaconda3/envs/data1030/lib/python3.9/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/opt/anaconda3/envs/data1030/lib/python3.9/site-packages/sklearn/base.py", line 702, in fit_transform
    return self.fit(X, y, **fit_params).tran

NotFittedError: All estimators failed to fit

In [312]:
# Only the last expression of a cell is rendered, so this cell shows `scores`;
# the bare `models` line produces no output.
models
scores

[0.6422632233660511,
 0.6380627618652492,
 0.439123485265782,
 0.6595142893198469,
 0.795042589395531]

#### RandomForestRegressor

In [237]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest grid search on the rows at positions 1..99, with 4 CV folds.
rf_rows = slice(1, 100)
regressor = RandomForestRegressor()
n_tree_options = list(np.arange(100, 1200, 100))   # 100, 200, ..., 1100
depth_options = list(np.arange(1, 10, 2))          # 1, 3, 5, 7, 9
grid = {'regressor__n_estimators': n_tree_options,
        'regressor__max_depth': depth_options}
models, scores = MLpipe_strKFold_RMSE(
    X.iloc[rf_rows], y.iloc[rf_rows], preprocessor, regressor, grid,
    df["Grp_LogSalePrice"].iloc[rf_rows], 4)

In [238]:
# Best parameter set found in each repetition of the run above.
models

[{'regressor__max_depth': 9, 'regressor__n_estimators': 300},
 {'regressor__max_depth': 5, 'regressor__n_estimators': 300},
 {'regressor__max_depth': 7, 'regressor__n_estimators': 500},
 {'regressor__max_depth': 9, 'regressor__n_estimators': 300},
 {'regressor__max_depth': 5, 'regressor__n_estimators': 100}]

In [239]:
# Test-set RMSE of each repetition (on the log sale price).
scores

[0.5056951471395876,
 0.5783670407844198,
 0.7342831437925698,
 1.874993593173462,
 0.6262940782724713]

#### Ridge Regressor

In [76]:
from sklearn.linear_model import Ridge
# Ridge grid search on the rows at positions 1..99, with 4 CV folds.
# `alpha` is the list defined in the Lasso cell.
ridge_rows = slice(1, 100)
regressor = Ridge(max_iter=1000000)
grid = {"regressor__alpha": alpha}
models, scores = MLpipe_strKFold_RMSE(
    X.iloc[ridge_rows], y.iloc[ridge_rows], preprocessor, regressor, grid,
    df["Grp_LogSalePrice"].iloc[ridge_rows], 4)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
Fitting 4 folds for each of 5 candidates, totalling 20 fits
Fitting 4 folds for each of 5 candidates, totalling 20 fits
Fitting 4 folds for each of 5 candidates, totalling 20 fits
Fitting 4 folds for each of 5 candidates, totalling 20 fits


In [78]:
# Test-set RMSE of each repetition (on the log sale price).
scores

[1.7134005181776035,
 1.6560582432479631,
 1.8895417090789517,
 1.378560233630301,
 1.8189448491330238]

#### SVR

In [81]:
from sklearn.svm import SVR
# SVR grid search over C and gamma on the rows at positions 1..99, with 4 CV folds.
svr_rows = slice(1, 100)
regressor = SVR()
c_options = [1e-2, 1e-1, 1e1, 1e2]
gamma_options = [1e-2, 1e-1, 1e1, 1e2]
grid = {'regressor__C': c_options,
        'regressor__gamma': gamma_options}
models, scores = MLpipe_strKFold_RMSE(
    X.iloc[svr_rows], y.iloc[svr_rows], preprocessor, regressor, grid,
    df["Grp_LogSalePrice"].iloc[svr_rows], 4)

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Fitting 4 folds for each of 16 candidates, totalling 64 fits


In [83]:
# Test-set RMSE of each repetition (on the log sale price).
scores

[2.53174590826996,
 2.3714800625844306,
 2.685314514815253,
 1.8460288706525552,
 2.7475722231750175]

In [84]:
# NOTE(review): `folds` is read below, but the only assignment visible in this
# notebook (`folds = getting_split(splitting)`) is commented out. Confirm it
# is defined in the kernel before running this cell.
fold_column_steps = [
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), onehot_ftr),
    ('minmax', MinMaxScaler(), maxmin_ftr),
    ('std', StandardScaler(), std_ftr),
    ('ord', OrdinalEncoder(categories=ord_cat), ord_ftr),
]
preprocessor = ColumnTransformer(transformers=fold_column_steps)

clf = Pipeline(steps=[('preprocessor', preprocessor)])

first_fold = folds["fold_1"]
# Fit the preprocessing on the training rows only, then apply the fitted
# transformers unchanged to the validation and test rows.
X_train_prep = clf.fit_transform(first_fold["X_training"])
X_val_prep = clf.transform(first_fold["X_validation"])
X_test_prep = clf.transform(first_fold["X_testing"])

In [85]:
# (rows, columns) of the preprocessed training matrix.
X_train_prep.shape

(147449, 1375)

In [106]:
# Column labels of the preprocessed matrix, in ColumnTransformer output order:
# the expanded one-hot names first, then the min-max, standard-scaled and
# ordinal columns under their original names.
onehot_encoder = preprocessor.named_transformers_['onehot']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
if hasattr(onehot_encoder, "get_feature_names_out"):
    onehot_names = onehot_encoder.get_feature_names_out(onehot_ftr)
else:
    onehot_names = onehot_encoder.get_feature_names(onehot_ftr)
# transformers_[1..3] are the (name, fitted transformer, columns) entries for
# 'minmax', 'std' and 'ord'; their last element is the column list.
feature_names = list(onehot_names) + preprocessor.transformers_[1][-1] + \
    preprocessor.transformers_[2][-1] + preprocessor.transformers_[3][-1]


In [110]:
# Wrap the preprocessed training matrix in a DataFrame labelled with feature_names.
df_X_train_prep = pd.DataFrame(data = X_train_prep, columns = feature_names)

In [302]:
from sklearn.feature_selection import f_regression, SelectKBest, SelectPercentile, mutual_info_regression
# Rebuild the feature matrix and target from df (same construction as earlier
# in the notebook).
is_feature_col = df.columns != "Grp_LogSalePrice"
X = df.loc[:, is_feature_col].drop(columns=no_need)
y = df["LogSalePrice"]

# Fresh, unfitted preprocessing: one-hot, min-max, standard and ordinal
# encoders, each applied to its own column group.
full_column_steps = [
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), onehot_ftr),
    ('minmax', MinMaxScaler(), maxmin_ftr),
    ('std', StandardScaler(), std_ftr),
    ('ord', OrdinalEncoder(categories=ord_cat), ord_ftr),
]
preprocessor = ColumnTransformer(transformers=full_column_steps)

# Single-step pipeline wrapping the preprocessor.
clf = Pipeline(steps=[('preprocessor', preprocessor)])


# Fit the preprocessing on the full feature matrix and transform it.
original_prep = clf.fit_transform(X)

# Column labels in ColumnTransformer output order: expanded one-hot names,
# then the min-max, standard-scaled and ordinal columns.
onehot_encoder = preprocessor.named_transformers_['onehot']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
if hasattr(onehot_encoder, "get_feature_names_out"):
    onehot_names = onehot_encoder.get_feature_names_out(onehot_ftr)
else:
    onehot_names = onehot_encoder.get_feature_names(onehot_ftr)
feature_names = list(onehot_names) + preprocessor.transformers_[1][-1] + \
    preprocessor.transformers_[2][-1] + preprocessor.transformers_[3][-1]

# NOTE: this frame gets a fresh 0..n-1 index, not the index of X / y.
df_original_prep = pd.DataFrame(data = original_prep, columns = feature_names)

# Keep the top 66% of preprocessed features ranked by the univariate F-test.
f_selector = SelectPercentile(f_regression, percentile=66)
f_selector.fit(df_original_prep, y)   # only the fitted selector is needed here
kept_mask = f_selector.get_support()
f_sel_66per = list(np.array(feature_names)[kept_mask])

In [345]:
# Number of features kept by the 66th-percentile F-test selection.
len(f_sel_66per)

911

In [346]:
# Positions (within feature_names) of the features kept by f_selector.
# get_support(indices=True) returns them directly, instead of recomputing the
# boolean mask once per feature inside a comprehension.
indices = f_selector.get_support(indices=True).tolist()

In [164]:
# Mutual-information selection on the raw (un-preprocessed) columns of X.
# NOTE: the result name says "50per" but the percentile used here is 80.
mul_selector = SelectPercentile(mutual_info_regression, percentile=80)
mul_selector.fit(X, y)
raw_kept_mask = mul_selector.get_support()
mul_sel_50per = list(np.array(X.columns)[raw_kept_mask])

In [166]:
# Names selected by mutual information but not by the F-test.
# NOTE: f_sel_50per is assigned in a cell that appears further down in this
# notebook, so this cell relies on that cell having been run first.
list(set(mul_sel_50per) - set(f_sel_50per))

["O'HareNoise", 'Garage2Size', 'MultiPropertyIndicator']

In [122]:
from sklearn.feature_selection import f_regression, SelectKBest, SelectPercentile, mutual_info_regression

In [153]:
from sklearn.feature_selection import f_regression, SelectKBest, SelectPercentile, mutual_info_regression
# Keep the top 50% of preprocessed features ranked by the univariate F-test.
f_selector = SelectPercentile(f_regression, percentile=50)
f_selector.fit(df_original_prep, y)   # only the fitted selector is needed here
half_mask = f_selector.get_support()
f_sel_50per = list(np.array(feature_names)[half_mask])

In [140]:
# Mutual-information selection on the preprocessed features.
# NOTE: the result name says "50per" but the percentile used here is 33.
mul_selector = SelectPercentile(mutual_info_regression, percentile=33)
mul_selector.fit(df_original_prep, y)
mi_mask = mul_selector.get_support()
mul_sel_50per = list(np.array(feature_names)[mi_mask])

In [154]:
# Per-feature scores computed by the fitted F-test selector.
f_selector.scores_

array([1101.39698827,  448.7293027 ,  106.39327302, ...,  196.98830593,
         86.9406719 ,   73.99283728])

In [147]:
# IPython help lookup: shows the documentation for SelectPercentile.
?SelectPercentile