In [3]:
%run func.ipynb
import mahotas
from pyfeats import *
import pandas as pd
import numpy as np
import time

from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv('train.csv')
df.fillna(df.mode().iloc[0], inplace=True) # Replace NaN values with mode of corresponding column
# use the factorize() function to convert the column of strings to integers
non_int_columns = df.select_dtypes(exclude=['int']).columns.tolist()
for i in non_int_columns:
    df[i] = pd.factorize(df[i])[0] + 1

In [5]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,1,1,8450,1,1,1,1,1,...,0,1,1,1,0,2,2008,1,1,208500
1,2,20,1,2,9600,1,1,1,1,1,...,0,1,1,1,0,5,2007,1,1,181500
2,3,60,1,3,11250,1,1,2,1,1,...,0,1,1,1,0,9,2008,1,1,223500
3,4,70,1,4,9550,1,1,2,1,1,...,0,1,1,1,0,2,2006,1,2,140000
4,5,60,1,5,14260,1,1,2,1,1,...,0,1,1,1,0,12,2008,1,1,250000


In [6]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

len_row, len_col = X.shape
print('Row: ',len_row)

X_col = X.columns.tolist()

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the dataset
scaler.fit(X)

# Transform the dataset using the scaler
X_std = scaler.transform(X)

X_std = pd.DataFrame(data=X_std, columns=X_col)

Row:  1460


In [7]:
X_std.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,-1.730865,0.073375,-0.426561,-0.89654,-0.207142,-0.064238,-0.169981,-0.701291,-0.304158,-0.02618,...,-0.270208,-0.068692,-0.049718,-0.282681,-0.054116,-0.087688,-1.599111,0.138777,-0.292103,-0.394337
1,-1.728492,-0.872563,-0.426561,-0.859423,-0.091886,-0.064238,-0.169981,-0.701291,-0.304158,-0.02618,...,-0.270208,-0.068692,-0.049718,-0.282681,-0.054116,-0.087688,-0.48911,-0.614439,-0.292103,-0.394337
2,-1.72612,0.073375,-0.426561,-0.822305,0.07348,-0.064238,-0.169981,1.016637,-0.304158,-0.02618,...,-0.270208,-0.068692,-0.049718,-0.282681,-0.054116,-0.087688,0.990891,0.138777,-0.292103,-0.394337
3,-1.723747,0.309859,-0.426561,-0.785188,-0.096897,-0.064238,-0.169981,1.016637,-0.304158,-0.02618,...,-0.270208,-0.068692,-0.049718,-0.282681,-0.054116,-0.087688,-1.599111,-1.367655,-0.292103,0.73234
4,-1.721374,0.073375,-0.426561,-0.74807,0.375148,-0.064238,-0.169981,1.016637,-0.304158,-0.02618,...,-0.270208,-0.068692,-0.049718,-0.282681,-0.054116,-0.087688,2.100892,0.138777,-0.292103,-0.394337


In [8]:
# # without feature selection
# X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.2, random_state = 93)
# start_time = time.time()
# # Train a random forest model on the training data
# model = GradientBoostingRegressor(random_state = 10)
# model.fit(X_train, y_train)
# end_time = time.time()

# # Evaluate the model on the testing data
# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)

# print(f'Mean Squared Error = {mse}')
# print(f"Elapsed time: {end_time - start_time} seconds")
# print()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.2, random_state = 93)

# Define the range of values for k
k_values = range(1, len_col+1)
# Initialize dictionaries to store results
mses = {}
selected_each_k = {}
timer = {}

for k in k_values:
    scores, selected, scores_ith, score_df, relevancy, redundancy = min_redun_max_relev(X, y, k)
    selected_each_k[f'k = {k}'] = selected
    # Remove all columns from X that are not selected
    X_selected = X_train[selected]
    
    start_time = time.time()
    # Train a random forest model on the training data
    model = GradientBoostingRegressor(random_state = 10)
    model.fit(X_selected, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_selected_test = X_test[selected]
    y_pred = model.predict(X_selected_test)
    mse = mean_squared_error(y_test, y_pred)
    
    times = end_time - start_time
    
    timer[f'k = {k}'] = times
    mses[f'k = {k}'] = mse

In [10]:
X_selected.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,GarageArea,1stFlrSF,FullBath,YearRemodAdd,TotRmsAbvGrd,YearBuilt,...,MiscVal,Alley,LowQualFinSF,Utilities,BsmtHalfBath,MiscFeature,MasVnrType,BsmtFinSF2,Condition2,BsmtQual
1090,-2.241782,-0.905127,0.311725,-2.411167,-0.341457,-0.317311,0.789741,-1.689368,-0.318683,-0.704406,...,-0.087688,-0.169981,-0.120242,-0.02618,-0.241061,-0.054116,0.312806,-0.288653,-0.091553,0.406942
468,1.374795,0.225654,1.650307,1.296463,1.838848,1.219736,0.789741,1.024029,0.296763,1.150356,...,-0.087688,-0.169981,-0.120242,-0.02618,-0.241061,-0.054116,1.904082,-0.288653,-0.091553,-0.984475
1007,-1.518467,-0.806136,-1.026858,-1.166169,-0.874836,-1.595596,-1.026041,-0.720298,-0.318683,-0.041991,...,-0.087688,-0.169981,-0.120242,-0.02618,-0.241061,-0.054116,0.312806,1.404223,-0.091553,0.406942
1194,-0.071836,-0.311181,-1.026858,-0.899384,-0.814012,-1.292845,-1.026041,-0.768752,0.296763,-0.075111,...,-0.087688,-0.169981,-0.120242,-0.02618,-0.241061,-0.054116,1.904082,-0.288653,-0.091553,0.406942
859,-0.071836,2.1674,0.311725,-0.00782,0.683193,0.904046,0.789741,-0.041949,1.527656,-0.108232,...,-0.087688,-0.169981,-0.120242,-0.02618,-0.241061,-0.054116,-1.278471,-0.288653,-0.091553,0.406942


In [11]:
# Choose the best k based on the validation accuracies

print(f"MSE using all columns= {mses[f'k = {len_col}']}")
print(f"Elapsed Time = {timer[f'k = {len_col}']}")
print()
best_k = min(mses, key=mses.get)
print(f"Best {best_k} with MSE = {mses[best_k]}")
print(f"Elapsed Time = {timer[best_k]}")
print()
print(f"MSE using 3 best columns= {mses[f'k = 3']}")
print(f"Elapsed Time = {timer[f'k = 3']}")
print()
print(f"MSE using 10 best columns= {mses[f'k = 10']}")
print(f"Elapsed Time = {timer[f'k = 10']}")
print()

MSE using all columns= 412026380.4704741
Elapsed Time = 0.777214765548706

Best k = 67 with MSE = 372592607.604737
Elapsed Time = 0.6826181411743164

MSE using 3 best columns= 895863183.6754111
Elapsed Time = 0.12546730041503906

MSE using 10 best columns= 587507015.6618106
Elapsed Time = 0.2629692554473877



In [12]:
worsts = [['BsmtFinSF2','Condition2','BsmtQual'],['MiscVal','Alley','LowQualFinSF','Utilities','BsmtHalfBath','MiscFeature','MasVnrType','BsmtFinSF2','Condition2','BsmtQual']]

for worst in worsts:
    # USING 3 Worst columns & USING 10 Worst columns
    X_select = X_selected[worst]

    start_time = time.time()
    # Train a random forest model on the training data
    model = GradientBoostingRegressor()
    model.fit(X_select, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_selected_test = X_test[worst]
    y_pred = model.predict(X_selected_test)
    mse = mean_squared_error(y_test, y_pred)

    times = end_time - start_time

    print(f"MSE using {len(worst)} worst columns= {mse}")
    print(f"Elapsed Time = {times}")
    print()

MSE using 3 worst columns= 3236522438.9142814
Elapsed Time = 0.05188345909118652

MSE using 10 worst columns= 2742889438.53906
Elapsed Time = 0.06624865531921387



In [13]:
print(selected_each_k[best_k])

['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'GarageArea', '1stFlrSF', 'FullBath', 'YearRemodAdd', 'TotRmsAbvGrd', 'YearBuilt', 'Fireplaces', 'BsmtFinSF1', 'HeatingQC', 'Foundation', 'WoodDeckSF', 'BsmtFinType1', 'OpenPorchSF', 'LotArea', '2ndFlrSF', 'MasVnrArea', 'HalfBath', 'LotShape', 'BsmtExposure', 'CentralAir', 'LotFrontage', 'ExterQual', 'RoofStyle', 'BsmtFullBath', 'Electrical', 'GarageFinish', 'PavedDrive', 'HouseStyle', 'BsmtUnfSF', 'FireplaceQu', 'SaleCondition', 'Neighborhood', 'ScreenPorch', 'BedroomAbvGr', 'LotConfig', 'Exterior2nd', 'ExterCond', 'KitchenAbvGr', 'Functional', 'GarageCond', 'PoolQC', 'EnclosedPorch', 'LandContour', 'MSZoning', 'SaleType', 'GarageYrBlt', 'Heating', 'BldgType', 'Exterior1st', 'GarageType', 'PoolArea', 'KitchenQual', 'BsmtCond', 'Fence', '3SsnPorch', 'MoSold', 'MSSubClass', 'OverallCond', 'Condition1', 'Street', 'GarageQual', 'LandSlope', 'BsmtFinType2']


In [16]:
pd.DataFrame(scores, columns=['mRMR', 'Highest_score_each_iteration']).head()

Unnamed: 0,mRMR,Highest_score_each_iteration
0,OverallQual,243677100.0
1,GrLivArea,2479.876
2,GarageCars,1898.471
3,TotalBsmtSF,1850.413
4,GarageArea,1544.816


In [17]:
# Find the maximum number of values across all keys
max_values = max([len(val) if isinstance(val, list) else 1 for val in selected_each_k.values()])

# Iterate through the keys and add "n/a" values as necessary
for key, val in selected_each_k.items():
    # Check if the value is a string and split by comma if necessary
    if isinstance(val, str):
        values = val.split(',')
    else:
        values = val
        
    # If the number of values is less than the maximum, add "n/a" values
    num_values = len(values)
    if num_values < max_values:
        diff = max_values - num_values
        values += ['-'] * diff
    
    # Join the values with commas and update the dictionary
    selected_each_k[key] = values
    
selected_each_k_df = pd.DataFrame(selected_each_k)

In [18]:
selected_each_k_df.iloc[:, :100].head(100)

Unnamed: 0,k = 1,k = 2,k = 3,k = 4,k = 5,k = 6,k = 7,k = 8,k = 9,k = 10,...,k = 71,k = 72,k = 73,k = 74,k = 75,k = 76,k = 77,k = 78,k = 79,k = 80
0,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,...,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual,OverallQual
1,-,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,...,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea,GrLivArea
2,-,-,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,...,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars,GarageCars
3,-,-,-,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,...,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF,TotalBsmtSF
4,-,-,-,-,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea,...,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea,GarageArea
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,MiscFeature,MiscFeature,MiscFeature,MiscFeature,MiscFeature
76,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,MasVnrType,MasVnrType,MasVnrType,MasVnrType
77,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,BsmtFinSF2,BsmtFinSF2,BsmtFinSF2
78,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,Condition2,Condition2


In [19]:
for i in range(len(scores_ith)):
    combines = {'Relevancy':relevancy[i], 'Redundancy':redundancy[i],'MRMR':scores_ith[i]}
    scored_ith = pd.DataFrame(combines)
    scored_ith = scored_ith.sort_values('MRMR', ascending=False)
    print('ITERASI KE ', i+1)
    print(scored_ith)
    print()
    print()

ITERASI KE  1
               Relevancy  Redundancy          MRMR
OverallQual  2436.770591     0.00001  2.436771e+08
GrLivArea    1470.585010     0.00001  1.470585e+08
GarageCars   1013.705666     0.00001  1.013706e+08
GarageArea    926.951287     0.00001  9.269513e+07
TotalBsmtSF   880.341282     0.00001  8.803413e+07
...                  ...         ...           ...
MiscFeature     0.216122     0.00001  2.161222e+04
BsmtFinSF2      0.188780     0.00001  1.887795e+04
MasVnrType      0.166779     0.00001  1.667793e+04
Condition2      0.034062     0.00001  3.406202e+03
BsmtQual        0.023949     0.00001  2.394869e+03

[80 rows x 3 columns]


ITERASI KE  2
               Relevancy  Redundancy         MRMR
GrLivArea    1470.585010    0.593007  2479.876196
1stFlrSF      845.524488    0.476224  1775.477068
GarageCars   1013.705666    0.600671  1687.622916
GarageArea    926.951287    0.562022  1649.315665
TotalBsmtSF   880.341282    0.537808  1636.904743
...                  ...         ..

BsmtQual        0.023949    0.113438    0.211117


ITERASI KE  33
               Relevancy  Redundancy        MRMR
BsmtUnfSF      70.303948    0.147073  478.020501
FireplaceQu    40.844608    0.089935  454.156314
SaleCondition  30.221556    0.066634  453.545862
Neighborhood   30.707622    0.082415  372.599617
ScreenPorch    18.336600    0.051951  352.959764
BedroomAbvGr   42.456412    0.127946  331.829617
LotConfig      17.565263    0.058605  299.721952
Exterior2nd    40.660476    0.137526  295.656643
PoolQC          9.140489    0.032005  285.592212
KitchenAbvGr   27.437234    0.099095  276.877179
Functional     17.325200    0.063721  271.892139
ExterCond      21.921284    0.082741  264.939068
SaleType        7.788912    0.030403  256.190418
BldgType       18.726622    0.077568  241.420831
EnclosedPorch  24.509275    0.103831  236.048672
GarageCond     18.485024    0.078509  235.450242
MSZoning       19.902726    0.085485  232.821848
LandContour    12.448384    0.055337  224.957053
Poo

ITERASI KE  58
              Relevancy  Redundancy        MRMR
Fence          6.938205    0.044056  157.487696
3SsnPorch      2.903843    0.022442  129.395046
MoSold         3.150172    0.025400  124.022251
MSSubClass    10.431466    0.085445  122.083966
OverallCond    8.891623    0.083644  106.303592
Condition1     2.934821    0.033920   86.521451
Street         2.459290    0.033635   73.116416
GarageQual     4.466017    0.069809   63.975240
LandSlope      3.824942    0.064499   59.301935
BsmtFinType2   3.179701    0.058628   54.234729
YrSold         1.220661    0.029609   41.226199
Id             0.700676    0.022586   31.022167
RoofMatl       1.873169    0.063302   29.590915
MiscVal        0.654934    0.024913   26.288844
Alley          1.115961    0.049771   22.421808
LowQualFinSF   0.956600    0.051324   18.638382
Utilities      0.298804    0.021732   13.749432
BsmtHalfBath   0.413789    0.035544   11.641500
MiscFeature    0.216122    0.040495    5.336964
MasVnrType     0.166779  