In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from ipykernel import kernelapp as app
from scipy import stats
import json
import pickle

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, ShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.utils import resample

from tensorflow import keras
from keras import Sequential, optimizers
from keras.models import Model, model_from_json, Sequential
from keras.layers import Dense, Input, BatchNormalization, Dropout
from keras.callbacks import ReduceLROnPlateau

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

import BoostedIGA as BIGA

         x1        x2        x3        x4        x5        x6        x7  \
0  0.516399  0.570668  0.028474  0.171522  0.685277  0.833897  0.306966   
1  0.554228  0.352132  0.181892  0.785602  0.965483  0.232354  0.083561   
2  0.685306  0.517867  0.048485  0.137869  0.186967  0.994318  0.520665   
3  0.913154  0.807920  0.402998  0.357224  0.952877  0.343632  0.865100   
4  0.097146  0.102847  0.701507  0.890480  0.159560  0.275573  0.672492   

         x8        x9       x10  
0  0.893613  0.721544  0.189939  
1  0.603548  0.728993  0.276239  
2  0.578790  0.734819  0.541962  
3  0.830278  0.538161  0.922469  
4  0.164303  0.701371  0.487635  
           y
0  17.579365
1  20.461479
2  15.369717
3  15.862597
4  10.828538


In [None]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()
boston_df = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
boston_df['MEDV'] = boston_dataset.target

X = boston_df.drop(['MEDV'], axis = 1)
y = boston_df.drop(X.columns, axis = 1)


In [None]:
def drop_features(features):
    # id and label (not features)
    unused_feature_list = ['parcelid']

    # too many missing
    missing_list = ['framing_id', 'architecture_style_id', 'story_id', 'perimeter_area', 'basement_sqft', 'storage_sqft'
                   ]
    unused_feature_list += missing_list

    # not useful
    bad_feature_list = ['fireplace_flag', 'deck_id', 'pool_unk_1', 'construction_id', 'fips', 'county_id',
                        'Unnamed: 0', 'missing'
                       ]
    #['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode','fireplacecnt', 'fireplaceflag']
    unused_feature_list += bad_feature_list

    # hurts performance
    unused_feature_list += ['county_landuse_code_id', 'zoning_description_id']

    return features.drop(unused_feature_list, axis=1, errors='ignore')

def impute_data(df):
    values = {'quality_id':stats.mode(df.loc[(df.quality_id.isnull()==False),'quality_id'].values)[0][0],
              'finished_area_sqft_calc': df.loc[(df.finished_area_sqft_calc.isnull() == False),'finished_area_sqft_calc'].values.mean(),
              'lot_sqft': df.loc[(df.lot_sqft.isnull() == False),'lot_sqft'].values.mean(),
              'census_1': df.loc[(df.census_1.isnull() == False),'census_1'].values.mean(), 
              'bathroom_small_cnt':0,
              'unit_cnt': stats.mode(df.loc[(df.unit_cnt.isnull() == False),'unit_cnt'].values)[0][0],
              'patio_sqft':df.loc[(df.patio_sqft.isnull() == False),'patio_sqft'].values.mean(),
              'tax_property':df.loc[(df.tax_property.isnull() == False),'tax_property'].values.mean(),
              'census_2': df.loc[(df.census_2.isnull() == False),'census_2'].values.mean(),
             }
    df = df.fillna(value=values)
    
    mask = (df.pool_cnt >= 1)
    df.loc[mask,'pool_total_size'] = df.loc[(df.pool_total_size.isnull() == False),'pool_total_size'].values.mean()
    
    mask = (df.garage_cnt >= 1)
    df.loc[mask,'garage_cnt'] = df.loc[(df.garage_sqft.isnull()==False),'garage_sqft'].values.mean()
    
    mask_0 = (df.tax_property >0)
    mask_1 = (df.finished_area_sqft_calc > 0)
    mask_null = (df.property_tax_per_sqft.isnull() == True)
    df.loc[mask_null,'property_tax_per_sqft']= df.loc[mask_0,'tax_property'] / df.loc[mask_1,'finished_area_sqft_calc']

    mask_0 = (df.avg_area_per_room >0)
    mask_null = (df.avg_area_per_room.isnull() == True)
    df.loc[mask_null,'avg_area_per_room'] = df.loc[mask_1,'finished_area_sqft_calc'] / df.loc[df.room_cnt>0, 'room_cnt']
    
    df.loc[np.isfinite(df.avg_garage_size) == False] = 0
    
    return df

def transform_test_features(features_2016, features_2017):
    test_features_2016 = catboost_drop_features(features_2016)
    test_features_2017 = catboost_drop_features(features_2017)
    
    test_features_2016['year'] = 0
    test_features_2017['year'] = 1
    
    # 11 and 12 lead to bad results, probably due to the fact that there aren't many training examples for those two
    test_features_2016['month'] = 10
    test_features_2017['month'] = 10
    
    test_features_2016['quarter'] = 4
    test_features_2017['quarter'] = 4
    
    return test_features_2016, test_features_2017

In [None]:
zillow_df = pd.read_csv('zillow_2016.csv', index_col=0)

drop_list = ['bathroom_cnt','bedroom_cnt','latitude','longitude','room_cnt','tax_year',
            'year_built','location_1','location_2','location_3','location_4','derived_room_cnt',
            'avg_area_per_room']

#zillow_features = impute_data(zillow_df)

for c in zillow_df.columns:
    if zillow_df[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(zillow_df[c].fillna(-1).values))
        zillow_df[c] = lbl.transform(list(zillow_df[c].fillna(-1).values))

zillow_features = drop_features(zillow_df)

FS = ['bathroom_cnt', 'bedroom_cnt', 'quality_id', 'floor1_sqft',
       'finished_area_sqft_calc', 'floor1_sqft_unk', 'base_total_area',
       'fireplace_cnt', 'bathroom_full_cnt', 'garage_cnt', 'garage_sqft',
       'heating_id', 'latitude', 'longitude', 'pool_total_size',
       'landuse_type_id', 'census_1', 'city_id', 'neighborhood_id',
       'patio_sqft', 'year_built', 'story_cnt', 'tax_structure', 'tax_parcel',
       'tax_property', 'tax_overdue_year', 'census_2', 'location_1',
       'location_2', 'location_3', 'location_4', 'missing_total_area',
       'derived_room_cnt', 'avg_area_per_room', 'derived_avg_area_per_room',
       'month', 'quarter']

#zillow_features.dropna(axis=0, subset = drop_list, inplace=True)
#zillow_features.fillna(-1.0, inplace= True)

zillow_label = zillow_df.logerror.astype(np.float32)
#prepare

X = zillow_features.drop(['logerror'],axis=1)
#X = zillow_features[FS]
X.reset_index(drop=True, inplace=True)
y = zillow_label.values.reshape(-1,1)
#y = zillow_norm(y,zillow_label.min(),zillow_label.max(),0,1)
"""
from sklearn.preprocessing import Imputer
imputer= Imputer()
imputer.fit(X.iloc[:, :])
X_train = imputer.transform(X.iloc[:, :])

X = pd.DataFrame(X_train, columns = X.columns)
"""
#normData, (scaler_x, scaler_y) = normalize(X,y)


In [None]:
feature_names = [s for s in X.columns]
cat_features = ['cooling_id', 'heating_id', 'landuse_type_id', 'year', 'month', 'quarter']

categorical_indices = []
for i, n in enumerate(X.columns):
    if n in cat_features:
        categorical_indices.append(i)
        X[n] = X[n].astype(np.str)

In [10]:
#Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from keras.wrappers.scikit_learn import KerasRegressor

scoring = {
    'MAAPE':make_scorer(mean_arctan_absolute_percentage_error, greater_is_better=False),
    'MSE':'neg_mean_squared_error'}


model = keras.wrappers.scikit_learn.KerasRegressor(build_fn=create_model,
                        epochs=10, 
                        batch_size=5,
                        verbose=0)

param_grid = {'epochs':[50,100,150],
              'batch_size':[16,32,64]}

grid = GridSearchCV(model,
                    param_grid=param_grid,
                    return_train_score=True,
                    scoring=scoring,
                    refit = 'MSE')

grid_results = grid.fit(normData[0], y)

print('Parameters of the best model: ')
print(grid_results.best_params_)



Parameters of the best model: 
{'batch_size': 32, 'epochs': 150}


In [9]:
print(f'Best Accuracy for {grid_results.best_score_:.4} using {grid_results.best_params_}')
means = grid_results.cv_results_['mean_score_time']
stds = grid_results.cv_results_['std_score_time']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f'mean={mean:.4}, std={stdev:.4} using {param}')

Best Accuracy for -0.009246 using {'batch_size': 32, 'epochs': 50, 'learning_rate': 0.001}
mean=0.03319, std=0.005137 using {'batch_size': 16, 'epochs': 50, 'learning_rate': 0.01}
mean=0.05543, std=0.004372 using {'batch_size': 16, 'epochs': 50, 'learning_rate': 0.005}
mean=0.08032, std=0.005013 using {'batch_size': 16, 'epochs': 50, 'learning_rate': 0.001}
mean=0.1012, std=0.004867 using {'batch_size': 16, 'epochs': 100, 'learning_rate': 0.01}
mean=0.1269, std=0.003253 using {'batch_size': 16, 'epochs': 100, 'learning_rate': 0.005}
mean=0.1497, std=0.00592 using {'batch_size': 16, 'epochs': 100, 'learning_rate': 0.001}
mean=0.1737, std=0.002343 using {'batch_size': 16, 'epochs': 150, 'learning_rate': 0.01}
mean=0.2006, std=0.005892 using {'batch_size': 16, 'epochs': 150, 'learning_rate': 0.005}
mean=0.2205, std=0.004274 using {'batch_size': 16, 'epochs': 150, 'learning_rate': 0.001}
mean=0.237, std=0.005691 using {'batch_size': 32, 'epochs': 50, 'learning_rate': 0.01}
mean=0.2663, std

In [None]:
Best_score = {'RMSE':0, 
              'MAAPE':0, 
              'Iteration':0,
              'Features':[]
             }
Score_t = []
D_t = []
MAPE_score = []
MAAPE_score = []
MAE_score = []
RMSE_score = []

#init weight
D_t.append([1/X.shape[0]] * X.shape[0])

for t in range(0,X.shape[1]):
    print('Feature Selection x Boosting >>> Iteration: %d' % (t+1))
    if (t == 0):
        Xs = pd.DataFrame(data = normData[0], columns = X.columns)
        
    nw1 = create_model(len(Xs))
    #feature selection
    nw1.fit(
            Xs, y, 
            epochs = grid_results.best_params_['epochs'], 
            batch_size=grid_results.best_params_['batch_size'], 
            verbose = 0
           )
    W = nw1.layers[1].get_weights()[0]
    V = nw1.layers[2].get_weights()[0]
    print("Feature Selection Process: %d" % (t+1))
    ranked_features = IGA(W, V, Xs.values, D_t[t], Xs.columns) 
    ranked_features.reset_index(inplace = True)
    Xs.drop([ranked_features['Feature'][0]], axis = 1, inplace = True)
    X_sel = X.drop(Xs.columns, axis = 1)
    print("Selected Feature: ",(X_sel.columns))
    

    #Boosting
    nw2, ss_maape, ss_rmse  = train_Shuffle(X_sel,y, 
                                            batch_size=grid_results.best_params_['batch_size'], 
                                            epochs=grid_results.best_params_['epochs'],
                                            n_split = 5
                                           )
    
    print("Avg MAAPE: %.5f%% (+/- %.5f%%)" % (np.mean(ss_maape), np.std(ss_maape)))
    print('Avg RMSE: %.5f%% (+/- %.5f%%)' % (np.mean(ss_rmse), np.std(ss_rmse)))
    
    MAAPE_score.append(np.mean(ss_maape))
    RMSE_score.append(np.mean(ss_rmse))

    D_t.append(update_weight_R2(nw2.predict(X_sel), y, D_t[t]))
    
    if(np.mean(ss_maape) < Best_score[0][0]):
        print('New record!')

        Best_score['MAAPE'] = np.mean(ss_maape)
        Best_score['RMSE'] = np.mean(ss_rmse)
        Best_score['Iteration'] = t
        Best_score['Features'] = X_sel.columns

In [None]:
plt.scatter(range(1,X.shape[1]+1), MAE_score)
print(Best_score[3].values)
print(Best_score[2])
print(Best_score[0])
print(Best_score[1])

In [None]:
#Non-selection model baseline

model, ss_maape, ss_rmse = train_Shuffle(
                                        X, y, 
                                        batch_size=grid_results.best_params_['batch_size'], 
                                        epoch = grid_results.best_params_['epochs'],
                                        n_split = 5,
                                        norm = True
                                        )

print("Avg MAAPE: %.5f%% (+/- %.3f%%)" % (np.mean(ss_maape), np.std(ss_maape)))
print("Avg RMSE: %.5f%% (+/- %.3f%%)" % (np.mean(ss_rmse), np.std(ss_rmse)))


In [None]:
#Create test set
prop_2016 = pd.read_csv('prop_2016.csv')

sample = pd.read_csv('sample_sub.csv')
sample['parcelid'] = sample['ParcelId']
sample = sample.drop('Unnamed: 0', axis = 1)

df_test = sample.merge(prop_2016, on='parcelid', how='left')

#add datetime
df_test['year'] = zillow_df['year']
df_test['month'] = zillow_df['month']
df_test['quarter'] = zillow_df['quarter']
#df.drop()
for c in df_test.columns:
    if df_test[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(df_test[c].fillna(-1).values))
        df_test[c] = lbl.transform(list(df_test[c].fillna(-1).values))
        
#df_test = impute_data(df_test)
df_test = drop_features(df_test)
df_test = df_test[X.columns]
#df_test.fillna(-1.0, inplace= True)
#df_test = df_test[zillow_features.drop('logerror',axis=1).columns.values]
for i, n in enumerate(df_test.columns):
    if n in cat_features:
        df_test[n] = df_test[n].astype(np.str)
        
FS = ['bathroom_cnt', 'bedroom_cnt', 'quality_id', 'floor1_sqft',
       'finished_area_sqft_calc', 'floor1_sqft_unk', 'base_total_area',
       'fireplace_cnt', 'bathroom_full_cnt', 'garage_cnt', 'garage_sqft',
       'heating_id', 'latitude', 'longitude', 'pool_total_size',
       'landuse_type_id', 'census_1', 'city_id', 'neighborhood_id',
       'patio_sqft', 'year_built', 'story_cnt', 'tax_structure', 'tax_parcel',
       'tax_property', 'tax_overdue_year', 'census_2', 'location_1',
       'location_2', 'location_3', 'location_4', 'missing_total_area',
       'derived_room_cnt', 'avg_area_per_room', 'derived_avg_area_per_room',
       'month', 'quarter']

#df_test = df_test[FS]

x_test = df_test.values
#df_test = df_test[X.columns]
"""
from sklearn.preprocessing import Imputer
imputer= Imputer()
imputer.fit(df_test.iloc[:, :])
x_test = imputer.transform(df_test.iloc[:, :])
"""
#print(x_test.shape)

In [None]:
y_pred = model.predict(x_test)
y_pred = y_pred.flatten()
#y_pred = scaler_y.inverse_transform(y_pred_ann).flatten()

output = pd.DataFrame({'ParcelId': prop_2016['parcelid'].astype(np.int32),
        '201610': y_pred, '201611': y_pred, '201612': y_pred,
        '201710': y_pred, '201711': y_pred, '201712': y_pred})

output.to_csv('model/score/submission_02_CatBoost.csv', index=False)
print(output.head())

In [None]:
#Create test set
prop_2016 = pd.read_csv('prop_2016.csv')

sample = pd.read_csv('sample_sub.csv')
sample['parcelid'] = sample['ParcelId']
sample = sample.drop('Unnamed: 0', axis = 1)

df_test = sample.merge(prop_2016, on='parcelid', how='left')

#add datetime
df_test['year'] = zillow_df['year']
df_test['month'] = zillow_df['month']
df_test['quarter'] = zillow_df['quarter']

for c in df_test.columns:
    if df_test[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(df_test[c].fillna(-1).values))
        df_test[c] = lbl.transform(list(df_test[c].fillna(-1).values))
        
#df_test = impute_data(df_test)
df_test = drop_features(df_test)
#df_test = df_test[zillow_features.drop('logerror',axis=1).columns.values]

"""
df_test = df_test[['bathroom_cnt', 'bedroom_cnt', 'floor1_sqft', 'finished_area_sqft_calc',
       'floor1_sqft_unk', 'bathroom_full_cnt', 'latitude', 'longitude',
       'region_zip', 'unit_cnt', 'tax_structure', 'tax_parcel', 'tax_land',
       'tax_property', 'property_tax_per_sqft', 'location_1', 'location_2',
       'location_3', 'location_4', 'derived_room_cnt', 'avg_area_per_room']]
"""
df_test = df_test[['bathroom_cnt', 'bedroom_cnt', 'quality_id', 'floor1_sqft',
       'finished_area_sqft_calc', 'floor1_sqft_unk', 'base_total_area',
       'fireplace_cnt', 'bathroom_full_cnt', 'garage_cnt', 'garage_sqft',
       'heating_id', 'latitude', 'longitude', 'pool_total_size',
       'landuse_type_id', 'census_1', 'city_id', 'neighborhood_id',
       'patio_sqft', 'year_built', 'story_cnt', 'tax_structure', 'tax_parcel',
       'tax_property', 'tax_overdue_year', 'census_2', 'location_1',
       'location_2', 'location_3', 'location_4', 'missing_total_area',
       'derived_room_cnt', 'avg_area_per_room', 'derived_avg_area_per_room',
       'month', 'quarter']]

#df_test = df_test[X.columns]
#df_test.fillna(-1.0, inplace= True)

from sklearn.preprocessing import Imputer
imputer= Imputer()
imputer.fit(df_test)
x_test = imputer.transform(df_test)

#x_test = df_test[Best_score[3].values]

#print(x_test.shape)

In [None]:
from sklearn.model_selection import KFold

def create_model_zillow(normX=10, optimizer='adam', learning_rate = 0.001):
    nn = Sequential()
    nn.add(Dense(units = 160 , kernel_initializer = 'he_normal', activation = 'tanh', input_dim = normX.shape[1]))
    nn.add(Dense(units = 80 , kernel_initializer = 'he_normal', activation = 'relu'))
    nn.add(Dense(units = 25 , kernel_initializer = 'he_normal', activation = 'relu'))
    nn.add(Dense(1, kernel_initializer='he_normal'))
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    nn.compile(loss='mae', optimizer=adam)

    #print(model.summary())
    return nn

FS = df_test.columns

Xs = X[FS]

sc = StandardScaler()
Xs = sc.fit_transform(X[FS])

model = create_model_zillow(Xs)
"""
cv = KFold(n_splits=4, random_state=42, shuffle=False)
for train_index, test_index in cv.split(Xs):
    X_train, X_test, y_train, y_test = Xs[train_index], Xs[test_index], y[train_index], y[test_index]
    model.fit(X_train, y_train ,epochs=60, verbose=2, batch_size=32)
    print('------End of fold---------')
"""

#normData, Scaler = normalize(Xs, y)

model.fit(Xs, y, epochs = 60, batch_size=32, verbose = 2)
#y_pred = model.predict(x_test)

#print(metrics.mean_squared_error(normY[test], pred,))

In [None]:
#x_test = MinMaxScaler().fit_transform(x_test)
x_test = sc.transform(x_test)

#y_pred_ann = loaded_model.predict(x_test)
y_pred = model.predict(x_test)
y_pred = y_pred.flatten()
#y_pred = scaler_y.inverse_transform(y_pred_ann).flatten()

output = pd.DataFrame({'ParcelId': prop_2016['parcelid'].astype(np.int32),
        '201610': y_pred, '201611': y_pred, '201612': y_pred,
        '201710': y_pred, '201711': y_pred, '201712': y_pred})

In [None]:
output.to_csv('model/score/submission_06_3-FS.csv', index=False)
print(output.head())

In [None]:
MAAPE_score = score[0]
RMSE_score = score[1]
MAE_score = score[2]

print(X.shape)
print(len(MAAPE_score))
print(len(MAAPE_RFE))
print(len(MAAPE_RLF))

x_plt = range(1,X.shape[1]+1)
y1 = MAAPE_score
y2 = MAAPE_RFE
y3 = MAAPE_RLF

plt.style.use('seaborn-whitegrid')

fig, ax = plt.subplots(3)
ax[0].scatter(x_plt, y1, linewidth=2, c='green')
ax[1].scatter(x_plt, y2, linewidth=2, marker = '^')
ax[2].scatter(x_plt, y3, linewidth=2, c='red', marker = '+')

for i, txt in enumerate(y1):
    ax[0].annotate(s='%.3f'%y1[i], xy=(x_plt[i], y1[i]), xytext =(x_plt[i], y1[i]+0.5))
    ax[1].annotate(s='%.3f'%y2[i], xy=(x_plt[i], y2[i]), xytext =(x_plt[i], y2[i]+0.5))
    ax[2].annotate(s='%.3f'%y3[i], xy=(x_plt[i], y3[i]), xytext =(x_plt[i], y3[i]+0.5))

In [None]:
#Friedman
score = [[30.473142470555143, 27.524494961588864, 26.605383547798986, 23.22201894753832, 15.27896139801174, 15.36745466501974, 15.711099321309906, 15.72440426777695, 15.476193287964936, 16.127032362238708], [5.049074973674501, 4.551485525981321, 4.320769171530718, 3.804615991930487, 2.546364366210144, 2.619511759671777, 2.6712968881620966, 2.627200736599554, 2.5760268708290726, 2.6155109889741155], [4.104870167962773, 3.676849911772784, 3.509723176576311, 3.0744177133171386, 1.9530088838472508, 2.0300163041462653, 2.055415123054879, 2.077687303564899, 2.007485151557529, 2.0278159884068554]]
comp_score = ([[30.473142470555143, 27.524494961588864, 26.605383547798986, 23.22201894753832, 15.27896139801174, 15.36745466501974, 15.711099321309906, 15.72440426777695, 15.476193287964936, 16.127032362238708], [5.049074973674501, 4.551485525981321, 4.320769171530718, 3.804615991930487, 2.546364366210144, 2.619511759671777, 2.6712968881620966, 2.627200736599554, 2.5760268708290726, 2.6155109889741155], [4.104870167962773, 3.676849911772784, 3.509723176576311, 3.0744177133171386, 1.9530088838472508, 2.0300163041462653, 2.055415123054879, 2.077687303564899, 2.007485151557529, 2.0278159884068554]], [[[26.55372129318402, 21.881879098311042, 18.111132191390166, 15.358869570274464, 15.820994934186828, 16.309656175237414, 15.528439099502176, 15.827722810709627, 16.020385590257167, 15.858275496878587], [4.2103912774873065, 3.4719257065339284, 2.9478176558982163, 2.5718204605288655, 2.604625462867344, 2.7068780391950265, 2.588527680957001, 2.606214539563855, 2.6884476055062225, 2.6140861015572776], [3.4677701423071654, 2.805577983714497, 2.341127548497288, 2.0042473374892174, 2.0367794520028735, 2.103645841729993, 2.005077985338816, 2.0268774602921793, 2.0710612564211077, 1.9893827088903238]]], [[[29.933526726938254, 30.136473180118106, 30.33066703827501, 30.075184152073348, 29.196192868059093, 27.929515699260776, 26.740085995113578, 20.45699571587513, 21.223858383283602, 15.675444064475759], [4.2103912774873065, 3.4719257065339284, 2.9478176558982163, 2.5718204605288655, 2.604625462867344, 2.7068780391950265, 2.588527680957001, 2.606214539563855, 2.6884476055062225, 2.6140861015572776], [3.4677701423071654, 2.805577983714497, 2.341127548497288, 2.0042473374892174, 2.0367794520028735, 2.103645841729993, 2.005077985338816, 2.0268774602921793, 2.0710612564211077, 1.9893827088903238]]])

MAAPE_score = score[0]
RMSE_score = score[1]
MAE_score = score[2]

RFE_comp = comp_score[1]
RMSE_RFE = comp_score[1][0][1]

x1 = ['x1','x2','x4','x4','x5']
y1 = RMSE_score[:5]
y2 = RMSE_RFE[:5]
y3 = RMSE_RLF[:5]

plt.style.use('seaborn-whitegrid')

fig, ax = plt.subplots(3)
ax[0].scatter(x_plt, y1, linewidth=2, c='green')
ax[0].set_xlim([0.5,5.5])
ax[1].scatter(x_plt, y2, linewidth=2, marker='^')
ax[1].set_xlim([0.5,5.5])
ax[2].scatter(x_plt, y3, linewidth=2, marker='+', c='red')
ax[2].set_xlim([0.5,5.5])


labels_1 = ['','x3','x2','x5','x1','x4']
ax[0].set_xticklabels(labels_1)

labels_1 = ['','x4','x2','x1','x5','x10']
ax[1].set_xticklabels(labels_1)

for i, txt in enumerate(y1):
    ax[0].annotate(s='%.3f'%y1[i], xy=(x_plt[i], y1[i]), xytext =(x_plt[i], y1[i]+0.2))
    ax[1].annotate(s='%.3f'%y2[i], xy=(x_plt[i], y2[i]), xytext =(x_plt[i], y2[i]+0.2))
    ax[2].annotate(s='%.3f'%y3[i], xy=(x_plt[i], y3[i]), xytext =(x_plt[i], y3[i]+0.02))

In [None]:
#Feature Selection : RFE
from sklearn import linear_model
from sklearn.feature_selection import RFE
# load data
#dataframe = house_df.drop(['SalePrice'],axis = 1)
#dataframe.drop(['ProvinceID','DistrictID','UserType','PropertyType'], axis = 1, inplace = True)
#target = house_df['SalePrice']

RFE_FS = []
high_score_RFE=0
nof_RFE = 0
MAAPE_RFE = []
RMSE_RFE = []
MAE_RFE = []
RFE_Score = []
sel_f = np.nan
# feature extraction
for k in range(0, X.shape[1]):
    print("Ite :", k+1)
    model = linear_model.LinearRegression()
    rfe = RFE(model,k+1)
    X_rfe = rfe.fit_transform(normData[0], normData[1])
    temp = pd.Series(rfe.support_, index = X.columns)
    selected_features_rfe = temp[temp==True].index
    print(selected_features_rfe)

    model, history_RFE, res, RFE_mape, RFE_maape, RFE_rmse, RFE_mae = train_Shuffle(X_rfe, y, 150, 16,norm = True)

    print("Avg MAAPE: %.2f%% (+/- %.2f%%)" % (np.mean(RFE_maape), np.std(RFE_maape)))
    print("Avg RMSE: %.2f%% (+/- %.2f%%)" % (np.mean(RFE_rmse), np.std(RFE_rmse)))
    print("Avg MAE: %.2f%% (+/- %.2f%%)" % (np.mean(RFE_mae), np.std(RFE_mae)))
    
    MAAPE_RFE.append(np.mean(RFE_maape))
    RMSE_RFE.append(np.mean(RFE_rmse))
    MAE_RFE.append(np.mean(RFE_mae))
    RFE_FS.append(temp)
    if(np.mean(RFE_rmse) < high_score_RFE):
        print('New Record!')
        high_score_RFE = np.mean(RFE_rmse)
        nof_RFE = X_rfe.shape[1]
        sel_f = selected_features_rfe
        
RFE_Score = [MAAPE_RFE, RMSE_RFE, MAE_RFE]

In [None]:
print(high_score_RFE)
print(nof_RFE)
print(sel_f)

plt.scatter(range(1,len(r2_RFE)+1), rmse_RFE)

In [None]:
import sklearn_relief as relief

y_list = []
for i in y.values:
    y_list.append(i[0])
y_list = np.asarray(y_list)

high_score_RLF = 100
nof_RLF = 0
sel_RLF = []
MAAPE_RLF = []
RMSE_RLF = []
MAPE_RLF = []
MAE_RLF = []
RLF_Score = []

for k in range(0,X.shape[1]):
    print('Iteration: ', k+1)
    r = relief.RReliefF(n_jobs=1, n_features = k+1)
    X_RLF = r.fit_transform(X.values, y_list)
    sel_RLF.append(X_RLF)

    model_RLF, history_RLF, res, RLF_mape, RLF_maape, RLF_rmse, RLF_mae = train_Shuffle(X_RLF, y, 100,16)
    
    print("Avg MAAPE: %.2f%% (+/- %.2f%%)" % (np.mean(RLF_maape), np.std(RLF_maape)))
    print("Avg RMSE: %.2f%% (+/- %.2f%%)" % (np.mean(RLF_rmse), np.std(RLF_rmse)))
    
    RMSE_RLF.append(np.mean(RLF_rmse))
    MAPE_RLF.append(np.mean(RLF_mape))
    MAAPE_RLF.append(np.mean(RLF_maape))
    MAE_RLF.append(np.mean(RLF_mae))
    if(np.mean(RLF_rmse) < high_score_RLF):
        print('New Record!')
        high_score_RLF = np.mean(RLF_rmse)
        nof_RLF = k

RLF_Score = [MAAPE_RLF, RMSE_RLF, MAE_RLF]

In [None]:
Scores = []
RLF_Score = [MAAPE_RLF, RMSE_RLF, MAE_RLF]
RFE_Score = [[[25.272627511637463,21.174547376864002,17.7255826783602,15.651749607165133,16.455264568189722,15.630521473014335,17.075491796367615,15.559287132699529,17.311902544858057,15.374725296968132],[4.1359020402279345,3.4572107607236555,2.855058535957814,2.4631819795571217,2.6538084828702626,2.5270956763719603,2.7123168325586544,2.5950615464680595,2.7969309531917426,2.553069299888645],[3.422283506529446,2.739075642052992,2.2423720075785685,1.9547090107450167,2.059485032023333,1.9648064177913915,2.149548692290976,2.009688620488025,2.19583138148212,1.9822889501347647]]]
IGarson_Score = [[30.473142470555143, 27.524494961588864, 26.605383547798986, 23.22201894753832, 15.27896139801174, 15.36745466501974, 15.711099321309906, 15.72440426777695, 15.476193287964936, 16.127032362238708], [5.049074973674501, 4.551485525981321, 4.320769171530718, 3.804615991930487, 2.546364366210144, 2.619511759671777, 2.6712968881620966, 2.627200736599554, 2.5760268708290726, 2.6155109889741155], [4.104870167962773, 3.676849911772784, 3.509723176576311, 3.0744177133171386, 1.9530088838472508, 2.0300163041462653, 2.055415123054879, 2.077687303564899, 2.007485151557529, 2.0278159884068554]]

Scores = IGarson_Score, RFE_Score, RLF_Score

file = open("model/Comparing_Score_Friedman.txt","w")
file.write(str(Scores))
file.close()

In [None]:
RLF_Score

In [None]:
print("Unselected Model")
print('Accuracy = %.5f with %d features' % (unsel_score, X.shape[1]))
print("Avg MAPE: %.5f%% (+/- %.3f%%)" % (np.mean(ss_mape), np.std(ss_mape)))

print("\nBoosting x IGarson")
print('Accuracy = %.5f with %d features' % (Best_score[0][0], Best_score[2]))

print("\nRLF")
print("Accuracy = %.5f with %d features" %(high_score_RLF, nof_RLF))

print("\nRFE")
print("Accuracy = %.5f with %d features" %(high_score_RFE, nof_RFE)) 