In [None]:
from itertools import combinations
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore
import tensorflow.keras as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [None]:
# list of features whose effect on the model accuracy should be checked
# use format ['f1', 'f2', 'f3']
# leave the list empty to check all features
list_features = []

# whether the final dataframe is written to a csv file at the end of the notebook
export_files = True
# number of best features that are kept in the data (the rest is dropped)
keep_features = 10

In [None]:
# load data
# judging by the file names these are preprocessed frames whose categorical
# columns are not one-hot encoded yet — TODO confirm against the preprocessing notebook
# clean_data is the frame the feature ranking below is computed on
clean_data = pd.read_csv('data/preprocess_nodummies_data.csv')
# the test/train frames only get the rejected features dropped further down
test_data = pd.read_csv('data/preprocess_nodummies_test_data.csv')
train_data = pd.read_csv('data/preprocess_nodummies_train_data.csv')

In [None]:
def create_dummies(df):
    """One-hot encode every object-dtype column of ``df``.

    The dummy columns (first level of each category dropped) are placed in
    front of the remaining columns, which are passed through unchanged.
    The input dataframe itself is not modified.
    Based on https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
    """
    # the categorical part of the dataframe
    categorical = df.select_dtypes(include='object')

    # everything that is not categorical stays as it is
    remainder = df.drop(columns=categorical.columns)

    # dummies for the categorical part
    encoded = pd.get_dummies(categorical, drop_first=True)

    # dummies first, untouched columns after
    return pd.concat([encoded, remainder], axis=1)

In [None]:
def preprocess(df, target, train_size = 0.7, random_state = 1265599650):
    """One-hot encode the features and split features and target into train/test arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns; object-dtype columns are one-hot encoded with
        ``create_dummies`` before the conversion to numpy.
    target : pandas.Series
        Target values, one per row of ``df``.
    train_size : float, default 0.7
        Passed to ``train_test_split``; the remainder becomes the test split.
    random_state : int, default 1265599650
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        The feature arrays are float arrays.
    """
    df = create_dummies(df)

    # get_dummies returns bool columns on pandas >= 2.0; together with numeric
    # columns a plain to_numpy() would produce an object-dtype array, which
    # Keras cannot turn into a tensor, so force a single float dtype
    X_np = df.to_numpy(dtype=float)
    y_np = target.to_numpy()

    # split the data according to train_size (70% training / 30% testing by default)
    X_train, X_test, y_train, y_test = train_test_split(
        X_np, y_np, train_size=train_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
def run_NN(df, y, runs = 3, ep = 100):
    """Train a small dense network several times and return its mean test RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns; encoded and split by ``preprocess``.
    y : pandas.Series
        Target values, one per row of ``df``.
    runs : int, default 3
        Number of independently initialised models that are trained; the
        returned score is the average over these runs.
    ep : int, default 100
        Number of training epochs per run.

    Returns
    -------
    float
        Root mean squared error on the test split, averaged over ``runs``.

    Raises
    ------
    ValueError
        If ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError('runs must be at least 1')

    # use the target handed in by the caller, not the notebook-level `target`
    X_train, X_test, y_train, y_test = preprocess(df, y)
    n_features = X_train.shape[1]

    score = 0.0

    # train a fresh model in every run
    for _ in range(runs):

        # create layers; the input shape belongs to the model input, an
        # `input_shape` on a later layer is not used by Sequential
        model = tf.Sequential([
            tf.Input(shape=(n_features,)),
            Dense(160),
            Dense(224, activation='tanh'),
            Dense(112, activation='tanh'),
            Dense(240, activation='tanh'),
            Dense(1),
        ])

        # compile model
        model.compile(
            optimizer='Adam',
            loss='mean_squared_error',
            metrics=[tf.metrics.RootMeanSquaredError()],
        )

        # train the model; the test split doubles as validation data
        model.fit(X_train, y_train, batch_size=64, epochs=ep, validation_data=(X_test, y_test))

        # compute the RMSE from scratch instead of reading it from the stateful
        # training metric, which still carries the last validation pass;
        # both arrays are flattened so (n,) and (n, 1) do not broadcast to (n, n)
        y_pred = model.predict(X_test)
        residuals = np.asarray(y_test, dtype=float).reshape(-1) - np.asarray(y_pred, dtype=float).reshape(-1)
        score += float(np.sqrt(np.mean(residuals ** 2)))

    return score / runs

In [None]:
# baseline: fit the simple neural network on every available feature
target = clean_data['SalePrice']
all_data = clean_data.drop('SalePrice', axis=1)
score = run_NN(all_data, target)
print('Score of model with all data is:', score)

In [None]:
# score of the network when a single feature is left out, keyed by that feature
accuracies = {}

# all predictors, i.e. the complete dataframe without the target column
features_only = clean_data.drop(columns='SalePrice')

# candidates to leave out: every predictor, or only the ones selected in list_features
check_features = features_only if list_features == [] else clean_data[list_features]

# leave each candidate out in turn (iterating a dataframe yields its column names)
for feature in check_features:

    # complete set of predictors minus the feature under test
    pop_data = features_only.drop(columns=[feature])

    # run the very simple neural network without that feature
    score = run_NN(pop_data, target)

    # remember the score together with the removed feature
    accuracies[feature] = score

In [None]:
# accuracies maps every feature to the RMSE of the model trained WITHOUT it:
# a low RMSE means the model did fine (or better) without the feature, so the
# feature is a bad one; a high RMSE means the feature was needed
# sort the features by that RMSE from low to high (ties are broken by name)
ranked = sorted(accuracies.items(), key=lambda item: (item[1], item[0]))
sortdict = dict(ranked)
print(sortdict)

# features ordered from bad to good (low to high RMSE after removal)
worst_features = list(sortdict)

# the last `keep_features` entries are the best features and are kept, the
# rest is marked for removal; slicing by length also handles keep_features = 0,
# where `del worst_features[-0:]` would empty the list and remove nothing
n_remove = max(len(worst_features) - keep_features, 0)
worst_features = worst_features[:n_remove]
display(worst_features)

In [None]:
# drop all rejected features from each of the three dataframes in one call
clean_data = clean_data.drop(columns=worst_features)
train_data = train_data.drop(columns=worst_features)
test_data = test_data.drop(columns=worst_features)

In [None]:
# show the dataframe that is left after the worst features were dropped
display(clean_data)

In [None]:
# one-hot encode the remaining categorical columns so the exported data contains dummies
clean_data = create_dummies(clean_data)

In [None]:
# only write to disk when exporting is switched on in the settings cell
if export_files:
    # export the final dataframe (best features only, with dummies) to a csv file without the index
    clean_data.to_csv('data/bestfeatures_data.csv', index=False)  