In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
import xgboost as xgb
# from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split

In [51]:
preprocessed_data = pd.read_csv('../data/processed_data/feature_engineered_data.csv')

In [52]:
from sklearn.preprocessing import StandardScaler
#Standerdize
def scalar(X_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [53]:
from sklearn.decomposition import PCA
# Perform PCA for dimensionality reduction
def apply_pca(xt,n):
    pca = PCA(n_components=n)
    X_pca = pca.fit_transform(xt)
    return X_pca

In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from math import sqrt
# from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

def model_training (X_train, X_test, y_train, y_test) :

    model_list = ['Decision Tree']
    #model_list = ['Decision Tree','Linear Regression','XgBoost', 'Random Forest', 'Ridge', 'Lasso' ]
    results_df = pd.DataFrame()
    for model in model_list:
        #Model Training and parameter tuning using grid search
        if model == "XgBoost":
            model_name = xgb.XGBRegressor()
            print ("XgBoost Model Training...........")
            model_name.fit(X_train, y_train)

        elif model == "Random Forest" :
            model_name = RandomForestRegressor()
            print ("Random Forest Model Training...........")
            model_name.fit(X_train, y_train)

        elif model == "Ridge" : 
            model_name = Ridge()
            print ("Ridge Training...........")
            model_name.fit(X_train, y_train)

        elif model == "Lasso":    
            model_name = Lasso()
            print ("Lasso Model Training...........")
            model_name.fit(X_train, y_train)

        elif model == "Decision Tree":    
            model_name = DecisionTreeRegressor(random_state=0)
            print ("Decision Tree Model Training...........")
            model_name.fit(X_train, y_train)

        elif model == "Linear Regression":    
            model_name = LinearRegression()
            print ("Linear Regression Model Training...........")
            model_name.fit(X_train, y_train)
        
        results_df = predictandaccuracyModel(X_test, y_test, results_df, model_name)

    return (results_df)

def predictandaccuracyModel(X_test, y_test, model, model_name):
    predictions =  model_name.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = sqrt(mse)
    results_df = pd.DataFrame()

        #Normalized rmse
    '''avg = y.max() - y.min()
        rmseN = rmse/avg'''
    
    print(f'rmse {model_name} with PCA: {rmse}')
        
    mae = mean_absolute_error(y_test, predictions)
    print(f'MAE {model_name} with PCA: {mae}')

    mape = mean_absolute_percentage_error(y_test, predictions)
    print(f'MAPE {model_name} with PCA: {mape}')

        # print (predictions)
    data = {'Actual': list(y_test), 'Predicted': predictions, 'Model': model_name} 
    model_pred = pd.DataFrame(data)

        #results_df = results_df.append(model_pred, ignore_index = True)
    results_df = pd.concat([model_pred,results_df])
    return results_df
  

In [55]:
df = preprocessed_data

drop_cols = ['SalePrice']
traincols = df.columns
feature_cols  = list(set(traincols)-set(drop_cols))
label_col = 'SalePrice'


X = df[df['type'] == 'train'][feature_cols]
y = df[df['type'] == 'train'][label_col]
X = X.drop(columns=['type'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_test = scalar(X_train, X_test )
X_train = apply_pca(X_train,10)
X_test = apply_pca(X_test,10)
predictions = model_training(X_train, X_test, y_train, y_test)
print (predictions)

Decision Tree Model Training...........
rmse DecisionTreeRegressor(random_state=0) with PCA: 118789.06350195389
MAE DecisionTreeRegressor(random_state=0) with PCA: 95782.18493150685
MAPE DecisionTreeRegressor(random_state=0) with PCA: 0.6492746184676093
       Actual  Predicted                                  Model
0    155000.0   128000.0  DecisionTreeRegressor(random_state=0)
1    260000.0   164000.0  DecisionTreeRegressor(random_state=0)
2    120000.0   189000.0  DecisionTreeRegressor(random_state=0)
3    168500.0   132250.0  DecisionTreeRegressor(random_state=0)
4    126000.0   189000.0  DecisionTreeRegressor(random_state=0)
..        ...        ...                                    ...
287  130500.0   159000.0  DecisionTreeRegressor(random_state=0)
288  133000.0   194500.0  DecisionTreeRegressor(random_state=0)
289  184100.0   135750.0  DecisionTreeRegressor(random_state=0)
290  135000.0   228000.0  DecisionTreeRegressor(random_state=0)
291  156500.0   230000.0  DecisionTreeRegr

In [56]:
predictions.to_csv("../data/predictions/model_predictions.csv", index=False)

In [57]:
import numpy as np
from sklearn.model_selection import GridSearchCV
parameters = {'criterion':['squared_error','absolute_error'],
              'max_depth':np.arange(1,21).tolist()[0::2],
              'min_samples_split':np.arange(2,11).tolist()[0::2],
              'max_leaf_nodes':np.arange(3,26).tolist()[0::2]}

# create an instance of the grid search object
g2 = GridSearchCV(DecisionTreeRegressor(), parameters, cv=5, n_jobs=-1)

# conduct grid search over the parameter space
g2.fit(X_train,y_train)

# show best parameter configuration found for regressor
rgr_params1 = g2.best_params_
rgr_params1

{'criterion': 'absolute_error',
 'max_depth': 7,
 'max_leaf_nodes': 25,
 'min_samples_split': 10}

In [58]:
model = g2.best_estimator_
y_pred = model.predict(X_test)
print('rmse score:', sqrt(mean_squared_error(y_test,y_pred)))
print('mae score: %.2f' % mean_absolute_error(y_test,y_pred))
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

rmse score: 131109.1996429388
mae score: 100651.74
MAPE: 0.7001565019843652
