In [8]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot


from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

import warnings
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Silence these warning categories for the rest of the notebook.
for _ignored_category in (
    FutureWarning,
    pd.errors.SettingWithCopyWarning,
    pd.errors.PerformanceWarning,
):
    warnings.simplefilter(action='ignore', category=_ignored_category)


In [9]:
# Directory holding the cleaned CSV exports; change this one value to point the
# notebook at a different location.
DATA_DIR = '~/Documents/AmesHousingML'

# Three CSV exports of the cleaned data, one DataFrame per file.
train_wOrdinal = pd.read_csv(f'{DATA_DIR}/clean_data_ordinal.csv')
train_originalCleaned = pd.read_csv(f'{DATA_DIR}/clean_data_original.csv')
train_wOnlyDummies = pd.read_csv(f'{DATA_DIR}/clean_data_dummified.csv')

In [10]:
# One (rows, columns) tuple per line, in the order: original, dummified, ordinal.
print(
    *(loaded.shape for loaded in (train_originalCleaned, train_wOnlyDummies, train_wOrdinal)),
    sep='\n',
)

(2576, 86)
(2576, 266)
(2576, 217)


## Finding & Dealing with Outliers

In [11]:
# Multiplier applied to the SalePrice interquartile range when the upper
# outlier bound is computed below (bound = Q3 + outlier_threshold * IQR).
outlier_threshold = 5

In [12]:
# Upper outlier bound for SalePrice in the ordinal frame:
# third quartile plus `outlier_threshold` interquartile ranges.
Q1, Q3 = (train_wOrdinal['SalePrice'].quantile(q) for q in (.25, .75))
IQR = Q3 - Q1
new_bounds = Q3 + IQR * outlier_threshold

In [13]:
# Row count before filtering. len() counts rows directly, independent of
# missing values in any particular column and without positional indexing
# into a label-indexed Series.
before_outlier_total = len(train_wOrdinal)

# Remove every row whose SalePrice lies above the upper bound computed in the
# previous cell; rows at or below the bound are kept.
outlier_index = train_wOrdinal.index[train_wOrdinal['SalePrice'] > new_bounds]
train_wOrdinal = train_wOrdinal.drop(index=outlier_index)

# Row count after filtering; the next cell reports the difference.
post_outlier_total = len(train_wOrdinal)

In [14]:
before_outlier_total - post_outlier_total

5

In [15]:
# Upper outlier bound for SalePrice in the original cleaned frame:
# third quartile plus `outlier_threshold` interquartile ranges.
Q1, Q3 = (train_originalCleaned['SalePrice'].quantile(q) for q in (.25, .75))
IQR = Q3 - Q1
new_bounds = Q3 + IQR * outlier_threshold

In [16]:
# Row count before filtering. len() counts rows directly, independent of
# missing values in any particular column and without positional indexing
# into a label-indexed Series.
before_outlier_total = len(train_originalCleaned)

# Remove every row whose SalePrice lies above the upper bound computed in the
# previous cell; rows at or below the bound are kept.
outlier_index = train_originalCleaned.index[train_originalCleaned['SalePrice'] > new_bounds]
train_originalCleaned = train_originalCleaned.drop(index=outlier_index)

# Row count after filtering; the next cell reports the difference.
post_outlier_total = len(train_originalCleaned)

In [17]:
before_outlier_total - post_outlier_total

5

In [18]:
# Upper outlier bound for SalePrice in the dummified frame:
# third quartile plus `outlier_threshold` interquartile ranges.
Q1, Q3 = (train_wOnlyDummies['SalePrice'].quantile(q) for q in (.25, .75))
IQR = Q3 - Q1
new_bounds = Q3 + IQR * outlier_threshold

In [19]:
# Row count before filtering. len() counts rows directly, independent of
# missing values in any particular column and without positional indexing
# into a label-indexed Series.
before_outlier_total = len(train_wOnlyDummies)

# Remove every row whose SalePrice lies above the upper bound computed in the
# previous cell; rows at or below the bound are kept.
outlier_index = train_wOnlyDummies.index[train_wOnlyDummies['SalePrice'] > new_bounds]
train_wOnlyDummies = train_wOnlyDummies.drop(index=outlier_index)

# Row count after filtering; the next cell reports the difference.
post_outlier_total = len(train_wOnlyDummies)

In [20]:
before_outlier_total - post_outlier_total

5

In [25]:
# Frames to model in the loop below, in this order: ordinal first, then dummified.
# NOTE(review): train_originalCleaned is not included here — presumably because
# it still holds non-numeric columns; confirm.
ourFrames = [train_wOrdinal, train_wOnlyDummies]

In [33]:
# Fit and evaluate a random forest on each frame in `ourFrames`, printing
# test-set R-squared, MSE, RMSE and the sorted feature importances per frame.
for frame in ourFrames:
    # Features (independent variables): every column except the target.
    X = frame.drop('SalePrice', axis=1)
    # Plain-array copy of the features; not used inside this loop.
    X_array = X.values
    # Target (dependent variable).
    y = frame['SalePrice']
    # Plain-array copy of the target; not used inside this loop.
    y_array = y.values

    # Hold out 20% of the rows for testing; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random forest with 100 trees; n_estimators can be tuned as needed.
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

    # Train on the training split, then predict the held-out split.
    rf_model.fit(X_train, y_train)
    rf_y_pred = rf_model.predict(X_test)

    # Evaluate the Random Forest model on the test set.
    rf_r2 = r2_score(y_test, rf_y_pred)
    print(f'R-squared on the test set: {rf_r2}')

    rf_mse = mean_squared_error(y_test, rf_y_pred)
    print(f'Mean Squared Error on the test set: {rf_mse}')

    # RMSE is the square root of the MSE computed above. Deriving it here
    # avoids the `squared=False` argument of mean_squared_error, which
    # scikit-learn deprecated in 1.4 and removed in 1.6.
    rf_rmse = float(np.sqrt(rf_mse))
    print("Root Mean Squared Error (RMSE) on the test set:", rf_rmse)

    # Feature importances, most important first.
    feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
    feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
    print('\nFeature Importances:')
    print(feature_importances)

R-squared on the test set: 0.9045741793721593
Mean Squared Error on the test set: 465255035.40146095
Root Mean Squared Error (RMSE) on the test set: 21569.77133400957

Feature Importances:
                   Feature    Importance
5              OverallQual  3.435228e-01
207           TotalHouseSF  2.326118e-01
212          HighQualFinSF  2.250370e-01
210        YearAndRemodAvg  1.871361e-02
1                GrLivArea  1.155774e-02
..                     ...           ...
129          RoofMatl_Roll  1.734466e-09
136     Exterior1st_CBlock  0.000000e+00
202  SaleCondition_AdjLand  0.000000e+00
156    Exterior2nd_PreCast  0.000000e+00
108        Condition2_RRAe  0.000000e+00

[216 rows x 2 columns]
R-squared on the test set: 0.9032621576281301
Mean Squared Error on the test set: 471651886.0541418
Root Mean Squared Error (RMSE) on the test set: 21717.547883086198

Feature Importances:
                 Feature  Importance
5            OverallQual    0.345135
261        HighQualFinSF    0.25