In [36]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot


from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor


from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

import optuna
import xgboost as xgb

import warnings
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Silence three warning categories for the rest of the session.
# NOTE(review): the blanket FutureWarning filter also hides library deprecation
# notices, so API removals in dependencies will not be visible ahead of time.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter('ignore', pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


In [37]:
# Directory holding the CSVs read below. Kept in one place so the location can
# be changed without touching each read_csv call. pandas expands the leading
# '~' to the current user's home directory.
DATA_DIR = '~/Documents/AmesHousingML/data_cleaning'

# One frame per CSV; the variable names mirror the file names
# (ordinal / original / dummies / dummies_noDrop).
train_wOrdinal = pd.read_csv(f'{DATA_DIR}/data_ordinal.csv')
train_originalCleaned = pd.read_csv(f'{DATA_DIR}/data_original.csv')
train_wOnlyDummies = pd.read_csv(f'{DATA_DIR}/data_dummies.csv')
train_wOnlyDummiesNoDrop = pd.read_csv(f'{DATA_DIR}/data_dummies_noDrop.csv')

In [38]:
# Shared cross-validation splitter: 5 shuffled folds with a fixed seed so every
# cross_val_score call that receives it sees the same partitions.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [39]:
# Empty results table; later cells add one row per evaluated model/data variant.
score_columns = ['Model', 'data_process', 'r2_5kf_mean', 'rmse_5kf_mean']
masterScores = pd.DataFrame(columns=score_columns)

In [40]:
# Frames the experiment loops iterate over (the "original cleaned" frame is
# not included).
ourFrames = [
    train_wOrdinal,
    train_wOnlyDummies,
    train_wOnlyDummiesNoDrop,
]

# Row-filtering variants applied to each frame before modelling.
outlierType = [
    'threeXOutlier',
    'onlyNormalCondition',
    'allOutliers',
]

In [41]:
def rmse_scorer(y_true, y_pred):
    """Return the root-mean-squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Wrap the RMSE function for cross_val_score. make_scorer is called with its
# defaults, so the per-fold values are plain (positive) RMSE numbers: fine for
# reporting, but not suitable as a "higher is better" selection criterion.
scorer = make_scorer(rmse_scorer)

model_name = 'scikit_gb'
params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'random_state': 42}

# Label -> frame lookup used to name each entry of ourFrames.
known_frames = {
    'ordinal': train_wOrdinal,
    'dummified': train_wOnlyDummies,
    'dummifiedNoDrop': train_wOnlyDummiesNoDrop,
}

# (column, value) that selects the "normal condition" sales in each encoding.
normal_sale_filter = {
    'ordinal': ('SaleCondition', 4),
    'dummified': ('SaleCondition_Normal', 1.0),
    'dummifiedNoDrop': ('SaleCondition_Normal', 1.0),
}

# Rows are collected here and added to masterScores once after the loop:
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame one
# row at a time is quadratic.
records = []

for dataframe in ourFrames:
    # Frames that match none of the known ones keep the original 'NaN' label.
    dataset = next(
        (label for label, known in known_frames.items() if dataframe.equals(known)),
        'NaN',
    )

    for trial in outlierType:
        # Each result is stored under "<dataset>_<outlier handling>".
        data_process = f"{dataset}_{trial}"

        # Always start from a fresh copy so an unrecognised trial name can never
        # silently reuse the filtered frame of the previous iteration.
        frame = dataframe.copy()

        if trial == 'threeXOutlier':
            # Drop rows whose SalePrice lies above Q3 + 3 * IQR.
            outlier_threshold = 3
            Q1 = frame['SalePrice'].quantile(.25)
            Q3 = frame['SalePrice'].quantile(.75)
            IQR = Q3 - Q1
            new_bounds = Q3 + outlier_threshold * IQR
            # "not greater than" (rather than "<=") keeps exactly the rows the
            # original index-based drop kept.
            frame = frame[~(frame['SalePrice'] > new_bounds)]

        elif trial == 'onlyNormalCondition' and dataset in normal_sale_filter:
            # Keep only the houses sold under normal sale condition.
            column, value = normal_sale_filter[dataset]
            frame = frame[frame[column] == value]

        # 'allOutliers' (and any other trial) uses every row unchanged.

        # Features (independent variables) and target (dependent variable).
        X = frame.drop('SalePrice', axis=1)
        y = frame['SalePrice']

        # Hold-out split; the model fitted on it is only used for the feature
        # importances printed below.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        gb_model = GradientBoostingRegressor(**params)
        gb_model.fit(X_train, y_train)

        # Display feature importances, most important first.
        feature_importances = pd.DataFrame(
            {'Feature': X.columns, 'Importance': gb_model.feature_importances_}
        ).sort_values(by='Importance', ascending=False)
        print('\nFeature Importances:')
        print(feature_importances)

        # Cross-validated R^2 and RMSE on the full (filtered) frame.
        cv_scores = cross_val_score(gb_model, X, y, cv=kf, scoring='r2')
        rmse_scores = cross_val_score(gb_model, X, y, cv=kf, scoring=scorer)

        # Display the cross-validation scores
        print("Cross-Validation Scores:", cv_scores, '\n')
        print(f"Mean R^2: {cv_scores.mean()}", '\n')

        record = {
            'Model': model_name,
            'data_process': data_process,
            'r2_5kf_mean': cv_scores.mean(),
            'rmse_5kf_mean': rmse_scores.mean(),
        }
        records.append(record)

        # Visual spacing between trials in the cell output.
        print('\n')
        print('\n')
        print('\n')

# Add all new rows in one step. Skipping the concat when the table is still
# empty avoids pandas' empty-frame dtype handling and keeps numeric columns numeric.
new_rows = pd.DataFrame(records, columns=masterScores.columns)
if masterScores.empty:
    masterScores = new_rows
else:
    masterScores = pd.concat([masterScores, new_rows], ignore_index=True)


Feature Importances:
               Feature  Importance
80       HighQualFinSF    0.312154
17         OverallQual    0.285288
75        TotalHouseSF    0.131952
19           YearBuilt    0.043418
76  TotalBathroomCount    0.038237
..                 ...         ...
35        BsmtFinType2    0.000000
62          GarageQual    0.000000
45        LowQualFinSF    0.000000
47        BsmtHalfBath    0.000000
42          Electrical    0.000000

[84 rows x 2 columns]
Cross-Validation Scores: [0.93301528 0.9064804  0.93036738 0.91971544 0.86746285] 

Mean R^2: 0.9114082705038372 








Feature Importances:
               Feature  Importance
17         OverallQual    0.315397
80       HighQualFinSF    0.287499
75        TotalHouseSF    0.181956
76  TotalBathroomCount    0.030817
78     YearAndRemodAvg    0.022552
..                 ...         ...
39             Heating    0.000000
45        LowQualFinSF    0.000000
68           3SsnPorch    0.000000
59        GarageFinish    0.000000
42     

In [42]:
# Display the score table accumulated so far.
masterScores

Unnamed: 0,Model,data_process,r2_5kf_mean,rmse_5kf_mean
0,scikit_gb,ordinal_threeXOutlier,0.911408,19722.476115
1,scikit_gb,ordinal_onlyNormalCondition,0.934772,17913.941231
2,scikit_gb,ordinal_allOutliers,0.913304,22060.491907
3,scikit_gb,dummified_threeXOutlier,0.910339,19856.202269
4,scikit_gb,dummified_onlyNormalCondition,0.933338,18117.270365
5,scikit_gb,dummified_allOutliers,0.914184,21953.270181
6,scikit_gb,dummifiedNoDrop_threeXOutlier,0.909811,19905.529479
7,scikit_gb,dummifiedNoDrop_onlyNormalCondition,0.934652,17922.807704
8,scikit_gb,dummifiedNoDrop_allOutliers,0.915274,21797.217046


In [43]:
def rmse_scorer(y_true, y_pred):
    """Return the root-mean-squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Wrap the RMSE function for cross_val_score. make_scorer is called with its
# defaults, so the per-fold values are plain (positive) RMSE numbers: fine for
# reporting, but not suitable as a "higher is better" selection criterion.
scorer = make_scorer(rmse_scorer)

model_name = 'scikit_gb_tuned'
# Fixed hyper-parameters for the tuned variant of the model.
params = {'n_estimators': 488, 'learning_rate': 0.09127662860772781, 'max_depth': 3, 'random_state': 42}

# Label -> frame lookup used to name each entry of ourFrames.
known_frames = {
    'ordinal': train_wOrdinal,
    'dummified': train_wOnlyDummies,
    'dummifiedNoDrop': train_wOnlyDummiesNoDrop,
}

# (column, value) that selects the "normal condition" sales in each encoding.
normal_sale_filter = {
    'ordinal': ('SaleCondition', 4),
    'dummified': ('SaleCondition_Normal', 1.0),
    'dummifiedNoDrop': ('SaleCondition_Normal', 1.0),
}

# Rows are collected here and added to masterScores once after the loop:
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame one
# row at a time is quadratic.
records = []

for dataframe in ourFrames:
    # Frames that match none of the known ones keep the original 'NaN' label.
    dataset = next(
        (label for label, known in known_frames.items() if dataframe.equals(known)),
        'NaN',
    )

    for trial in outlierType:
        # Each result is stored under "<dataset>_<outlier handling>".
        data_process = f"{dataset}_{trial}"

        # Always start from a fresh copy so an unrecognised trial name can never
        # silently reuse the filtered frame of the previous iteration.
        frame = dataframe.copy()

        if trial == 'threeXOutlier':
            # Drop rows whose SalePrice lies above Q3 + 3 * IQR.
            outlier_threshold = 3
            Q1 = frame['SalePrice'].quantile(.25)
            Q3 = frame['SalePrice'].quantile(.75)
            IQR = Q3 - Q1
            new_bounds = Q3 + outlier_threshold * IQR
            # "not greater than" (rather than "<=") keeps exactly the rows the
            # original index-based drop kept.
            frame = frame[~(frame['SalePrice'] > new_bounds)]

        elif trial == 'onlyNormalCondition' and dataset in normal_sale_filter:
            # Keep only the houses sold under normal sale condition.
            column, value = normal_sale_filter[dataset]
            frame = frame[frame[column] == value]

        # 'allOutliers' (and any other trial) uses every row unchanged.

        # Features (independent variables) and target (dependent variable).
        X = frame.drop('SalePrice', axis=1)
        y = frame['SalePrice']

        # Hold-out split; the model fitted on it is only used for the feature
        # importances printed below.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        gb_model = GradientBoostingRegressor(**params)
        gb_model.fit(X_train, y_train)

        # Display feature importances, most important first.
        feature_importances = pd.DataFrame(
            {'Feature': X.columns, 'Importance': gb_model.feature_importances_}
        ).sort_values(by='Importance', ascending=False)
        print('\nFeature Importances:')
        print(feature_importances)

        # Cross-validated R^2 and RMSE on the full (filtered) frame.
        cv_scores = cross_val_score(gb_model, X, y, cv=kf, scoring='r2')
        rmse_scores = cross_val_score(gb_model, X, y, cv=kf, scoring=scorer)

        # Display the cross-validation scores
        print("Cross-Validation Scores:", cv_scores, '\n')
        print(f"Mean R^2: {cv_scores.mean()}", '\n')

        record = {
            'Model': model_name,
            'data_process': data_process,
            'r2_5kf_mean': cv_scores.mean(),
            'rmse_5kf_mean': rmse_scores.mean(),
        }
        records.append(record)

        # Visual spacing between trials in the cell output.
        print('\n')
        print('\n')
        print('\n')

# Add all new rows in one step. Skipping the concat when the table is still
# empty avoids pandas' empty-frame dtype handling and keeps numeric columns numeric.
new_rows = pd.DataFrame(records, columns=masterScores.columns)
if masterScores.empty:
    masterScores = new_rows
else:
    masterScores = pd.concat([masterScores, new_rows], ignore_index=True)


Feature Importances:
               Feature  Importance
17         OverallQual    0.274732
80       HighQualFinSF    0.223454
75        TotalHouseSF    0.216544
19           YearBuilt    0.039443
76  TotalBathroomCount    0.037112
..                 ...         ...
42          Electrical    0.000006
14          Condition2    0.000006
35        BsmtFinType2    0.000003
9            Utilities    0.000000
47        BsmtHalfBath    0.000000

[84 rows x 2 columns]
Cross-Validation Scores: [0.94174232 0.91477871 0.94039009 0.9296729  0.87839295] 

Mean R^2: 0.9209953932889585 








Feature Importances:
               Feature  Importance
80       HighQualFinSF    0.376906
17         OverallQual    0.314955
75        TotalHouseSF    0.068115
76  TotalBathroomCount    0.036686
52         KitchenQual    0.020547
..                 ...         ...
70               Fence    0.000005
31            BsmtCond    0.000002
74       SaleCondition    0.000000
14          Condition2    0.000000
9      

In [44]:
# Display the score table accumulated so far.
masterScores

Unnamed: 0,Model,data_process,r2_5kf_mean,rmse_5kf_mean
0,scikit_gb,ordinal_threeXOutlier,0.911408,19722.476115
1,scikit_gb,ordinal_onlyNormalCondition,0.934772,17913.941231
2,scikit_gb,ordinal_allOutliers,0.913304,22060.491907
3,scikit_gb,dummified_threeXOutlier,0.910339,19856.202269
4,scikit_gb,dummified_onlyNormalCondition,0.933338,18117.270365
5,scikit_gb,dummified_allOutliers,0.914184,21953.270181
6,scikit_gb,dummifiedNoDrop_threeXOutlier,0.909811,19905.529479
7,scikit_gb,dummifiedNoDrop_onlyNormalCondition,0.934652,17922.807704
8,scikit_gb,dummifiedNoDrop_allOutliers,0.915274,21797.217046
9,scikit_gb_tuned,ordinal_threeXOutlier,0.920995,18588.952882


In [45]:
# Write the score table to a CSV at a relative path, overwriting any existing
# file and leaving out the row index.
output_file = 'masterScores_scikitBoosting.csv'
masterScores.to_csv(output_file, mode='w', index=False)
print(f"DataFrame exported to '{output_file}' in the same working directory.")

DataFrame exported to 'masterScores_scikitBoosting.csv' in the same working directory.


In [46]:
# NOTE(review): neither `b` nor `c` is defined anywhere in the visible notebook,
# so this cell raises NameError (as its recorded output shows). Presumably a
# deliberate tripwire that stops "Run All" before the exploratory cells below;
# confirm, and consider removing it so the notebook runs cleanly.
b - c

NameError: name 'b' is not defined

In [None]:
# Disabled Optuna hyper-parameter search (100 trials over n_estimators,
# learning_rate and max_depth of a GradientBoostingRegressor), kept for reference.
# NOTE(review): the objective scores each trial by MSE on X_test, so the chosen
# parameters are selected on the hold-out set; consider cross-validating on the
# training split instead if this is re-enabled.
# NOTE(review): X_train / X_test / y_train / y_test are not defined in this cell;
# they would come from whichever earlier cell last assigned them.
# def objective(trial):
    
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#     }

#     model = GradientBoostingRegressor(**params, random_state=42)
#     model.fit(X_train, y_train)

#     predictions = model.predict(X_test)
#     mse = mean_squared_error(y_test, predictions)
#     return mse

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# best_params = study.best_params
# best_model = GradientBoostingRegressor(**best_params, random_state=42)
# best_model.fit(X_train, y_train)

In [None]:
# Display the tuned hyper-parameters.
# NOTE(review): in the visible notebook `best_params` is only assigned in
# commented-out code and in a later cell, so on a fresh top-to-bottom run this
# presumably raises NameError; confirm.
best_params

In [None]:
#{'n_estimators': 488, 'learning_rate': 0.09127662860772781, 'max_depth': 3}
# for actual data, had error about left frame in for df name 
# {'n_estimators': 484, 'learning_rate': 0.061149100472762515, 'max_depth': 3}


In [None]:
# Display the tuned estimator.
# NOTE(review): in the visible notebook `best_model` is only assigned in
# commented-out code and in a later cell, so on a fresh top-to-bottom run this
# presumably raises NameError; confirm.
best_model

In [None]:
# Rebuild the tuned model from hard-coded hyper-parameters and fit it on
# whatever X_train / y_train currently hold (they are assigned in earlier cells).
best_params = {
    'n_estimators': 488,
    'learning_rate': 0.09127662860772781,
    'max_depth': 3,
}
best_model = GradientBoostingRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [None]:
# Tuned hyper-parameters plus the fixed seed, as a kwargs dict for
# GradientBoostingRegressor. A dict literal needs an assignment target and a
# quoted 'random_state' key; without them the cell is a SyntaxError.
# NOTE(review): the target name `params` is an assumption (it matches the dict
# name used by the experiment loops); confirm.
params = {'n_estimators': 488, 'learning_rate': 0.09127662860772781, 'max_depth': 3, 'random_state': 42}

In [None]:
# Predict with the tuned model on the current X_test (assigned in an earlier cell).
best_pred = best_model.predict(X_test)

In [None]:
model_name = 'scikit_boosting_tuned'

# Evaluate the tuned gradient-boosting model on the hold-out split.
best_r2 = r2_score(y_test, best_pred)
print(f'R-squared on the test set: {best_r2}')

best_model_mse = mean_squared_error(y_test, best_pred)
print(f'Mean Squared Error on the test set: {best_model_mse}')

# Create a KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validated R^2 on the current X / y (assigned in an earlier cell).
cv_scores = cross_val_score(best_model, X, y, cv=kf, scoring='r2')

# Cross-validated RMSE. The built-in scorer returns negated RMSE, so flip the
# sign back; this removes the dependency on a `scorer` object created inside
# an earlier cell's loop.
rmse_scores = -cross_val_score(best_model, X, y, cv=kf, scoring='neg_root_mean_squared_error')

# Display the cross-validation scores
print("Cross-Validation Scores Rsquared:", cv_scores, '\n')

# Print the mean and standard deviation of the scores
print(f"Mean R^2: {cv_scores.mean()}", '\n')
print(f"Standard Deviation R^2: {cv_scores.std()}", '\n')

# Include data_process so the row is not stored with a missing value in that
# column. NOTE(review): data_process, like X and y, is whatever the last
# executed experiment-loop iteration left behind; confirm that is the intended label.
record = {
    'Model': model_name,
    'data_process': data_process,
    'r2_5kf_mean': cv_scores.mean(),
    'rmse_5kf_mean': rmse_scores.mean(),
}
# DataFrame.append no longer exists in pandas >= 2.0; concat a one-row frame instead.
masterScores = pd.concat([masterScores, pd.DataFrame([record])], ignore_index=True)

In [None]:
# NOTE(review): shap is imported here rather than in the import cell at the top
# of the notebook, so it is only required when this cell is run.
import shap

# Build a SHAP explainer for the tuned model
explainer = shap.Explainer(best_model)

# Compute SHAP values for the current X_test (assigned in an earlier cell)
shap_values = explainer.shap_values(X_test)

# Draw the SHAP summary plot for those values
shap.summary_plot(shap_values, X_test)

In [None]:
# Display the score table accumulated so far.
masterScores

In [None]:
# Write the score table to a CSV at a relative path, overwriting any existing
# file and leaving out the row index.
output_file = 'masterScores_scikitBoosting.csv'
masterScores.to_csv(output_file, mode='w', index=False)
print(f"DataFrame exported to '{output_file}' in the same working directory.")