In [107]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot


from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

import optuna
import xgboost as xgb
import catboost
from catboost import CatBoostRegressor

import warnings
pd.set_option('display.max_columns', None)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter('ignore', pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


In [108]:
train_wOrdinal = pd.read_csv('~/Documents/AmesHousingML/data_cleaning/data_ordinal.csv') 
train_originalCleaned = pd.read_csv('~/Documents/AmesHousingML/data_cleaning/data_original.csv') 
train_wOnlyDummies = pd.read_csv('~/Documents/AmesHousingML/data_cleaning/data_dummies.csv') 
train_wOnlyDummiesNoDrop = pd.read_csv('~/Documents/AmesHousingML/data_cleaning/data_dummies_noDrop.csv') 

In [109]:
masterScores = pd.DataFrame(columns = ['Model', 'data_process', 'r2_5kf_mean', 'rmse_5kf_mean'])
#featureBoard = pd.DataFrame(columns = ['Model', 'top_features'])
#dfTest = pd.DataFrame(columns = ['trial', 'shape'])

In [110]:
# Create a KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [111]:
ourFrames = [train_wOrdinal, train_wOnlyDummies, train_wOnlyDummiesNoDrop]
outlierType = ['threeXOutlier', 'onlyNormalCondition', 'allOutliers']

In [112]:
#Below is to set up the inidividual identification of which dataset is being used:
for dataframe in ourFrames:
    dataset = 'NaN'
    if (dataframe.equals(train_wOrdinal)):
        dataset = 'ordinal'
    elif (dataframe.equals(train_wOnlyDummies)):
        dataset = 'dummified'
    elif (dataframe.equals(train_wOnlyDummiesNoDrop)):
        dataset = 'dummifiedNoDrop'
        
    for trial in outlierType:
        #this will create the model name by which outlier type, below we will adjust the dataset
        # to match the outlier type being tested, it will then be stored by dataset_outlierType
        model_name = 'catboost'
        data_process = f"{dataset}_{trial}"
        
        #below is filtering out the 3*IQR range if the trial is for that purpose
        if (trial == 'threeXOutlier'):
            frame = dataframe.copy()
            outlier_threshold = 3
            Q1 = frame['SalePrice'].quantile(.25)
            Q3 = frame['SalePrice'].quantile(.75)
            IQR = Q3 - Q1
            new_bounds = Q3 + outlier_threshold * IQR
            frame.drop(frame[frame['SalePrice'] > new_bounds].index, axis=0, inplace = True)
            
        #below is filtering out the houses that sold in normal condition
        elif (trial == 'onlyNormalCondition'):
            frame = dataframe.copy()
            if (frame.equals(train_wOrdinal)):
                frame = frame[frame['SaleCondition'] == 4]
            elif (frame.equals(train_wOnlyDummies)):
                frame = frame[frame['SaleCondition_Normal'] == 1.0]
            elif (frame.equals(train_wOnlyDummiesNoDrop)):
                frame = frame[frame['SaleCondition_Normal'] == 1.0]
                
        elif (trial == 'allOutliers'):
            frame = dataframe.copy()
        
       
        #the features will be X (independent variables)
        X = frame.drop('SalePrice', axis=1)
        X_array = X.values
        #the target (dependent variable) will be y
        y = frame['SalePrice']
        y_array = y.values

        #Split your training and testing sets of data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        params = {'iterations': 100,'learning_rate': 0.1,'depth': 6,'loss_function': 'RMSE',  'random_seed': 42}
        #Input parameters into the model
        cat_model = CatBoostRegressor(**params)

        # Train the model on the training set
        cat_model.fit(X_train, y_train)

        # Make predictions on the test set
        cat_model_y_pred = cat_model.predict(X_test)

        # Display feature importances
        feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': cat_model.feature_importances_})
        feature_importances = feature_importances.sort_values(by='Importance', ascending=False).head(10)
        print('\nFeature Importances:')
        print(feature_importances)

        # Perform cross-validation
        cv_scores = cross_val_score(cat_model, X, y, cv=kf, scoring='r2')
        
        def rmse_scorer(y_true, y_pred):
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            return rmse
    
        # Define the scoring function using neg_mean_squared_error
        scorer = make_scorer(rmse_scorer)
        
        # Use cross_val_score with the defined scorer
        rmse_scores = cross_val_score(cat_model, X, y, cv=kf, scoring=scorer)
        
        # Display the cross-validation scores
        print("Cross-Validation Scores:", cv_scores, '\n')
        # Print the mean and standard deviation of the scores
        print(f"Mean R^2: {cv_scores.mean()}", '\n') 
        
      
        
        record = {'Model': model_name, 'data_process': data_process, 'r2_5kf_mean': cv_scores.mean(), 'rmse_5kf_mean': rmse_scores.mean()}
        masterScores = masterScores.append(record, ignore_index=True)
     
        
        print('\n')
        print('\n')
        print('\n')



0:	learn: 62039.0318265	total: 5.59ms	remaining: 553ms
1:	learn: 57901.6380184	total: 7.77ms	remaining: 381ms
2:	learn: 54222.5770881	total: 9.12ms	remaining: 295ms
3:	learn: 50617.1373621	total: 11ms	remaining: 265ms
4:	learn: 47593.4727018	total: 12.5ms	remaining: 238ms
5:	learn: 44581.1659470	total: 14ms	remaining: 219ms
6:	learn: 42073.7362097	total: 15.4ms	remaining: 205ms
7:	learn: 39718.0378720	total: 16.7ms	remaining: 193ms
8:	learn: 37667.0158873	total: 18ms	remaining: 182ms
9:	learn: 35805.6167241	total: 19.3ms	remaining: 174ms
10:	learn: 34120.0620273	total: 20.7ms	remaining: 168ms
11:	learn: 32669.2318919	total: 22.3ms	remaining: 163ms
12:	learn: 31433.9298204	total: 23.7ms	remaining: 159ms
13:	learn: 30261.4275237	total: 25.3ms	remaining: 155ms
14:	learn: 29211.7633444	total: 26.8ms	remaining: 152ms
15:	learn: 28087.8131506	total: 28.5ms	remaining: 149ms
16:	learn: 27250.6205381	total: 30ms	remaining: 146ms
17:	learn: 26450.0853649	total: 31.5ms	remaining: 144ms
18:	learn:

In [113]:
masterScores

Unnamed: 0,Model,data_process,r2_5kf_mean,rmse_5kf_mean
0,catboost,ordinal_threeXOutlier,0.921165,18681.014437
1,catboost,ordinal_onlyNormalCondition,0.938718,17394.528056
2,catboost,ordinal_allOutliers,0.920792,21088.095229
3,catboost,dummified_threeXOutlier,0.920114,18766.314419
4,catboost,dummified_onlyNormalCondition,0.93652,17705.618147
5,catboost,dummified_allOutliers,0.920787,21054.910504
6,catboost,dummifiedNoDrop_threeXOutlier,0.919677,18851.435847
7,catboost,dummifiedNoDrop_onlyNormalCondition,0.935308,17909.736594
8,catboost,dummifiedNoDrop_allOutliers,0.920196,21164.979171


In [114]:
train_wOrdinal.head()

Unnamed: 0,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,Fence,MoSold,YrSold,SaleType,SaleCondition,TotalHouseSF,TotalBathroomCount,QualityOutdoorSF,YearAndRemodAvg,NonHouseSF,HighQualFinSF,HouseLotRatio,FrontageLotRatio,QualityOutdoorLotRatio
0,856,126000,30,5,60.0,7890,1,1,3,3,0,0,0,19,2,2,0,2,6,6,1939,1950,1,0,13,14,2,0.0,3,3,1,4,4,2,3,238.0,1,0.0,618.0,856.0,1,3,1,3,856,0,0,1.0,0.0,1,0,2,1,3,4,6,1,5,5,1939.0,1,2.0,399.0,4,4,2,0,0,0,0,166,1,3,2010,9,4,1712.0,2.0,166,1944.5,6178.0,1712.0,21.698352,0.760456,2.103929
1,1049,139500,120,5,42.0,4235,1,1,3,3,0,4,0,7,2,2,4,2,5,5,1984,1984,1,0,6,6,1,149.0,4,3,1,5,4,3,3,552.0,2,393.0,104.0,1049.0,1,3,1,3,1049,0,0,1.0,0.0,2,0,2,1,4,5,6,0,1,1,1984.0,3,1.0,266.0,4,4,2,0,105,0,0,0,1,2,2009,9,4,2098.0,3.0,105,1984.0,2137.0,2098.0,49.539551,0.991736,2.479339
2,1001,124900,30,1,60.0,6060,1,1,3,3,0,4,0,10,2,2,0,2,5,9,1930,2007,3,0,8,8,2,0.0,4,3,0,4,4,2,2,737.0,1,0.0,100.0,837.0,1,5,1,3,1001,0,0,0.0,0.0,1,0,2,1,4,5,6,0,1,5,1930.0,1,1.0,216.0,4,2,0,154,0,42,86,0,1,11,2007,9,4,1838.0,1.0,282,1968.5,4222.0,1838.0,30.330033,0.990099,4.653465
3,1039,114000,70,5,80.0,8146,1,1,3,3,0,0,0,18,2,2,0,5,4,8,1900,2003,1,0,8,8,2,0.0,4,4,0,3,4,2,1,0.0,1,0.0,405.0,405.0,1,4,1,3,717,322,0,0.0,0.0,1,0,2,1,3,6,6,0,1,5,1940.0,1,1.0,281.0,4,4,0,0,0,168,0,111,1,5,2009,9,4,1444.0,1.0,279,1951.5,6702.0,1444.0,17.726492,0.982077,3.424994
4,1665,227000,60,5,70.0,8400,1,1,3,3,0,4,0,15,2,2,0,5,8,6,2001,2001,1,0,12,13,2,0.0,4,3,2,5,4,2,3,643.0,1,0.0,167.0,810.0,1,5,1,3,810,855,0,1.0,0.0,2,1,3,1,4,6,6,0,1,1,2001.0,3,2.0,528.0,4,4,2,0,45,0,0,0,1,11,2009,9,4,2475.0,3.5,45,2001.0,5925.0,2475.0,29.464286,0.833333,0.535714


In [115]:
#Below is to set up the inidividual identification of which dataset is being used:
for dataframe in ourFrames:
    dataset = 'NaN'
    if (dataframe.equals(train_wOrdinal)):
        dataset = 'ordinal'
    elif (dataframe.equals(train_wOnlyDummies)):
        dataset = 'dummified'
    elif (dataframe.equals(train_wOnlyDummiesNoDrop)):
        dataset = 'dummifiedNoDrop'
        
    for trial in outlierType:
        #this will create the model name by which outlier type, below we will adjust the dataset
        # to match the outlier type being tested, it will then be stored by dataset_outlierType
        model_name = 'catboost_tuned'
        data_process = f"{dataset}_{trial}"
        
        #below is filtering out the 3*IQR range if the trial is for that purpose
        if (trial == 'threeXOutlier'):
            frame = dataframe.copy()
            outlier_threshold = 3
            Q1 = frame['SalePrice'].quantile(.25)
            Q3 = frame['SalePrice'].quantile(.75)
            IQR = Q3 - Q1
            new_bounds = Q3 + outlier_threshold * IQR
            frame.drop(frame[frame['SalePrice'] > new_bounds].index, axis=0, inplace = True)
            
        #below is filtering out the houses that sold in normal condition
        elif (trial == 'onlyNormalCondition'):
            frame = dataframe.copy()
            if (frame.equals(train_wOrdinal)):
                frame = frame[frame['SaleCondition'] == 4]
            elif (frame.equals(train_wOnlyDummies)):
                frame = frame[frame['SaleCondition_Normal'] == 1.0]
            elif (frame.equals(train_wOnlyDummiesNoDrop)):
                frame = frame[frame['SaleCondition_Normal'] == 1.0]
                
        elif (trial == 'allOutliers'):
            frame = dataframe.copy()
        
       
        #the features will be X (independent variables)
        X = frame.drop('SalePrice', axis=1)
        X_array = X.values
        #the target (dependent variable) will be y
        y = frame['SalePrice']
        y_array = y.values

        #Split your training and testing sets of data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        params = {'iterations': 434, 'learning_rate': 0.10599212903564667, 'depth': 5, 'loss_function': 'RMSE', 'random_seed': 42}
        
        #Input parameters into the model
        cat_model = CatBoostRegressor(**params)

        # Train the model on the training set
        cat_model.fit(X_train, y_train)

        # Make predictions on the test set
        cat_model_y_pred = cat_model.predict(X_test)

        # Display feature importances
        feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': cat_model.feature_importances_})
        feature_importances = feature_importances.sort_values(by='Importance', ascending=False).head(10)
        print('\nFeature Importances:')
        print(feature_importances)

        # Perform cross-validation
        cv_scores = cross_val_score(cat_model, X, y, cv=kf, scoring='r2')
        
        def rmse_scorer(y_true, y_pred):
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            return rmse
    
        # Define the scoring function using neg_mean_squared_error
        scorer = make_scorer(rmse_scorer)
        
        # Use cross_val_score with the defined scorer
        rmse_scores = cross_val_score(cat_model, X, y, cv=kf, scoring=scorer)
        
        # Display the cross-validation scores
        print("Cross-Validation Scores:", cv_scores, '\n')
        # Print the mean and standard deviation of the scores
        print(f"Mean R^2: {cv_scores.mean()}", '\n') 
        
      
        
        record = {'Model': model_name, 'data_process': data_process, 'r2_5kf_mean': cv_scores.mean(), 'rmse_5kf_mean': rmse_scores.mean()}
        masterScores = masterScores.append(record, ignore_index=True)
     
        
        print('\n')
        print('\n')
        print('\n')

0:	learn: 61750.1691068	total: 1.16ms	remaining: 503ms
1:	learn: 57501.6941764	total: 2.4ms	remaining: 520ms
2:	learn: 53548.7964446	total: 3.69ms	remaining: 530ms
3:	learn: 50011.3242531	total: 4.86ms	remaining: 522ms
4:	learn: 46927.5587709	total: 6.11ms	remaining: 524ms
5:	learn: 44148.3419730	total: 7.19ms	remaining: 513ms
6:	learn: 41567.1317947	total: 8.22ms	remaining: 501ms
7:	learn: 39410.2609918	total: 9.29ms	remaining: 495ms
8:	learn: 37299.2585272	total: 10.7ms	remaining: 505ms
9:	learn: 35470.1420798	total: 12.9ms	remaining: 546ms
10:	learn: 33879.3181875	total: 14.7ms	remaining: 566ms
11:	learn: 32421.0798554	total: 15.9ms	remaining: 558ms
12:	learn: 31069.5504236	total: 16.9ms	remaining: 548ms
13:	learn: 29911.1368652	total: 18.6ms	remaining: 559ms
14:	learn: 28721.0116161	total: 19.7ms	remaining: 551ms
15:	learn: 27831.0061899	total: 21.8ms	remaining: 569ms
16:	learn: 26879.3958387	total: 23.1ms	remaining: 568ms
17:	learn: 26067.2735599	total: 24.7ms	remaining: 572ms
18:

In [116]:
masterScores

Unnamed: 0,Model,data_process,r2_5kf_mean,rmse_5kf_mean
0,catboost,ordinal_threeXOutlier,0.921165,18681.014437
1,catboost,ordinal_onlyNormalCondition,0.938718,17394.528056
2,catboost,ordinal_allOutliers,0.920792,21088.095229
3,catboost,dummified_threeXOutlier,0.920114,18766.314419
4,catboost,dummified_onlyNormalCondition,0.93652,17705.618147
5,catboost,dummified_allOutliers,0.920787,21054.910504
6,catboost,dummifiedNoDrop_threeXOutlier,0.919677,18851.435847
7,catboost,dummifiedNoDrop_onlyNormalCondition,0.935308,17909.736594
8,catboost,dummifiedNoDrop_allOutliers,0.920196,21164.979171
9,catboost_tuned,ordinal_threeXOutlier,0.9276,17793.103666


In [117]:
output_file = 'masterScores_catboost.csv'

masterScores.to_csv(output_file, index=False, mode='w')

print(f"DataFrame exported to '{output_file}' in the same working directory.")

DataFrame exported to 'masterScores_catboost.csv' in the same working directory.


In [118]:
c + a

NameError: name 'c' is not defined

In [None]:

# def objective(trial):
#     params = {
#         'iterations': trial.suggest_int('iterations', 50, 500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'depth': trial.suggest_int('depth', 4, 10),
#     }

#     model = CatBoostRegressor(**params, random_state=42)
#     model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10, verbose=10)

#     predictions = model.predict(X_test)
#     mse = mean_squared_error(y_test, predictions)
#     return mse

# #study = optuna.create_study(direction='minimize')
# #study.optimize(objective, n_trials=100)

# #best_params = study.best_params
# best_params = {'iterations': 434, 'learning_rate': 0.10599212903564667, 'depth': 5}
# best_model = CatBoostRegressor(**best_params, random_state=42)
# best_model.fit(X_train, y_train)

In [None]:
# best_params

In [None]:
# best_model

In [None]:
# best_pred = best_model.predict(X_test)

In [None]:
#the features will be X (independent variables)
model_name = 'catboost_tuned'

#Split your training and testing sets of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate the Random Forest model
best_r2 = r2_score(y_test, best_pred)
print(f'R-squared on the test set: {best_r2}')

best_model_mse = mean_squared_error(y_test, best_pred)
print(f'Mean Squared Error on the test set: {best_model_mse}')




# Create a KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(best_model, X, y, cv=kf, scoring='r2')
# You can replace 'r2' with other scoring metrics like 'neg_mean_squared_error', etc.

    

# Use cross_val_score with the defined scorer
rmse_scores = cross_val_score(best_model, X, y, cv=kf, scoring=scorer)



# Display the cross-validation scores
print("Cross-Validation Scores Rsquared:", cv_scores, '\n')


# Print the mean and standard deviation of the scores
print(f"Mean R^2: {cv_scores.mean()}", '\n')
print(f"Standard Deviation R^2: {cv_scores.std()}", '\n')

record = {'Model': model_name, 'r2_5kf_mean': cv_scores.mean(), 'rmse_5kf_mean': rmse_scores.mean()}
masterScores = masterScores.append(record, ignore_index=True)


In [None]:
# # Create a scatter plot with Plotly
# fig = go.Figure()

# # Add a scatter plot for actual vs. predicted values

# fig.add_trace(go.Scatter(x=y_test, y=best_pred, mode='markers', marker=dict(color='blue'), 
#                          name='Actual vs. Predicted'))

# # Add a diagonal line for perfect predictions
# fig.add_shape(type="line", x0=min(y), x1=max(y), y0=min(y), y1=max(y),
#               line=dict(color="red", width=3, dash='dash'),
#               name="Perfect Predictions")

# # Update layout and axis labels
# fig.update_layout(title="Actual vs. Predicted Values for Linear Regression Model",
#                   xaxis_title="Actual Values",
#                   yaxis_title="Predicted Values")

# # Show the plot
# fig.show()

In [119]:
import shap

# # Use the SHAP explainer
# explainer = shap.Explainer(best_model)

# # Calculate SHAP values for a subset of the data (e.g., test set)
# shap_values = explainer.shap_values(X_test)

# # Plot summary plot
# shap.summary_plot(shap_values, X_test)


In [120]:
# Use the SHAP explainer
explainer = shap.TreeExplainer(best_model)

# Calculate SHAP values for a subset of the data (e.g., test set)
shap_values = explainer.shap_values(X_test)

# 1. Summary Plot
shap.summary_plot(shap_values, X_test)

# 2. Summary Bar Plot
shap.summary_plot(shap_values, X_test, plot_type="bar")

# 3. Dependence Plot
shap.dependence_plot("OverallQual", shap_values, X_test)
shap.dependence_plot("HighQualFinSF", shap_values, X_test)
shap.dependence_plot("KitchenQual", shap_values, X_test)
shap.dependence_plot("YearAndRemodAvg", shap_values, X_test)
shap.dependence_plot("YearBuilt", shap_values, X_test)


# 4. Force Plot (for a specific prediction)
shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :])

# 5. Waterfall Plot (for a specific prediction)
shap.waterfall_plot(shap.Explanation(values=shap_values[0, :], base_values=explainer.expected_value), max_display=10)


CatBoostError: /Users/zomb-ml-platform-msk/go-agent-21.2.0/pipelines/BuildMaster/catboost.git/catboost/libs/data/model_dataset_compatibility.cpp:81: At position 2 should be feature with name MSZoning (found LotFrontage).

In [None]:
# #the features will be X (independent variables)
# model_name = 'catboost_tuned_gridsearchcv'

#     #the features will be X (independent variables)
# X = train_wOrdinal.drop('SalePrice', axis=1)
# X_array = X.values
# #the target (dependent variable) will be y
# y = train_wOrdinal['SalePrice']
# y_array = y.values

# #Split your training and testing sets of data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# # given depth 6 iteration 100 and lr 0.1 r2 = .9186129
# params = {'depth': 5, 'iterations': 500, 'learning_rate': 0.1}
#     # optuna params {'iterations': 434, 'learning_rate': 0.10599212903564667, 'depth': 5}
#     # 'iterations': 100,
#     # 'learning_rate': .9186129,
#     # 'depth': 6,
#     # 'loss_function': 'RMSE',  # Use appropriate loss function for your task
#     # 'random_seed': 42
#     #                     }

# # orig model params = 'iterations': 100,
# #     'learning_rate': 0.1,
# #     'depth': 6,
# #     'loss_function': 'RMSE',  # Use appropriate loss function for your task
# #     'random_seed': 42

# cat_model = CatBoostRegressor(**params)

# # Train the model on the training set
# cat_model.fit(X_train, y_train)

# # Make predictions on the test set
# cat_model_y_pred = cat_model.predict(X_test)

# # Evaluate the Random Forest model
# cat_model_r2 = r2_score(y_test, cat_model_y_pred)
# print(f'R-squared on the test set: {cat_model_r2}')

# cat_model_mse = mean_squared_error(y_test, cat_model_y_pred)
# print(f'Mean Squared Error on the test set: {cat_model_mse}')

# # Root Mean Squared Error (RMSE) on the test set
# cat_model_rmse = mean_squared_error(y_test, cat_model_y_pred, squared=False)
# print("Root Mean Squared Error (RMSE) on the test set:", cat_model_rmse)

# # Display feature importances
# feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': cat_model.feature_importances_})
# feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
# print('\nFeature Importances:')
# print(feature_importances)



# # Create a KFold object
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # Perform cross-validation
# cv_scores = cross_val_score(cat_model, X, y, cv=kf, scoring='r2')
# # You can replace 'r2' with other scoring metrics like 'neg_mean_squared_error', etc.

# def rmse_scorer(y_true, y_pred):
#         mse = mean_squared_error(y_true, y_pred)
#         rmse = np.sqrt(mse)
#         return rmse
    
    
# # Define the scoring function using neg_mean_squared_error
# scorer = make_scorer(rmse_scorer)
# # Use cross_val_score with the defined scorer
# rmse_scores = cross_val_score(cat_model, X, y, cv=kf, scoring=scorer)



# # Display the cross-validation scores
# print("Cross-Validation Scores Rsquared:", cv_scores, '\n')
# print("Cross-Validation Scores RMSE:", rmse_scores, '\n')

# # Print the mean and standard deviation of the scores
# print(f"Mean R^2: {cv_scores.mean()}", '\n')
# print(f"Standard Deviation R^2: {cv_scores.std()}", '\n')
# print(f"Mean RMSE: {rmse_scores.mean()}")

# record = {'Model': model_name, 'r2_5kf_mean': cv_scores.mean(), 'rmse_5kf_mean': rmse_scores.mean()}
# masterScores = masterScores.append(record, ignore_index=True)

In [None]:
# masterScores

In [None]:
# # Define the CatBoostRegressor
# catboost_model = CatBoostRegressor()


# # optuna params {'iterations': 434, 'learning_rate': 0.10599212903564667, 'depth': 5}
# # Define the parameter grid for grid search
# # param_grid = {
# #     'iterations': [300, 400, 500],
# #     'depth': [6, 8, 10],
# #     'learning_rate': [0.1, 0.05, 0.01]
# # }

# param_grid = {
#     'iterations': [100, 200, 300],
#     'depth': [5, 6, 8],
#     'learning_rate': [0.1, 0.05, 0.11]
# }

# # Define custom scoring functions
# def custom_r2_score(model, X, y):
#     predictions = model.predict(X)
#     return r2_score(y, predictions)

# def custom_rmse_score(model, X, y):
#     predictions = model.predict(X)
#     return np.sqrt(mean_squared_error(y, predictions))

# # Create scorer objects
# r2_scorer = make_scorer(custom_r2_score, greater_is_better=True)
# rmse_scorer = make_scorer(custom_rmse_score, greater_is_better=False)

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(
#     catboost_model, param_grid, scoring={'r2': r2_scorer, 'rmse': rmse_scorer}, cv=5, refit='r2'
# )
# grid_search.fit(X_train, y_train)

# # Get the best parameters and best estimator
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# # Get cross-validated scores
# r2_cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring=r2_scorer)
# rmse_cv_scores = -cross_val_score(best_model, X_train, y_train, cv=5, scoring=rmse_scorer)

# # Print the results
# print("Best Parameters:", best_params)
# print("Cross-validated R-squared scores:", r2_cv_scores)
# print("Cross-validated RMSE scores:", rmse_cv_scores)

In [None]:
# best_params

In [None]:
# best_model

In [None]:
# r2_cv_scores = cross_val_score(best_model, X, y, cv=kf, scoring='r2')

In [None]:
# r2_cv_scores.mean()

In [None]:
# with param grid = 100, 200, 300; depth was 6, 8, 10, and rate at .1 .05 .01... given depth 6 iteration 100 and lr 0.1 r2 = .9186129
# {'depth': 5, 'iterations': 100, 'learning_rate': 0.1}


In [None]:
# # Create the CatBoostRegressor
# catboost_gridmodel = CatBoostRegressor(random_state=42)

# # Define the hyperparameter grid
# param_grid_catboost = {
#     'iterations': [100, 250, 500],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'depth': [5, 10, 15],
# }

# # Create GridSearchCV object
# catboost_cv = GridSearchCV(catboost_gridmodel, param_grid_catboost, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# # Fit the model
# catboost_cv.fit(X_train, y_train)

# # Get the best parameters
# best_params_catboost = catboost_cv.best_params_

# print("Best Parameters for CatBoost:", best_params_catboost)


In [None]:
# best_params_catboost