In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the pre-processed diamond dataset.
# The CSV location can be overridden with the DIAMOND_DATA_PATH environment
# variable so the notebook is not tied to a single machine's directory layout.
# When the variable is unset, the original local path is used unchanged.
default_filename = r"C:\Users\BenBuczek\OneDrive - KIPP New Orleans, Inc\Desktop\Springboard\capstone_2.3_diamond_data.csv"
filename = os.environ.get("DIAMOND_DATA_PATH", default_filename)
data = pd.read_csv(filename)

# Transposed preview of the first five rows: one displayed row per dataset column.
data.head().T

Unnamed: 0,0,1,2,3,4
carat_weight,-0.786360,-0.786360,-0.786360,-0.786360,-0.786360
depth_percent,0.102492,0.021808,-0.058876,0.031894,0.324373
table_percent,0.125746,0.125746,0.125746,0.125746,0.075544
meas_length,-1.530031,-1.535701,-1.513024,-1.524362,-1.564047
meas_width,-1.648296,-1.633745,-1.626470,-1.641020,-1.677397
...,...,...,...,...,...
fluor_intensity_Slight,-0.007391,-0.007391,-0.007391,-0.007391,-0.007391
fluor_intensity_Strong,-0.253265,-0.253265,-0.253265,-0.253265,-0.253265
fluor_intensity_Very Slight,-0.112150,-0.112150,-0.112150,-0.112150,-0.112150
fluor_intensity_Very Strong,-0.070709,-0.070709,-0.070709,-0.070709,-0.070709


In [3]:
# Separate the regression target from the predictor columns.
y = data['target_price']
X = data.drop(columns='target_price')

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Baseline 1: ordinary least squares on every feature, fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [5]:
# Baseline 2: gradient-boosted trees with library defaults; only the seed is fixed.
gb_model = GradientBoostingRegressor(
    random_state=42,
)
gb_model.fit(X=X_train, y=y_train)

In [6]:
# Baseline 3: random forest of 100 trees with no depth limit, seeded for reproducibility.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
)
rf_model.fit(X=X_train, y=y_train)

In [7]:
# Predict the held-out split with each fitted baseline. Generator unpacking
# keeps the original order: linear regression, gradient boosting, random forest.
lr_predictions, gb_predictions, rf_predictions = (
    fitted.predict(X_test) for fitted in (lr_model, gb_model, rf_model)
)

# Mean squared error per model (lower is better).
lr_mse, gb_mse, rf_mse = (
    mean_squared_error(y_test, predicted)
    for predicted in (lr_predictions, gb_predictions, rf_predictions)
)

# Coefficient of determination per model (higher is better).
lr_r2, gb_r2, rf_r2 = (
    r2_score(y_test, predicted)
    for predicted in (lr_predictions, gb_predictions, rf_predictions)
)

# Report the errors first, then the R^2 values.
print(f"Linear Regression MSE: {lr_mse}")
print(f"Gradient Boosting MSE: {gb_mse}")
print(f"Random Forest MSE: {rf_mse}")

print("Linear Regression R^2:", lr_r2)
print("Gradient Boosting R^2:", gb_r2)
print("Random Forest R^2:", rf_r2)

Linear Regression MSE: 0.4696912398043861
Gradient Boosting MSE: 0.2824626306467341
Random Forest MSE: 0.24146056617565229
Linear Regression R^2: 0.5925220227396027
Gradient Boosting R^2: 0.7549511431477478
Random Forest R^2: 0.7905222521621186


In [8]:
from scipy.stats import randint

# Search space for the random-forest hyperparameters. Each entry is
# (name, low, high) and becomes a discrete uniform distribution; scipy's
# `randint` includes `low` and excludes `high`.
param_distributions = {
    hyperparameter: randint(low, high)
    for hyperparameter, low, high in (
        ('n_estimators', 100, 500),
        ('max_depth', 3, 20),
        ('min_samples_split', 2, 11),
        ('min_samples_leaf', 1, 11),
    )
}

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Template estimator whose hyperparameters are tuned; seeded so every
# candidate forest is built reproducibly.
rf_regressor = RandomForestRegressor(random_state=42)

# Randomized search: 100 candidates drawn from `param_distributions`, each
# evaluated with 5-fold cross-validation using all available cores. No
# `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method.
random_search = RandomizedSearchCV(
    rf_regressor,
    param_distributions,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Run the search on the training split only; the test split stays untouched.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [10]:
# Winning hyperparameter combination and its mean cross-validated score.
# The search was created without `scoring`, so this is the estimator's default
# score (R^2 for regressors), not an MSE.
best_search_params = random_search.best_params_
best_search_score = random_search.best_score_

print("Best parameters found: ", best_search_params)
print("Best score found: ", best_search_score)

Best parameters found:  {'max_depth': 19, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 403}
Best score found:  0.8123829198667181


In [11]:
# Take the estimator the randomized search already refitted with the winning
# hyperparameters; no additional training happens in this cell.
best_rf_regressor = random_search.best_estimator_

# Score the tuned forest on the held-out split.
predictions = best_rf_regressor.predict(X_test)
mse, r2 = (
    mean_squared_error(y_test, predictions),
    r2_score(y_test, predictions),
)

print(f"Test MSE: {mse}")
print(f"Test R^2: {r2}")

Test MSE: 0.2502004946453036
Test R^2: 0.7829399766747211


In [12]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [13]:
# Untuned 100-tree forest used as the estimator for the K-fold stability check.
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

In [14]:
# Five shuffled folds; the seed fixes which rows land in which fold.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [15]:
# Cross-validate the untuned forest on the training split. scikit-learn reports
# this metric negated (so that larger is better).
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=kf,
    scoring='neg_mean_squared_error',
)

# Flip the sign to recover ordinary, positive MSE values.
mse_scores = np.negative(scores)

print("MSE scores for each fold:", mse_scores)
print("Mean MSE:", mse_scores.mean())
print("Standard deviation of MSE:", mse_scores.std())

MSE scores for each fold: [0.18806836 0.25137862 0.08324004 0.165493   0.2697218 ]
Mean MSE: 0.19158036539035755
Standard deviation of MSE: 0.06650177106026627


In [16]:
from sklearn.model_selection import GridSearchCV

# Small exhaustive grid: 2 x 2 x 2 x 2 = 16 hyperparameter combinations.
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [5, 10],
    'min_samples_split': [4, 6],
    'min_samples_leaf': [2, 3]
}

# Unfitted template estimator for the search. GridSearchCV clones it for every
# candidate, so this object itself is never trained here.
# NOTE(review): this rebinds `rf_model`, replacing the fitted baseline forest
# created earlier in the notebook; later cells must refit it before use.
rf_model = RandomForestRegressor(random_state=42)

# 3-fold cross-validation ranked by negated MSE, run on all available cores.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert from negative MSE to positive MSE

# With the default refit=True the search has already retrained the winning
# configuration (same seed, same training split) on all of X_train, so reuse
# that estimator instead of constructing and fitting an identical forest again.
best_rf_model = grid_search.best_estimator_

# Evaluate the tuned forest on the held-out split.
predictions = best_rf_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Best Parameters: {best_params}")
print(f"Best CV MSE: {best_score}")
print(f"Test MSE: {mse}")
print(f"Test R^2: {r2}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 100}
Best CV MSE: 0.20258061928190418
Test MSE: 0.2597743196674125
Test R^2: 0.7746342589519921


In [17]:
# Use the fitted linear model's coefficients as a rough importance signal.
# NOTE(review): the recorded output shows coefficients around 1e11 on the
# one-hot clarity columns, which usually indicates collinear dummy variables;
# treat this ranking with caution until the encoding is confirmed.
feature_importance_lr = lr_model.coef_

importance_lr = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': feature_importance_lr,
    }
)

# Rank by absolute value so large negative weights are not pushed to the bottom.
print(
    importance_lr.sort_values(by='Importance', ascending=False, key=abs)
)

          Feature    Importance
32    clarity_SI1  1.840495e+11
36    clarity_VS2  1.831939e+11
35    clarity_VS1  1.808532e+11
33    clarity_SI2  1.685552e+11
38   clarity_VVS2  1.636218e+11
..            ...           ...
4      meas_width -1.757076e-01
3     meas_length -1.452135e-01
2   table_percent  3.479745e-02
5      meas_depth -6.854753e-03
1   depth_percent  5.372063e-03

[75 rows x 2 columns]


In [19]:
# `rf_model` was rebound to an unfitted forest in the grid-search cell, so it
# has to be trained again before its importances can be read.
rf_model.fit(X=X_train, y=y_train)

# Importances of the default-parameter forest, one value per training column.
feature_importance_rf = rf_model.feature_importances_

importance_rf = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': feature_importance_rf,
    }
)

# Most influential features first.
print(
    importance_rf.sort_values(by='Importance', ascending=False)
)

              Feature    Importance
0        carat_weight  6.887507e-01
17            color_D  2.862327e-02
3         meas_length  2.666283e-02
4          meas_width  2.320706e-02
1       depth_percent  2.313313e-02
..                ...           ...
41   cut_quality_Good  5.375583e-07
40   cut_quality_Fair  1.002925e-08
34        clarity_SI3  8.431263e-09
42  cut_quality_Ideal  2.289178e-10
56        polish_Poor  5.092810e-11

[75 rows x 2 columns]


In [20]:
# Feature importances of the tuned forest selected by the grid search.
# `best_rf_model` is already fitted on the training split in the grid-search
# cell, and the seeded forest is deterministic, so refitting it here would only
# repeat the same training run; the importances are read directly instead.
# Note: this overwrites `feature_importance_rf` and `importance_rf` from the
# previous cell, which held the default-parameter forest's importances.
feature_importance_rf = best_rf_model.feature_importances_

importance_rf = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance_rf})

# Most influential features first.
print(importance_rf.sort_values(by='Importance', ascending=False))

              Feature  Importance
0        carat_weight    0.758915
17            color_D    0.031619
18            color_E    0.024130
20            color_G    0.018226
3         meas_length    0.017419
..                ...         ...
56        polish_Poor    0.000000
41   cut_quality_Good    0.000000
42  cut_quality_Ideal    0.000000
60       culet_size_M    0.000000
40   cut_quality_Fair    0.000000

[75 rows x 2 columns]
