In [145]:
# Read in the data file
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import time

df = pd.read_csv('data/diamonds.csv')

print(df.head())

   Unnamed: 0  carat      cut color clarity  depth  table     x     y     z  \
0           1   0.23    Ideal     E     SI2   61.5   55.0  3.95  3.98  2.43   
1           2   0.21  Premium     E     SI1   59.8   61.0  3.89  3.84  2.31   
2           3   0.23     Good     E     VS1   56.9   65.0  4.05  4.07  2.31   
3           4   0.29  Premium     I     VS2   62.4   58.0  4.20  4.23  2.63   
4           5   0.31     Good     J     SI2   63.3   58.0  4.34  4.35  2.75   

   price  
0    326  
1    326  
2    327  
3    334  
4    335  


In [129]:
# Encode the non-numerical features
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to the 'cut' column
df['cut_encoded'] = label_encoder.fit_transform(df['cut'])

# Similarly, apply to 'color' and 'clarity' if they are ordinal
df['color_encoded'] = label_encoder.fit_transform(df['color'])
df['clarity_encoded'] = label_encoder.fit_transform(df['clarity'])

In [130]:
# Normalize and scale the features
# Initialize the scaler
scaler = StandardScaler()

# Apply scaling to the numerical features
df[['carat', 'depth', 'table', 'x', 'y', 'z']] = scaler.fit_transform(df[['carat', 'depth', 'table', 'x', 'y', 'z']])

print(df.head())

   Unnamed: 0     carat      cut color clarity     depth     table         x  \
0           1 -1.198168    Ideal     E     SI2 -0.174092 -1.099672 -1.587837   
1           2 -1.240361  Premium     E     SI1 -1.360738  1.585529 -1.641325   
2           3 -1.198168     Good     E     VS1 -3.385019  3.375663 -1.498691   
3           4 -1.071587  Premium     I     VS2  0.454133  0.242928 -1.364971   
4           5 -1.029394     Good     J     SI2  1.082358  0.242928 -1.240167   

          y         z  price  cut_encoded  color_encoded  clarity_encoded  
0 -1.536196 -1.571129    326            2              1                3  
1 -1.658774 -1.741175    326            3              1                2  
2 -1.457395 -1.741175    327            1              1                4  
3 -1.317305 -1.287720    334            3              5                5  
4 -1.212238 -1.117674    335            1              6                3  


In [131]:
# Remove the non encoded features
df = df.drop(["cut", "clarity", "color"], axis=1)

print(df.head())

   Unnamed: 0     carat     depth     table         x         y         z  \
0           1 -1.198168 -0.174092 -1.099672 -1.587837 -1.536196 -1.571129   
1           2 -1.240361 -1.360738  1.585529 -1.641325 -1.658774 -1.741175   
2           3 -1.198168 -3.385019  3.375663 -1.498691 -1.457395 -1.741175   
3           4 -1.071587  0.454133  0.242928 -1.364971 -1.317305 -1.287720   
4           5 -1.029394  1.082358  0.242928 -1.240167 -1.212238 -1.117674   

   price  cut_encoded  color_encoded  clarity_encoded  
0    326            2              1                3  
1    326            3              1                2  
2    327            1              1                4  
3    334            3              5                5  
4    335            1              6                3  


In [132]:
# Create a test and train set

print(df.head())
# Define features (X) and target (y)
X = df.drop(['price'], axis=1)
y = df['price']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=309)

# Create a dict for storing models results
results = {}

   Unnamed: 0     carat     depth     table         x         y         z  \
0           1 -1.198168 -0.174092 -1.099672 -1.587837 -1.536196 -1.571129   
1           2 -1.240361 -1.360738  1.585529 -1.641325 -1.658774 -1.741175   
2           3 -1.198168 -3.385019  3.375663 -1.498691 -1.457395 -1.741175   
3           4 -1.071587  0.454133  0.242928 -1.364971 -1.317305 -1.287720   
4           5 -1.029394  1.082358  0.242928 -1.240167 -1.212238 -1.117674   

   price  cut_encoded  color_encoded  clarity_encoded  
0    326            2              1                3  
1    326            3              1                2  
2    327            1              1                4  
3    334            3              5                5  
4    335            1              6                3  


In [133]:
# Run the linear regression model
# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["linear_regression"] = {"mean_squared_error" : mse, 
                                "root_mean_squared_error" : rmse,
                                "relative_root_error" : rse,
                                "mean_absolute_error" : mae,
                                "r-squared" : r2,
                                "execution_time" : execution_time}

print(results['linear_regression'])


{'mean_squared_error': np.float64(1861137.934092107), 'root_mean_squared_error': np.float64(1364.2352927893733), 'relative_root_error': np.float64(0.113746704450048), 'mean_absolute_error': np.float64(865.1690280535171), 'r-squared': 0.886253295549952, 'execution_time': 0.009409904479980469}


In [134]:
# Initialize and train the KNN model
model = KNeighborsRegressor(n_neighbors=5)  # You can adjust n_neighbors as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["knn_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['knn_regression'])

{'mean_squared_error': np.float64(59742.3045569151), 'root_mean_squared_error': np.float64(244.4223896391554), 'relative_squared_error': np.float64(0.003651255576022165), 'mean_absolute_error': np.float64(25.717630700778624), 'r_squared': 0.9963487444239778, 'execution_time': 0.041956186294555664}


In [135]:
# Initialize and train the Ridge regression model
model = Ridge(alpha=1.0)  # You can adjust the alpha parameter as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["ridge_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['ridge_regression'])

{'mean_squared_error': np.float64(1861138.3344333707), 'root_mean_squared_error': np.float64(1364.2354395167172), 'relative_squared_error': np.float64(0.1137467289176055), 'mean_absolute_error': np.float64(865.2830225763197), 'r_squared': 0.8862532710823945, 'execution_time': 0.004446983337402344}


In [136]:
# Initialize and train the Decision Tree regression model
model = DecisionTreeRegressor()  # You can adjust parameters like max_depth as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["decision_tree_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['decision_tree_regression'])

{'mean_squared_error': np.float64(3713.347793845013), 'root_mean_squared_error': np.float64(60.93724471819359), 'relative_squared_error': np.float64(0.00022694775400017344), 'mean_absolute_error': np.float64(4.065072302558399), 'r_squared': 0.9997730522459998, 'execution_time': 0.005082130432128906}


In [137]:
# Initialize and train the Random Forest regression model
model = RandomForestRegressor(n_estimators=100)  # You can adjust n_estimators and other parameters as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["random_forest_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['random_forest_regression'])

{'mean_squared_error': np.float64(2262.1957817575085), 'root_mean_squared_error': np.float64(47.56254599742857), 'relative_squared_error': np.float64(0.00013825805722521054), 'mean_absolute_error': np.float64(3.6907749351130983), 'r_squared': 0.9998617419427748, 'execution_time': 0.262739896774292}


In [138]:
# Initialize and train the Gradient Boosting regression model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)  # You can adjust parameters as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["gradient_boosting_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['gradient_boosting_regression'])

{'mean_squared_error': np.float64(49945.92460110151), 'root_mean_squared_error': np.float64(223.48584877146362), 'relative_squared_error': np.float64(0.003052532657584699), 'mean_absolute_error': np.float64(132.2580326121307), 'r_squared': 0.9969474673424153, 'execution_time': 0.023200035095214844}


In [139]:
# Initialize and train the SGD regression model
model = SGDRegressor(max_iter=1000, tol=1e-3)  # You can adjust parameters as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["sgd_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['sgd_regression'])

{'mean_squared_error': np.float64(4.69242018288542e+34), 'root_mean_squared_error': np.float64(2.16619947901513e+17), 'relative_squared_error': np.float64(2.867854778095631e+27), 'mean_absolute_error': np.float64(1.879255520353191e+17), 'r_squared': -2.867854778095631e+27, 'execution_time': 0.0048370361328125}


In [140]:
# Initialize and train the Support Vector Regression model
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # You can adjust parameters as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["support_vector_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['support_vector_regression'])

{'mean_squared_error': np.float64(15101463.093481181), 'root_mean_squared_error': np.float64(3886.0600990567787), 'relative_squared_error': np.float64(0.9229523657500719), 'mean_absolute_error': np.float64(2208.557476621165), 'r_squared': 0.07704763424992811, 'execution_time': 23.329512119293213}


In [143]:
# Initialize and train the Linear Support Vector Regression model
model = LinearSVR(max_iter=1000, C=1.0, epsilon=0.1)  # You can adjust parameters as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["linear_support_vector_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['linear_support_vector_regression'])

{'mean_squared_error': np.float64(7280778.7122297445), 'root_mean_squared_error': np.float64(2698.291813764728), 'relative_squared_error': np.float64(0.44497754259691125), 'mean_absolute_error': np.float64(1876.5861790733954), 'r_squared': 0.5550224574030888, 'execution_time': 0.0029058456420898438}




In [146]:
# Initialize and train the Multi-Layer Perceptron Regression model
model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=1)  # Adjust parameters as needed
model.fit(X_train, y_train)

# Start timer for execution time
start_time = time.time()

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Relative Squared Error (RSE)
rse = np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# End timer
execution_time = time.time() - start_time

# Save metrics to dictionary for later use
results["mlp_regression"] = {
    "mean_squared_error": mse, 
    "root_mean_squared_error": rmse,
    "relative_squared_error": rse,
    "mean_absolute_error": mae,
    "r_squared": r2,
    "execution_time": execution_time
}

print(results['mlp_regression'])

{'mean_squared_error': np.float64(976068.0191580654), 'root_mean_squared_error': np.float64(987.9615474086354), 'relative_squared_error': np.float64(0.05965410648215916), 'mean_absolute_error': np.float64(534.8736379945348), 'r_squared': 0.9403458935178408, 'execution_time': 0.005889892578125}


In [149]:
#print(results)

# Convert the results dictionary to a DataFrame
results_df = pd.DataFrame.from_dict({k: v for k, v in results.items()}).T

# Display the DataFrame
print(results_df)

                          mean_squared_error  root_mean_squared_error  \
linear_regression               1.861138e+06              1364.235293   
knn_regression                  5.974230e+04               244.422390   
ridge_regression                1.861138e+06              1364.235440   
decision_tree_regression        3.713348e+03                60.937245   
random_forest_regression        2.262196e+03                47.562546   

                          relative_root_error  mean_absolute_error  r-squared  \
linear_regression                    0.113747           865.169028   0.886253   
knn_regression                            NaN            25.717631        NaN   
ridge_regression                          NaN           865.283023        NaN   
decision_tree_regression                  NaN             4.065072        NaN   
random_forest_regression                  NaN             3.690775        NaN   

                          execution_time  relative_squared_error  r_square