In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

In [None]:
# Load the raw train / test / validation CSVs from their absolute /content paths.
Diamonds_Train, Diamonds_Test, Diamonds_Val = map(
    pd.read_csv,
    (
        '/content/diamonds_train.csv',
        '/content/diamonds_test.csv',
        '/content/diamonds_val.csv',
    ),
)

In [None]:
def get_data(split: str):
    '''
    Load one dataset split and separate predictors from the target.
    Arguments:
      split: suffix of the file to read, i.e. the file is diamonds_<split>.csv
             in the current working directory
    Returns:
      (x, y) where x is the DataFrame without the index and price columns
      and y is the price Series
    '''
    frame = pd.read_csv(f'diamonds_{split}.csv').rename(columns={'Unnamed: 0': 'Index'})
    target = frame['price']
    # The saved row index carries no information, so it is dropped with the target.
    features = frame.drop(columns=['Index', 'price'])
    return features, target

In [None]:
def ohe(df: pd.DataFrame, col: str, categories=None):
  '''
  One Hot Encodes a Categorical Variable
  Arguments:
    df: Pandas DataFrame (left unmodified; a copy is encoded)
    col: column to encode
    categories: optional iterable of category values. When given, one
      indicator column is created per value, in this order, even for
      values that never occur in df. When omitted, the distinct values of
      df[col] are used in order of first appearance, so two frames with
      different row orders can produce different column layouts.
  Returns:
    New DataFrame with col removed and one 0/1 column named
    '<value>_<col>' appended per category

  '''
  if categories is None:
    # Series.unique keeps first-appearance order, matching the original scan.
    categories = df[col].unique()

  # Work on a copy so the caller's frame is not polluted with dummy columns.
  encoded = df.copy()
  for val in categories:
    name = f'{val}_{col}'
    # An existing column with the same name is left untouched.
    if name not in encoded.columns:
      encoded[name] = (df[col] == val).astype('int64')

  return encoded.drop(col, axis = 1)

In [None]:
# Canonical feature layout shared by every split. It is the same order as the
# column_names list used later when the arrays are converted back to DataFrames.
_FEATURE_COLUMNS = [
  'carat', 'depth', 'table', 'x', 'y', 'z',
  'I_color', 'D_color', 'E_color', 'F_color', 'G_color', 'H_color', 'J_color',
  'Ideal_cut', 'Premium_cut', 'Very Good_cut', 'Good_cut', 'Fair_cut',
  'SI2_clarity', 'SI1_clarity', 'VS1_clarity', 'VS2_clarity',
  'VVS2_clarity', 'VVS1_clarity', 'I1_clarity', 'IF_clarity',
]

def feature_engineering(split):
  '''
  Build model-ready arrays for one dataset split
  Arguments:
    split: name of the split to load (passed to get_data)
  Returns:
    x_split: NumPy array of predictors with color, cut and clarity one hot
             encoded, columns ordered as in _FEATURE_COLUMNS
    y_split: price, standardised with a scaler fit on the train split,
             shape (n_rows, 1)
    n_feats: number of predictor columns
  Raises:
    ValueError: if encoding yields a column outside _FEATURE_COLUMNS

  '''
  _, y_train = get_data('train')
  x_split, y_split = get_data(split)
  print(x_split.head())
  for cat_col in ('color', 'cut', 'clarity'):
    x_split = ohe(x_split, cat_col)

  # ohe orders dummy columns by first appearance in the split, so train, val and
  # test would otherwise come out with different column orders. Force one layout;
  # a category missing from this split becomes an all-zero column.
  unexpected = [c for c in x_split.columns if c not in _FEATURE_COLUMNS]
  if unexpected:
    raise ValueError(f'Unexpected feature columns in {split!r} split: {unexpected}')
  x_split = x_split.reindex(columns=_FEATURE_COLUMNS, fill_value=0)

  # The target scaler is always fit on train so every split shares its statistics.
  scaler = StandardScaler()
  scaler.fit(y_train.to_numpy().reshape(-1, 1))
  y_split = scaler.transform(y_split.to_numpy().reshape(-1, 1))
  n_feats = x_split.shape[1]
  print(n_feats)
  x_split = x_split.to_numpy()
  return x_split, y_split, n_feats

# Note: the target scaler is fit on the train split regardless of `split`, so every split is scaled with the train statistics.


In [None]:
# Build predictor/target arrays for each split; n_feats is taken from the train call.
x_train, y_train, n_feats = feature_engineering('train')
x_val, y_val, _ = feature_engineering('val')
x_test, y_test, _ = feature_engineering('test')

   carat        cut color clarity  depth  table     x     y     z
0   0.58      Ideal     F     VS1   60.4   57.0  5.42  5.37  3.26
1   1.05  Very Good     H     SI1   63.5   57.0  6.47  6.44  4.10
2   0.71    Premium     G     SI1   58.9   62.0  5.85  5.79  3.43
3   0.50      Ideal     G     VS1   62.0   55.0  5.05  5.11  3.15
4   0.52      Ideal     D     VS2   62.3   54.0  5.19  5.14  3.22
26
   carat        cut color clarity  depth  table     x     y     z
0   0.29  Very Good     E     VS1   62.8   44.0  4.20  4.24  2.65
1   0.30       Good     G      IF   63.5   55.0  4.26  4.28  2.71
2   1.01      Ideal     E     SI1   62.0   57.0  6.38  6.45  3.98
3   1.60    Premium     H     VS1   62.5   58.0  7.47  7.37  4.64
4   0.78      Ideal     H     VS2   61.6   56.0  5.94  5.91  3.64
26
   carat        cut color clarity  depth  table     x     y     z
0   0.31    Premium     H    VVS2   61.2   60.0  4.39  4.37  2.68
1   0.50    Premium     E     VS2   61.8   59.0  5.12  5.08  3.15
2   

In [None]:
# Peek at the encoded training predictors (a NumPy array at this point).
x_train

array([[ 0.58, 60.4 , 57.  , ...,  0.  ,  0.  ,  0.  ],
       [ 1.05, 63.5 , 57.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.71, 58.9 , 62.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 0.5 , 62.4 , 53.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.32, 61.9 , 55.  , ...,  0.  ,  0.  ,  0.  ],
       [ 1.54, 60.9 , 56.4 , ...,  0.  ,  0.  ,  0.  ]])

In [None]:
# Convert the NumPy arrays back to labelled DataFrames.
# The predictor labels are defined once and reused for every split, so the three
# frames cannot drift apart through a partial edit.
# NOTE(review): labels are applied by position. They are only valid if
# feature_engineering emits its columns in exactly this order for every split;
# ohe() orders dummy columns by first appearance, so confirm this holds.
feature_column_names = [
    'carat', 'depth', 'table', 'x', 'y', 'z',
    'I_color', 'D_color', 'E_color', 'F_color', 'G_color', 'H_color', 'J_color',
    'Ideal_cut', 'Premium_cut', 'Very Good_cut', 'Good_cut', 'Fair_cut',
    'SI2_clarity', 'SI1_clarity', 'VS1_clarity', 'VS2_clarity',
    'VVS2_clarity', 'VVS1_clarity', 'I1_clarity', 'IF_clarity',
]
# Kept under its original name; it ends the cell bound to the target label as before.
column_names = ['price']

x_train = pd.DataFrame(x_train, columns=feature_column_names)
y_train = pd.DataFrame(y_train, columns=column_names)

x_val = pd.DataFrame(x_val, columns=feature_column_names)
y_val = pd.DataFrame(y_val, columns=column_names)

x_test = pd.DataFrame(x_test, columns=feature_column_names)
y_test = pd.DataFrame(y_test, columns=column_names)

In [None]:
# Baseline OLS: standardised price regressed on carat and the x / y / z columns,
# with an intercept.
import statsmodels.api as sm
X = sm.add_constant(x_train[['carat', 'x', 'y', 'z']])
y = y_train['price']
model = sm.OLS(y, X).fit()

In [None]:
from sklearn.metrics import mean_squared_error

# Validation-set MSE of the model fit on the training split.
X_val = sm.add_constant(x_val[['carat', 'x', 'y', 'z']])
Y_val = y_val['price']
Y_pred = model.predict(X_val)
mse = mean_squared_error(Y_val, Y_pred)
print('Validation MSE:', mse)

# Test-set MSE of the same model (Y_pred and mse are overwritten).
X_test = sm.add_constant(x_test[['carat', 'x', 'y', 'z']])
Y_test = y_test['price']
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
print('Testing MSE:', mse)



Validation MSE: 0.1485176082080068
Testing MSE: 0.1550386086380108


In [None]:
#Perform forward selection using all predictors
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
import numpy as np

# Candidate predictors for forward selection: numeric columns, then the
# color, cut and clarity indicator columns.
predictors = [
    # numeric
    'carat', 'depth', 'table', 'x', 'y', 'z',
    # color indicators
    'I_color', 'D_color', 'E_color', 'F_color', 'G_color', 'H_color', 'J_color',
    # cut indicators
    'Ideal_cut', 'Premium_cut', 'Very Good_cut', 'Good_cut', 'Fair_cut',
    # clarity indicators
    'SI2_clarity', 'SI1_clarity', 'VS1_clarity', 'VS2_clarity',
    'VVS2_clarity', 'VVS1_clarity', 'I1_clarity', 'IF_clarity',
]

def forward_selection_validation(x_train, y_train, x_val, y_val):
    '''
    Greedy forward selection of OLS predictors scored by validation MSE.

    Starting from no predictors, each round fits one OLS model (with
    intercept) per remaining candidate, added to the predictors selected so
    far. The candidate with the lowest validation MSE is kept if it strictly
    improves on the best MSE seen so far; otherwise the search stops.

    Arguments:
      x_train: DataFrame of candidate predictors for fitting
      y_train: training target
      x_val: DataFrame with the same columns as x_train, used for scoring
      y_val: validation target
    Returns:
      List of (validation MSE, selected feature list) tuples, one per accepted
      step in order; empty if x_train has no columns.
    '''
    remaining_features = x_train.columns.tolist()
    selected_features = []
    results = []
    best_score = float('inf')
    while remaining_features:
        candidates = []
        for feature in remaining_features:
            features_subset = selected_features + [feature]
            # has_constant='add' always inserts the intercept column. The default
            # ('skip') omits it when a selected column is constant in that split,
            # which would make the train and validation designs disagree.
            X_train_subset = sm.add_constant(x_train[features_subset], has_constant='add')
            model = sm.OLS(y_train, X_train_subset).fit()
            X_val_subset = sm.add_constant(x_val[features_subset], has_constant='add')
            y_val_pred = model.predict(X_val_subset)
            mse_val = mean_squared_error(y_val, y_val_pred)
            candidates.append((mse_val, feature))
        # Lowest MSE wins; ties are broken by feature name, as tuple sorting did.
        best_score_candidate, best_feature_to_add = min(candidates)
        if best_score_candidate < best_score:
            best_score = best_score_candidate
            selected_features = selected_features + [best_feature_to_add]
            remaining_features.remove(best_feature_to_add)
            results.append((best_score, selected_features.copy()))
        else:
            break
    return results

# Run forward selection, scoring every candidate on the validation split.
X_train_fs, X_val_fs = x_train[predictors], x_val[predictors]
y_train_fs, y_val_fs = y_train['price'], y_val['price']

results_forward_val = forward_selection_validation(
    X_train_fs, y_train_fs, X_val_fs, y_val_fs
)

# Steps are only recorded when they improve the MSE, so the last entry is the best.
best_mse_forward_val, best_subset_forward_val = results_forward_val[-1]
print("Best subset by validation MSE (Forward Selection):", best_subset_forward_val)
print("Validation MSE (Forward Selection):", best_mse_forward_val)


Best subset by validation MSE (Forward Selection): ['carat', 'IF_clarity', 'J_color', 'x', 'H_color', 'I1_clarity', 'SI2_clarity', 'VVS1_clarity', 'depth', 'table', 'G_color', 'Fair_cut', 'I_color', 'Premium_cut', 'z']
Validation MSE (Forward Selection): 0.10620742405563631


In [None]:
# Refit OLS on the predictor subset reported by forward selection.
# The list is a hard-coded copy of that printed result; re-sync it if the
# selection changes.
import statsmodels.api as sm
X = sm.add_constant(x_train[[
    'carat', 'IF_clarity', 'J_color', 'x', 'H_color',
    'I1_clarity', 'SI2_clarity', 'VVS1_clarity', 'depth', 'table',
    'G_color', 'Fair_cut', 'I_color', 'Premium_cut', 'z',
]])
y = y_train['price']
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.894
Model:,OLS,Adj. R-squared:,0.894
Method:,Least Squares,F-statistic:,24250.0
Date:,"Thu, 01 May 2025",Prob (F-statistic):,0.0
Time:,15:09:01,Log-Likelihood:,-12809.0
No. Observations:,43152,AIC:,25650.0
Df Residuals:,43136,BIC:,25790.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.0885,0.115,18.237,0.000,1.864,2.313
carat,2.8225,0.016,180.133,0.000,2.792,2.853
IF_clarity,-0.8991,0.014,-65.045,0.000,-0.926,-0.872
J_color,-0.4313,0.007,-58.840,0.000,-0.446,-0.417
x,-0.3063,0.009,-34.419,0.000,-0.324,-0.289
H_color,-0.2215,0.005,-40.332,0.000,-0.232,-0.211
I1_clarity,0.3339,0.009,37.146,0.000,0.316,0.351
SI2_clarity,0.1755,0.004,39.410,0.000,0.167,0.184
VVS1_clarity,0.2516,0.006,38.739,0.000,0.239,0.264

0,1,2,3
Omnibus:,12644.235,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,233909.925
Skew:,0.94,Prob(JB):,0.0
Kurtosis:,14.25,Cond. No.,6200.0


In [None]:
# Validation-set MSE of the forward-selected OLS model.
X_val = sm.add_constant(x_val[[
    'carat', 'IF_clarity', 'J_color', 'x', 'H_color',
    'I1_clarity', 'SI2_clarity', 'VVS1_clarity', 'depth', 'table',
    'G_color', 'Fair_cut', 'I_color', 'Premium_cut', 'z',
]])
Y_val = y_val['price']
Y_pred = model.predict(X_val)
mse = mean_squared_error(Y_val, Y_pred)
print('Validation MSE:', mse)

# Test-set MSE of the same model (Y_pred and mse are overwritten).
X_test = sm.add_constant(x_test[[
    'carat', 'IF_clarity', 'J_color', 'x', 'H_color',
    'I1_clarity', 'SI2_clarity', 'VVS1_clarity', 'depth', 'table',
    'G_color', 'Fair_cut', 'I_color', 'Premium_cut', 'z',
]])
Y_test = y_test['price']
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
print('Testing MSE:', mse)

Validation MSE: 0.10620742405563631
Testing MSE: 0.20937723349892412


In [None]:
# Ridge regression on the forward-selected predictors; alpha is chosen by a
# 5-fold grid search on the training split.
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_model = Ridge()
# NOTE(review): Ridge appears to fit its own intercept by default, so the added
# const column looks redundant; confirm before removing it.
X = sm.add_constant(x_train[[
    'carat', 'IF_clarity', 'J_color', 'x', 'H_color',
    'I1_clarity', 'SI2_clarity', 'VVS1_clarity', 'depth', 'table',
    'G_color', 'Fair_cut', 'I_color', 'Premium_cut', 'z',
]])
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 1.3, 2, 10, 100]}
grid_search = GridSearchCV(ridge_model, param_grid, cv=5)
grid_search.fit(X, y_train)
best_alpha = grid_search.best_params_['alpha']

# Refit a fresh Ridge at the selected alpha on the whole training split.
best_ridge_model = Ridge(alpha=best_alpha)
best_ridge_model.fit(X, y_train)

# Validation-set MSE of the tuned model.
X_val = sm.add_constant(x_val[[
    'carat', 'IF_clarity', 'J_color', 'x', 'H_color',
    'I1_clarity', 'SI2_clarity', 'VVS1_clarity', 'depth', 'table',
    'G_color', 'Fair_cut', 'I_color', 'Premium_cut', 'z',
]])
Y_val = y_val['price']
Y_pred = best_ridge_model.predict(X_val)
mse = mean_squared_error(Y_val, Y_pred)
print('Validation MSE:', mse)

# Report the alpha picked by the grid search.
print('Best alpha:', best_alpha)

# Test-set MSE of the tuned model (whatever alpha was selected above).
X_test = sm.add_constant(x_test[[
    'carat', 'IF_clarity', 'J_color', 'x', 'H_color',
    'I1_clarity', 'SI2_clarity', 'VVS1_clarity', 'depth', 'table',
    'G_color', 'Fair_cut', 'I_color', 'Premium_cut', 'z',
]])
Y_test = y_test['price']
Y_pred = best_ridge_model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
print('Testing MSE:', mse)

Validation MSE: 0.10620485897155604
Best alpha: 1.3
Testing MSE: 0.20911880172820987
