In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.datasets import make_classification
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from math import sqrt

# Data load & cleaning

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
# NOTE(review): this file is named 'diamond_train.csv' while the test file loaded
# later is 'diamonds_test.csv' — confirm the singular/plural mismatch is intentional.
diamonds = pd.read_csv('../data/diamond_train.csv')

In [3]:
# Display the training frame (the rendering is abbreviated to the first and last rows).
diamonds

Unnamed: 0,clarity,city,color,cut,x,y,z,depth,table,price,carat
0,VS2,Dubai,J,Premium,6.83,6.79,4.25,62.4,58.0,4268,1.21
1,VS2,Kimberly,H,Very Good,4.35,4.38,2.75,63.0,57.0,505,0.32
2,VS1,Las Vegas,G,Fair,5.62,5.53,3.65,65.5,55.0,2686,0.71
3,SI1,Kimberly,D,Good,4.68,4.72,3.00,63.8,56.0,738,0.41
4,SI1,Dubai,G,Ideal,6.55,6.51,3.95,60.5,59.0,4882,1.02
...,...,...,...,...,...,...,...,...,...,...,...
40450,VS1,Antwerp,G,Ideal,7.10,7.04,4.43,62.7,57.0,10070,1.34
40451,SI2,Madrid,F,Good,8.31,8.25,4.73,57.1,60.0,12615,2.02
40452,SI1,Kimberly,H,Ideal,6.37,6.42,4.01,62.7,56.0,5457,1.01
40453,VS1,Kimberly,J,Ideal,4.45,4.47,2.76,61.9,54.3,456,0.33


In [4]:
# (rows, columns) of the training frame.
diamonds.shape

(40455, 11)

In [5]:
# Column dtypes and non-null counts; the output reports no missing values in any column.
diamonds.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   clarity  40455 non-null  object 
 1   city     40455 non-null  object 
 2   color    40455 non-null  object 
 3   cut      40455 non-null  object 
 4   x        40455 non-null  float64
 5   y        40455 non-null  float64
 6   z        40455 non-null  float64
 7   depth    40455 non-null  float64
 8   table    40455 non-null  float64
 9   price    40455 non-null  int64  
 10  carat    40455 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [6]:
# Summary statistics of the numeric columns.
# NOTE(review): the output shows a minimum of 0.0 for x, y and z and a maximum of
# 58.9 for y (vs. 10.23 for x) — these look like invalid measurements/outliers
# and are not filtered anywhere before the model is trained.
diamonds.describe()

Unnamed: 0,x,y,z,depth,table,price,carat
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,5.729392,5.732819,3.537154,61.752841,57.446133,3928.444469,0.797706
std,1.124453,1.14665,0.697062,1.431725,2.233535,3992.416147,0.475544
min,0.0,0.0,0.0,43.0,43.0,326.0,0.2
25%,4.71,4.72,2.91,61.0,56.0,945.0,0.4
50%,5.69,5.71,3.52,61.8,57.0,2397.0,0.7
75%,6.54,6.54,4.035,62.5,59.0,5331.0,1.04
max,10.23,58.9,8.06,79.0,95.0,18823.0,4.5


In [7]:
# Per-column dtypes: four object (string) columns, the rest numeric.
diamonds.dtypes

clarity     object
city        object
color       object
cut         object
x          float64
y          float64
z          float64
depth      float64
table      float64
price        int64
carat      float64
dtype: object

# Pre-processing & EDA

In [8]:
# List the available column names before choosing the model inputs.
diamonds.columns

Index(['clarity', 'city', 'color', 'cut', 'x', 'y', 'z', 'depth', 'table',
       'price', 'carat'],
      dtype='object')

In [9]:
# Model inputs: the numeric measurements only. The categorical columns
# (clarity, city, color, cut) are not used by this model.
# The target column is selected separately instead of being selected with the
# features and then dropped again.
feature_columns = ['x', 'y', 'z', 'depth', 'table', 'carat']
X = diamonds[feature_columns]

# Target: the diamond price.
y = diamonds['price']

# Feature Engineering

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Model Selection

In [11]:
# Fit a 1000-tree random forest on the training split.
# random_state pins the per-tree seeds so the fitted forest is reproducible;
# n_jobs=-1 builds the trees on all available cores instead of one, which only
# changes how long the fit takes, not which trees are grown.
random_forest_model = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
)

random_forest_model.fit(X_train, y_train)

# Model Evaluation

In [12]:
# Predict prices for the held-out 20% split.
random_forest_pred = random_forest_model.predict(X_test)

In [13]:
# Inspect the predicted prices (long arrays are abbreviated in the output).
random_forest_pred

array([ 2209.39366667,  1974.17      ,   732.82913333, ...,
        3211.05583333,  3868.943     , 12350.311     ])

In [14]:
# Random Forest: score the hold-out predictions against the true prices.
random_forest_mae = mean_absolute_error(y_test, random_forest_pred)
random_forest_mse = mean_squared_error(y_test, random_forest_pred)
random_forest_rmse = sqrt(random_forest_mse)  # square root of the MSE
random_forest_r2 = r2_score(y_test, random_forest_pred)

# (label, value) pairs in the order they are reported.
metric_report = [
    ("Mean Absolute Error (MAE):", random_forest_mae),
    ("Mean Squared Error (MSE):", random_forest_mse),
    ("Root Mean Squared Error (RMSE):", random_forest_rmse),
    ("R-squared (R2) Score:", random_forest_r2),
]

print("Random Forest Regression Model:")
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Random Forest Regression Model:
Mean Absolute Error (MAE): 779.0575047097428
Mean Squared Error (MSE): 1870694.6149997907
Root Mean Squared Error (RMSE): 1367.7333859344776
R-squared (R2) Score: 0.8851263076073305


In [15]:
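# Random Forest achieves a lower (better) RMSE than Linear Regression.
# NOTE(review): no Linear Regression model is trained in the cells above, so this
# comparison cannot be reproduced from this notebook as it stands.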

# Model Deployment

In [16]:
# Load the unlabeled test set and check its schema (dtypes, non-null counts).
test_data = pd.read_csv('../data/diamonds_test.csv')
test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  object 
 3   color    13485 non-null  object 
 4   clarity  13485 non-null  object 
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
 10  city     13485 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 1.1+ MB


In [17]:
# Column names of the test set: it carries an `id` column and has no `price`.
test_data.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'city'],
      dtype='object')

In [18]:
# Keep only the columns the model was trained on, in the training order.
# Note that this discards the test file's `id` column (and the categorical ones).
model_input_columns = ['x', 'y', 'z', 'depth', 'table', 'carat']
test_data = test_data[model_input_columns]

In [19]:
# Predict prices for the unlabeled test set with the fitted forest.
# At this point test_data holds only the six feature columns used for training.
new_predictions = random_forest_model.predict(test_data)

In [20]:
# Attach the predictions as a `price` column.
# `assign` returns a new frame instead of writing a column in place into a frame
# that was produced by column selection — in-place writes on such a derived
# frame can raise pandas' SettingWithCopyWarning, and rebinding keeps the cell
# safely re-runnable.
test_data = test_data.assign(price=new_predictions)
test_data

Unnamed: 0,x,y,z,depth,table,carat,price
0,5.82,5.89,3.67,62.7,60.0,0.79,3736.748000
1,6.81,6.89,4.18,61.0,57.0,1.20,7513.781000
2,7.38,7.32,4.57,62.2,61.0,1.57,9313.460000
3,6.09,6.13,3.90,63.8,54.0,0.90,3758.114086
4,5.05,5.09,3.19,62.9,58.0,0.50,1460.829350
...,...,...,...,...,...,...,...
13480,5.35,5.32,3.30,61.9,56.0,0.57,2091.615400
13481,5.71,5.73,3.56,62.2,55.0,0.71,3000.021550
13482,5.75,5.71,3.53,61.6,55.0,0.70,2665.945717
13483,5.85,5.89,3.45,58.8,57.0,0.70,2520.845000


In [21]:
# Build the submission frame: one row per test diamond with its id and the
# predicted price.
# The ids are read from the test file rather than fabricated with np.arange:
# the `id` column was dropped when the feature columns were selected, and a
# positional counter is only correct if the file's ids happen to be 0..n-1 in
# row order. Rows have not been reordered or filtered since loading (only
# columns were selected), so ids and predictions line up positionally.
test_ids = pd.read_csv('../data/diamonds_test.csv', usecols=['id'])['id']
assert len(test_ids) == len(test_data), "id / prediction row count mismatch"

data_to_deliver = pd.DataFrame({
    'id': test_ids.to_numpy(),
    'price': test_data['price'].to_numpy(),
})
data_to_deliver

Unnamed: 0,id,price
0,0,3736.748000
1,1,7513.781000
2,2,9313.460000
3,3,3758.114086
4,4,1460.829350
...,...,...
13480,13480,2091.615400
13481,13481,3000.021550
13482,13482,2665.945717
13483,13483,2520.845000


In [22]:
# Write the submission file without the DataFrame index column.
data_to_deliver.to_csv(
    '../delivery/price_deliveryrestart3.csv',
    index=False,
)