In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.datasets import make_classification
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from math import sqrt


# Data load & cleaning

In [2]:
diamonds = pd.read_csv('../data/diamond_train.csv')

In [3]:
diamonds

Unnamed: 0,clarity,city,color,cut,x,y,z,depth,table,price,carat
0,VS2,Dubai,J,Premium,6.83,6.79,4.25,62.4,58.0,4268,1.21
1,VS2,Kimberly,H,Very Good,4.35,4.38,2.75,63.0,57.0,505,0.32
2,VS1,Las Vegas,G,Fair,5.62,5.53,3.65,65.5,55.0,2686,0.71
3,SI1,Kimberly,D,Good,4.68,4.72,3.00,63.8,56.0,738,0.41
4,SI1,Dubai,G,Ideal,6.55,6.51,3.95,60.5,59.0,4882,1.02
...,...,...,...,...,...,...,...,...,...,...,...
40450,VS1,Antwerp,G,Ideal,7.10,7.04,4.43,62.7,57.0,10070,1.34
40451,SI2,Madrid,F,Good,8.31,8.25,4.73,57.1,60.0,12615,2.02
40452,SI1,Kimberly,H,Ideal,6.37,6.42,4.01,62.7,56.0,5457,1.01
40453,VS1,Kimberly,J,Ideal,4.45,4.47,2.76,61.9,54.3,456,0.33


In [4]:
diamonds.shape

(40455, 11)

In [5]:
diamonds = diamonds.replace(0, np.nan)

In [6]:
diamonds.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   clarity  40455 non-null  object 
 1   city     40455 non-null  object 
 2   color    40455 non-null  object 
 3   cut      40455 non-null  object 
 4   x        40450 non-null  float64
 5   y        40451 non-null  float64
 6   z        40439 non-null  float64
 7   depth    40455 non-null  float64
 8   table    40455 non-null  float64
 9   price    40455 non-null  int64  
 10  carat    40455 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [7]:
diamonds = diamonds.dropna()

In [8]:
diamonds.describe()

Unnamed: 0,x,y,z,depth,table,price,carat
count,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0
mean,5.729616,5.732897,3.538553,61.753013,57.445543,3926.535448,0.797425
std,1.122384,1.145002,0.693639,1.431306,2.233055,3990.024501,0.475257
min,3.77,3.72,1.07,43.0,43.0,326.0,0.2
25%,4.71,4.72,2.91,61.0,56.0,945.0,0.4
50%,5.69,5.71,3.52,61.8,57.0,2396.0,0.7
75%,6.54,6.54,4.04,62.5,59.0,5329.5,1.04
max,10.23,58.9,8.06,79.0,95.0,18823.0,4.5


In [9]:
diamonds.dtypes

clarity     object
city        object
color       object
cut         object
x          float64
y          float64
z          float64
depth      float64
table      float64
price        int64
carat      float64
dtype: object

In [10]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   clarity  40439 non-null  object 
 1   city     40439 non-null  object 
 2   color    40439 non-null  object 
 3   cut      40439 non-null  object 
 4   x        40439 non-null  float64
 5   y        40439 non-null  float64
 6   z        40439 non-null  float64
 7   depth    40439 non-null  float64
 8   table    40439 non-null  float64
 9   price    40439 non-null  int64  
 10  carat    40439 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 3.7+ MB


# Pre-processing & EDA

In [11]:
diamonds.columns

Index(['clarity', 'city', 'color', 'cut', 'x', 'y', 'z', 'depth', 'table',
       'price', 'carat'],
      dtype='object')

In [12]:
diamonds['clarity'].unique()

array(['VS2', 'VS1', 'SI1', 'SI2', 'IF', 'VVS1', 'VVS2', 'I1'],
      dtype=object)

In [13]:
diamonds['color'].unique()

array(['J', 'H', 'G', 'D', 'F', 'E', 'I'], dtype=object)

In [14]:
diamonds['cut'].unique()
diamonds_clean = diamonds.drop('city', axis=1)

In [15]:
label_cut = {'Premium': 4, 'Very Good': 3, 'Fair': 2, 'Good':1,'Ideal': 5}
label_clarity = {'IF': 8, 'VVS1': 7, 'VVS2': 6,'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2':2,'I1':1}
label_color = {'J':7, 'H':6, 'G':5, 'D':4, 'F':3, 'E':2, 'I':1}
# Apply the mapping to the column
diamonds_clean['cut'] = diamonds_clean['cut'].map(label_cut)
diamonds_clean['clarity'] = diamonds_clean['clarity'].map(label_clarity)
diamonds_clean['color'] = diamonds_clean['color'].map(label_color)


In [16]:
X = diamonds_clean.drop('price', axis=1)
y = diamonds_clean['price']

In [17]:
X

Unnamed: 0,clarity,color,cut,x,y,z,depth,table,carat
0,4,7,4,6.83,6.79,4.25,62.4,58.0,1.21
1,4,6,3,4.35,4.38,2.75,63.0,57.0,0.32
2,5,5,2,5.62,5.53,3.65,65.5,55.0,0.71
3,3,4,1,4.68,4.72,3.00,63.8,56.0,0.41
4,3,5,5,6.55,6.51,3.95,60.5,59.0,1.02
...,...,...,...,...,...,...,...,...,...
40450,5,5,5,7.10,7.04,4.43,62.7,57.0,1.34
40451,2,3,1,8.31,8.25,4.73,57.1,60.0,2.02
40452,3,6,5,6.37,6.42,4.01,62.7,56.0,1.01
40453,5,7,5,4.45,4.47,2.76,61.9,54.3,0.33


In [18]:
y

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40439, dtype: int64

# Featuring Engineering

In [19]:
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


# Model Selection

In [20]:
random_forest_model = RandomForestRegressor(n_estimators=500, random_state=42)

random_forest_model.fit(X_train, y_train)

# Model Evaluation

In [21]:
random_forest_pred = random_forest_model.predict(X_test)

In [22]:
random_forest_pred

array([4315.248, 7755.962,  754.072, ..., 2377.496,  958.534,  418.064])

In [23]:
# Random Forest
random_forest_mae = mean_absolute_error(y_test, random_forest_pred)
random_forest_mse = mean_squared_error(y_test, random_forest_pred)
random_forest_rmse = sqrt(random_forest_mse)  # Calculate RMSE
random_forest_r2 = r2_score(y_test, random_forest_pred)

print("Random Forest Regression Model:")
print("Mean Absolute Error (MAE):", random_forest_mae)
print("Mean Squared Error (MSE):", random_forest_mse)
print("Root Mean Squared Error (RMSE):", random_forest_rmse)
print("R-squared (R2) Score:", random_forest_r2)

Random Forest Regression Model:
Mean Absolute Error (MAE): 273.08678877996795
Mean Squared Error (MSE): 310404.26870974735
Root Mean Squared Error (RMSE): 557.1393620179311
R-squared (R2) Score: 0.9806875690581109


In [24]:
# RMSE has better results on Random Forest than Linear Regression

# Model Deployment

In [25]:
test_data = pd.read_csv('../data/diamonds_test.csv')

test_data. info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  object 
 3   color    13485 non-null  object 
 4   clarity  13485 non-null  object 
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
 10  city     13485 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 1.1+ MB


In [26]:
test_data.columns
test_data = test_data.drop('city', axis=1)

In [27]:
label_cut = {'Premium': 4, 'Very Good': 3, 'Fair': 2, 'Good':1,'Ideal': 5}
label_clarity = {'IF': 8, 'VVS1': 7, 'VVS2': 6,'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2':2,'I1':1}
label_color = {'J':7, 'H':6, 'G':5, 'D':4, 'F':3, 'E':2, 'I':1}
# Apply the mapping to the column
test_data['cut'] = test_data['cut'].map(label_cut)
test_data['clarity'] = test_data['clarity'].map(label_clarity)
test_data['color'] = test_data['color'].map(label_color)

# Apply normal label encoding to the 'city' column
test_data_clean = test_data.drop(['id'], axis=1)
test_data_clean

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.79,3,3,3,62.7,60.0,5.82,5.89,3.67
1,1.20,5,7,5,61.0,57.0,6.81,6.89,4.18
2,1.57,4,6,3,62.2,61.0,7.38,7.32,4.57
3,0.90,3,3,3,63.8,54.0,6.09,6.13,3.90
4,0.50,3,3,5,62.9,58.0,5.05,5.09,3.19
...,...,...,...,...,...,...,...,...,...
13480,0.57,5,2,3,61.9,56.0,5.35,5.32,3.30
13481,0.71,5,1,4,62.2,55.0,5.71,5.73,3.56
13482,0.70,5,3,5,61.6,55.0,5.75,5.71,3.53
13483,0.70,3,3,2,58.8,57.0,5.85,5.89,3.45


In [28]:
scaler = RobustScaler()
test_data_scaled = scaler.fit_transform(test_data_clean)
test_data_scaled

array([[ 0.140625  , -0.5       , -0.33333333, ...,  0.06666667,
         0.09444444,  0.125     ],
       [ 0.78125   ,  0.5       ,  1.        , ...,  0.61666667,
         0.65      ,  0.58035714],
       [ 1.359375  ,  0.        ,  0.66666667, ...,  0.93333333,
         0.88888889,  0.92857143],
       ...,
       [ 0.        ,  0.5       , -0.33333333, ...,  0.02777778,
        -0.00555556,  0.        ],
       [ 0.        , -0.5       , -0.33333333, ...,  0.08333333,
         0.09444444, -0.07142857],
       [-0.46875   ,  0.5       , -1.        , ..., -0.55555556,
        -0.55      , -0.52678571]])

In [29]:
new_predictions = random_forest_model.predict(test_data_scaled)
new_predictions

array([ 4975.138     ,  6105.753     , 10466.25666667, ...,
        2616.402     ,  1628.184     ,  2096.622     ])

In [30]:
test_data_scaled = pd.DataFrame(new_predictions, columns=['price'])
test_data_scaled 

Unnamed: 0,price
0,4975.138000
1,6105.753000
2,10466.256667
3,10094.146000
4,3942.388000
...,...
13480,1714.286000
13481,2823.224000
13482,2616.402000
13483,1628.184000


In [31]:

test_data_scaled['id'] = np.arange(test_data_scaled.shape[0])
test_data_scaled=test_data_scaled[['id','price']]
test_data_scaled

Unnamed: 0,id,price
0,0,4975.138000
1,1,6105.753000
2,2,10466.256667
3,3,10094.146000
4,4,3942.388000
...,...,...
13480,13480,1714.286000
13481,13481,2823.224000
13482,13482,2616.402000
13483,13483,1628.184000


In [32]:
test_data_scaled.to_csv('../delivery/price_deliveryrestart11.csv', index=False) 