## Analyzing Diamond Price Data 

Data source: https://www.kaggle.com/datasets/shivam2503/diamonds

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
data = pd.read_csv("diamonds.csv")
data

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [3]:
data.drop(data.columns[0], axis=1, inplace=True)

In [4]:
y = data['price']
X = data.drop('price', axis=1)

In [5]:
y

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53940, dtype: int64

In [6]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [7]:
print(f"Cuts: {len(X['cut'].unique())}")
print(f"Colors: {len(X['color'].unique())}")
print(f"Clarities: {len(X['clarity'].unique())}")

Cuts: 5
Colors: 7
Clarities: 8


In [10]:
encoder = LabelEncoder()

X['cut'] = encoder.fit_transform(X['cut'])
cut_mappings = {index: label for index, label in enumerate(encoder.classes_)}

X['color'] = encoder.fit_transform(X['color'])
color_mappings = {index: label for index, label in enumerate(encoder.classes_)}

X['clarity'] = encoder.fit_transform(X['clarity'])
clarity_mappings = {index: label for index, label in enumerate(encoder.classes_)}

In [11]:
print(cut_mappings)
print(color_mappings)
print(clarity_mappings)

{0: 'Fair', 1: 'Good', 2: 'Ideal', 3: 'Premium', 4: 'Very Good'}
{0: 'D', 1: 'E', 2: 'F', 3: 'G', 4: 'H', 5: 'I', 6: 'J'}
{0: 'I1', 1: 'IF', 2: 'SI1', 3: 'SI2', 4: 'VS1', 5: 'VS2', 6: 'VVS1', 7: 'VVS2'}


In [12]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,2,0,2,60.8,57.0,5.75,5.76,3.50
53936,0.72,1,0,2,63.1,55.0,5.69,5.75,3.61
53937,0.70,4,0,2,62.8,60.0,5.66,5.68,3.56
53938,0.86,3,4,3,61.0,58.0,6.15,6.12,3.74


In [13]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.006237,0.50,0.166667,0.428571,0.513889,0.230769,0.367784,0.067572,0.076415
1,0.002079,0.75,0.166667,0.285714,0.466667,0.346154,0.362197,0.065195,0.072642
2,0.006237,0.25,0.166667,0.571429,0.386111,0.423077,0.377095,0.069100,0.072642
3,0.018711,0.75,0.833333,0.714286,0.538889,0.288462,0.391061,0.071817,0.082704
4,0.022869,0.25,1.000000,0.428571,0.563889,0.288462,0.404097,0.073854,0.086478
...,...,...,...,...,...,...,...,...,...
53935,0.108108,0.50,0.000000,0.285714,0.494444,0.269231,0.535382,0.097793,0.110063
53936,0.108108,0.25,0.000000,0.285714,0.558333,0.230769,0.529795,0.097623,0.113522
53937,0.103950,1.00,0.000000,0.285714,0.550000,0.326923,0.527002,0.096435,0.111950
53938,0.137214,0.75,0.666667,0.428571,0.500000,0.288462,0.572626,0.103905,0.117610


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [15]:
std_model = LinearRegression()
l1_model = Lasso(alpha=1)
l2_model = Ridge(alpha=1)

In [16]:
std_model.fit(X_train, y_train)
l1_model.fit(X_train, y_train)
l2_model.fit(X_train, y_train)

In [17]:
print(f"---Without regularization: {std_model.score(X_test, y_test)}")
print(f"Lasso (L1) regularization: {l1_model.score(X_test, y_test)}")
print(f"Ridge (L2) regularization: {l2_model.score(X_test, y_test)}")

---Without regularization: 0.8837363343498376
Lasso (L1) regularization: 0.8827815408826014
Ridge (L2) regularization: 0.8832831967070176


In [27]:
l2_model = Ridge(alpha=0.25)
l2_model.fit(X_train, y_train)
print(f"Ridge (L2) regularization: {l2_model.score(X_test, y_test)}")

Ridge (L2) regularization: 0.8836598255517789
