In [1]:
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

- `price`: price in US dollars (\$326 – \$18,823)
- `carat`: weight of the diamond (0.2 - 5.01)
- `cut`: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- `color`: diamond color grade, from J (worst) to D (best): J, I, H, G, F, E, D
- `clarity`: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- `x`: length in mm (0 - 10.74)
- `y`: width in mm (0 - 58.9)
- `z`: depth in mm (0 - 31.8)

In [2]:
# Read the diamonds table, using the first CSV column as the row index,
# then show (rows, columns) as a quick sanity check.
csv_path = "./data/diamonds.csv"
d = pd.read_csv(csv_path, index_col=0)
d.shape

(53940, 10)

In [3]:
# Preview the first seven rows of the frame.
d.head(7)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47


### Preprocessing

In [5]:
# Ordinal-encode `color` as its position on the worst-to-best scale
# J < I < H < G < F < E < D (J -> 0, ..., D -> 6).
color_rank = {grade: rank for rank, grade in enumerate("JIHGFED")}

# Only encode while the column still holds letter grades, so re-running this
# cell on the already-encoded frame is a no-op instead of a ValueError.
if not pd.api.types.is_numeric_dtype(d["color"]):
    color_codes = d["color"].map(color_rank)
    if color_codes.isna().any():
        # Fail loudly on grades outside the scale rather than leaving NaNs behind.
        unknown = sorted(d.loc[color_codes.isna(), "color"].astype(str).unique())
        raise ValueError(f"unexpected color grade(s): {unknown}")
    d["color"] = color_codes.astype("int64")
d.head(7)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,5,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,5,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,5,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,1,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,0,SI2,63.3,58.0,335,4.34,4.35,2.75
6,0.24,Very Good,0,VVS2,62.8,57.0,336,3.94,3.96,2.48
7,0.24,Very Good,1,VVS1,62.3,57.0,336,3.95,3.98,2.47


In [6]:
# Ordinal-encode `cut` as its position in the grade list below
# (Fair -> 0, ..., Ideal -> 4).
cut_rank = {
    grade: rank
    for rank, grade in enumerate(['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'])
}

# Only encode while the column still holds grade labels, so re-running this
# cell on the already-encoded frame is a no-op instead of a ValueError.
if not pd.api.types.is_numeric_dtype(d["cut"]):
    cut_codes = d["cut"].map(cut_rank)
    if cut_codes.isna().any():
        # Fail loudly on labels outside the list rather than leaving NaNs behind.
        unknown = sorted(d.loc[cut_codes.isna(), "cut"].astype(str).unique())
        raise ValueError(f"unexpected cut grade(s): {unknown}")
    d["cut"] = cut_codes.astype("int64")
d.head(7)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,4,5,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,3,5,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,1,5,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,3,1,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,1,0,SI2,63.3,58.0,335,4.34,4.35,2.75
6,0.24,2,0,VVS2,62.8,57.0,336,3.94,3.96,2.48
7,0.24,2,1,VVS1,62.3,57.0,336,3.95,3.98,2.47


In [7]:
# Ordinal-encode `clarity` as its position on the worst-to-best scale
# I1 < SI2 < SI1 < VS2 < VS1 < VVS2 < VVS1 < IF (I1 -> 0, ..., IF -> 7).
clarity_rank = {
    grade: rank
    for rank, grade in enumerate(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'])
}

# Only encode while the column still holds grade labels, so re-running this
# cell on the already-encoded frame is a no-op instead of a ValueError.
if not pd.api.types.is_numeric_dtype(d["clarity"]):
    clarity_codes = d["clarity"].map(clarity_rank)
    if clarity_codes.isna().any():
        # Fail loudly on labels outside the scale rather than leaving NaNs behind.
        unknown = sorted(d.loc[clarity_codes.isna(), "clarity"].astype(str).unique())
        raise ValueError(f"unexpected clarity grade(s): {unknown}")
    d["clarity"] = clarity_codes.astype("int64")
d.head(7)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,4,5,1,61.5,55.0,326,3.95,3.98,2.43
2,0.21,3,5,2,59.8,61.0,326,3.89,3.84,2.31
3,0.23,1,5,4,56.9,65.0,327,4.05,4.07,2.31
4,0.29,3,1,3,62.4,58.0,334,4.2,4.23,2.63
5,0.31,1,0,1,63.3,58.0,335,4.34,4.35,2.75
6,0.24,2,0,5,62.8,57.0,336,3.94,3.96,2.48
7,0.24,2,1,6,62.3,57.0,336,3.95,3.98,2.47


In [8]:
# Summary statistics for every column of the frame.
# NOTE(review): the output shows a minimum of 0.0 for x, y and z — presumably
# missing measurements stored as zero; confirm before relying on those columns.
d.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,2.904097,3.405803,3.05102,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.1166,1.701105,1.647136,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,0.0,0.0,0.0,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,2.0,2.0,2.0,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,3.0,3.0,3.0,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,4.0,5.0,4.0,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,4.0,6.0,7.0,79.0,95.0,18823.0,10.74,58.9,31.8


In [9]:
# Target is `price`; every other column is kept as a feature.
feature_names = [name for name in d.columns if name != "price"]
X = d[feature_names]
Y = d["price"]

In [10]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=1,
)
X_train.shape

(37758, 9)

In [11]:
# Size of the held-out test split as (rows, feature columns).
X_test.shape

(16182, 9)

In [12]:
# Model 1: linear regression trained on every feature except cut, color and clarity.
model = LinearRegression()

In [13]:
# Build train/test feature frames without the three grade columns.
excluded_features = ("cut", "color", "clarity")
new_X_train = X_train[
    [name for name in X_train.columns if name not in excluded_features]
]
new_X_test = X_test[
    [name for name in X_test.columns if name not in excluded_features]
]

In [14]:
# Fit the regression on the reduced training features.
model.fit(X=new_X_train, y=Y_train)

In [15]:
# Predict prices for the held-out rows using the same reduced feature set.
Y_pred = model.predict(X=new_X_test)

In [16]:
# Mean absolute error of model 1 on the test split (same units as price).
mean_absolute_error(y_true=Y_test, y_pred=Y_pred)

886.7702123501662

In [17]:
# Root mean squared error of model 1 on the test split.
sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred))

1458.672207781572

In [18]:
# Model 2: linear regression on carat plus the encoded cut, color and clarity grades.
columns = ["carat", "cut", "color", "clarity"]
model = LinearRegression().fit(X_train[columns], Y_train)
# fit() returns the estimator itself, so displaying `model` shows the same fitted object.
model

In [19]:
# Predict test-split prices from the same four columns the model was fitted on.
Y_pred = model.predict(X=X_test[columns])

In [20]:
# Mean absolute error of model 2 on the test split (same units as price).
mean_absolute_error(y_true=Y_test, y_pred=Y_pred)

846.1694717317039

In [21]:
# Root mean squared error of model 2 on the test split.
sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred))

1208.415882782286

### Training with Carat, Clarity

In [22]:
# Model 3: fit on clarity and carat only, then predict prices for the test split.
columns = ["clarity", "carat"]
model = LinearRegression().fit(X_train[columns], Y_train)
Y_pred = model.predict(X=X_test[columns])

In [23]:
# Mean absolute error of model 3 on the test split (same units as price).
mean_absolute_error(y_true=Y_test, y_pred=Y_pred)

902.0705563892825

In [24]:
# Root mean squared error of model 3 on the test split.
sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred))

1323.6203317230902