In [25]:
import pandas as pd
import numpy as np
from pathlib import Path

In [26]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier


In [27]:
# Locate the diamonds CSV (relative to the notebook) and load it into a DataFrame
csv_path = Path('Resources/diamonds.csv')
df = pd.read_csv(filepath_or_buffer=csv_path)
# NOTE(review): head(-1) shows the frame without its final row, not a short preview
df.head(-1)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53934,53935,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56


In [28]:
# Remove the 'Unnamed: 0' column: it is just a 1-based row counter stored in the CSV
# and duplicates the DataFrame index.
# errors='ignore' keeps the cell re-runnable; without it, running the cell a second
# time raises KeyError because the column has already been dropped.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head(-1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56


In [29]:
# List the distinct cut labels present in the data
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [30]:
# Integer code for each cut label, numbered from 1 in the order the labels
# first appear in the data.
# NOTE(review): this is appearance order, which presumably is not the quality
# ranking of the grades ('Good' is coded below 'Very Good') - confirm whether an
# ordinal encoding was intended.
cut_num = {
    grade: code
    for code, grade in enumerate(
        ['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], start=1
    )
}

In [31]:
# Encode the cut labels with the integer codes from the cut_num dictionary.
# Series.map performs the dictionary lookup directly instead of calling a Python
# lambda per row. map() would turn labels missing from the dictionary into NaN,
# so check for those first and keep the fail-fast KeyError of a plain dict lookup.
unmapped_cuts = set(df["cut"]) - set(cut_num)
if unmapped_cuts:
    raise KeyError(
        f"cut labels missing from cut_num: {sorted(unmapped_cuts, key=str)}"
    )
df["cut_num"] = df["cut"].map(cut_num)
df.head(-1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_num
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,2
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,3
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,2
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,3
...,...,...,...,...,...,...,...,...,...,...,...
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,2
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,3
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,4


In [32]:
# Bucket price into 10 quantile-based bins; precision=0 controls how the bin
# edges are rounded for display
df['p_bin'] = pd.qcut(x=df['price'], q=10, precision=0)
df.head(-1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_num,p_bin
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1,"(325.0, 646.0]"
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,2,"(325.0, 646.0]"
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,3,"(325.0, 646.0]"
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,2,"(325.0, 646.0]"
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,3,"(325.0, 646.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,2,"(2401.0, 3465.0]"
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1,"(2401.0, 3465.0]"
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,3,"(2401.0, 3465.0]"
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,4,"(2401.0, 3465.0]"


In [33]:
# Number of rows that landed in each price bin
df.p_bin.value_counts()

(325.0, 646.0]       5411
(1698.0, 2401.0]     5405
(837.0, 1087.0]      5396
(6301.0, 9821.0]     5395
(3465.0, 4662.0]     5394
(9821.0, 18823.0]    5393
(4662.0, 6301.0]     5389
(1087.0, 1698.0]     5388
(646.0, 837.0]       5385
(2401.0, 3465.0]     5384
Name: p_bin, dtype: int64

In [34]:
# Encoder used in the following cells to turn each price-bin interval into an
# integer class code
le = preprocessing.LabelEncoder()

In [35]:
# Learn the set of distinct price-bin intervals
le.fit(y=df['p_bin'])

LabelEncoder()

In [36]:
# Intervals known to the encoder; the list position is the integer code assigned
[*le.classes_]

[Interval(325.0, 646.0, closed='right'),
 Interval(646.0, 837.0, closed='right'),
 Interval(837.0, 1087.0, closed='right'),
 Interval(1087.0, 1698.0, closed='right'),
 Interval(1698.0, 2401.0, closed='right'),
 Interval(2401.0, 3465.0, closed='right'),
 Interval(3465.0, 4662.0, closed='right'),
 Interval(4662.0, 6301.0, closed='right'),
 Interval(6301.0, 9821.0, closed='right'),
 Interval(9821.0, 18823.0, closed='right')]

In [37]:
# Store the integer code of each row's price bin; this becomes the model target
df['p_bin_num'] = le.transform(y=df['p_bin'])
df.head(-1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_num,p_bin,p_bin_num
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1,"(325.0, 646.0]",0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,2,"(325.0, 646.0]",0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,3,"(325.0, 646.0]",0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,2,"(325.0, 646.0]",0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,3,"(325.0, 646.0]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,2,"(2401.0, 3465.0]",5
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1,"(2401.0, 3465.0]",5
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,3,"(2401.0, 3465.0]",5
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,4,"(2401.0, 3465.0]",5


In [38]:
# Drop the columns that have been replaced by encoded versions: 'cut' (now
# cut_num) and 'price' / 'p_bin' (now p_bin_num, the target).
# Rebinding df instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: re-running it no longer raises KeyError for columns that are
# already gone.
df = df.drop(columns=['cut', 'price', 'p_bin'], errors='ignore')
df.head(-1)

Unnamed: 0,carat,color,clarity,depth,table,x,y,z,cut_num,p_bin_num
0,0.23,E,SI2,61.5,55.0,3.95,3.98,2.43,1,0
1,0.21,E,SI1,59.8,61.0,3.89,3.84,2.31,2,0
2,0.23,E,VS1,56.9,65.0,4.05,4.07,2.31,3,0
3,0.29,I,VS2,62.4,58.0,4.20,4.23,2.63,2,0
4,0.31,J,SI2,63.3,58.0,4.34,4.35,2.75,3,0
...,...,...,...,...,...,...,...,...,...,...
53934,0.72,D,SI1,62.7,59.0,5.69,5.73,3.58,2,5
53935,0.72,D,SI1,60.8,57.0,5.75,5.76,3.50,1,5
53936,0.72,D,SI1,63.1,55.0,5.69,5.75,3.61,3,5
53937,0.70,D,SI1,62.8,60.0,5.66,5.68,3.56,4,5


In [39]:
# One-hot encode the remaining string columns: each color / clarity value becomes
# its own 0/1 indicator column (color_D, clarity_SI1, ...)
df_enc = pd.get_dummies(data=df, columns=['color', 'clarity'])
df_enc.head(-1)

Unnamed: 0,carat,depth,table,x,y,z,cut_num,p_bin_num,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,3.95,3.98,2.43,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,3.89,3.84,2.31,2,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,4.05,4.07,2.31,3,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,58.0,4.20,4.23,2.63,2,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,4.34,4.35,2.75,3,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53934,0.72,62.7,59.0,5.69,5.73,3.58,2,5,1,0,...,0,0,0,0,1,0,0,0,0,0
53935,0.72,60.8,57.0,5.75,5.76,3.50,1,5,1,0,...,0,0,0,0,1,0,0,0,0,0
53936,0.72,63.1,55.0,5.69,5.75,3.61,3,5,1,0,...,0,0,0,0,1,0,0,0,0,0
53937,0.70,62.8,60.0,5.66,5.68,3.56,4,5,1,0,...,0,0,0,0,1,0,0,0,0,0


In [40]:
# Separate the encoded frame into model inputs and the label to predict

# Target: the integer price-bin code
y = df_enc['p_bin_num']

# Features: every other column
X = df_enc.drop(columns='p_bin_num')

In [41]:
# Check the balance of our target values (row count per price-bin code)
y.value_counts()

0    5411
4    5405
2    5396
8    5395
6    5394
9    5393
7    5389
3    5388
1    5385
5    5384
Name: p_bin_num, dtype: int64

In [42]:
# Count missing values per column before modelling
df.isnull().sum()

carat        0
color        0
clarity      0
depth        0
table        0
x            0
y            0
z            0
cut_num      0
p_bin_num    0
dtype: int64

In [43]:
# Hold out a test set; random_state pins the shuffle so the split is repeatable
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=7)

In [44]:
# Creating StandardScaler instance (it is fitted on the training split below)
scaler = StandardScaler()

In [45]:
# Fit the StandardScaler on the training features only, so the test split does
# not contribute to the scaling statistics
X_scaler = scaler.fit(X=X_train)

In [46]:
# Apply the fitted scaler to both splits (training split first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [47]:
# Random forest of 500 trees; the fixed seed makes training repeatable
model = RandomForestClassifier(
    random_state=7,
    n_estimators=500,
)

In [48]:
# Train the forest on the scaled training features and their price-bin codes
model = model.fit(X=X_train_scaled, y=y_train)

In [49]:
# Predict a price-bin code for every row of the scaled test split
predictions = model.predict(X=X_test_scaled)

In [50]:
# Calculating the confusion matrix
# Pin the row/column order to the classes the model was trained on. Without an
# explicit `labels`, confusion_matrix only covers classes that occur in y_test or
# predictions, so a missing class would shrink the matrix and make the labelled
# DataFrame below fail with a shape mismatch.
class_labels = model.classes_
n_classes = len(class_labels)
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Rows are actual classes, columns are predicted classes. The display labels are
# 1-based, so 'Actual N' / 'Predicted N' refers to the N-th class in class_labels.
# They are generated from the class count instead of being hard-coded for
# exactly ten bins.
cm_df = pd.DataFrame(
    cm,
    index=[f'Actual {pos}' for pos in range(1, n_classes + 1)],
    columns=[f'Predicted {pos}' for pos in range(1, n_classes + 1)],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [51]:
# Show the confusion matrix, the overall accuracy, and the per-class metrics
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7,Predicted 8,Predicted 9,Predicted 10
Actual 1,1209,151,1,0,0,0,0,0,0,0
Actual 2,154,1041,151,2,0,0,0,0,0,0
Actual 3,2,188,1059,95,1,0,0,0,0,0
Actual 4,0,5,136,1092,131,2,0,0,0,0
Actual 5,0,0,1,119,1110,151,0,0,0,0
Actual 6,0,0,0,1,117,1067,116,5,0,0
Actual 7,0,0,0,1,3,123,1012,201,1,0
Actual 8,0,0,0,0,0,2,160,1031,128,0
Actual 9,0,0,0,0,0,1,2,122,1150,97
Actual 10,1,0,0,0,0,0,0,0,88,1255


Accuracy Score : 0.817649239896181
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1361
           1       0.75      0.77      0.76      1348
           2       0.79      0.79      0.79      1345
           3       0.83      0.80      0.82      1366
           4       0.81      0.80      0.81      1381
           5       0.79      0.82      0.80      1306
           6       0.78      0.75      0.77      1341
           7       0.76      0.78      0.77      1321
           8       0.84      0.84      0.84      1372
           9       0.93      0.93      0.93      1344

    accuracy                           0.82     13485
   macro avg       0.82      0.82      0.82     13485
weighted avg       0.82      0.82      0.82     13485

