In [1]:
import warnings
warnings.filterwarnings('ignore')

## Import the required Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load the Data

In [3]:
df = pd.read_csv('data/diamonds.csv')

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


`price` price in US dollars (\\$ 326 - \\$ 18,823)

`carat` weight of the diamond (0.2 - 5.01)

`cut` quality of the cut (Fair, Good, Very Good, Premium, Ideal)

`color` diamond colour, from J (worst) to D (best)

`clarity` a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

`x` length in mm (0 - 10.74)

`y` width in mm (0 - 58.9)

`z` depth in mm (0 - 31.8)

`depth` total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43 - 79)

`table` width of top of diamond relative to widest point (43 - 95)

In [4]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [5]:
df.shape

(53940, 10)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [7]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

### Machine Learning Problem

**Build a system which can take features of diamond like carat, cut, color, clarity, x, y, z, etc.. and `predicts the price of diamond`.**

Target Variable: ______

In [8]:
df = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']]
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


`price` price in US dollars (\\$ 326 - \\$ 18,823)

`carat` weight of the diamond (0.2 - 5.01)

`cut` quality of the cut (Fair, Good, Very Good, Premium, Ideal)

`color` diamond colour, from J (worst) to D (best)

`clarity` a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

`x` length in mm (0 - 10.74)

`y` width in mm (0 - 58.9)

`z` depth in mm (0 - 31.8)

`depth` total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43 - 79)

`table` width of top of diamond relative to widest point (43 - 95)

## Exploratory Data Analysis

- Univariate Analysis **`(Assignment)`**
- Bivariate Analysis **`(Assignment)`**
- Missing Values **`(Assignment)`**
- Outliers **`(Assignment)`**

## Data Preparation

- Encoding for Categorical Columns 
    - Ordinal : LabelEncoding or OrdinalEncoding
    - Nominal : OneHotEncoding or get_dummies
- Encoding for Numerical Columns
    - Standardization (z-transformation)
- Train Test Split

**We will be following below mentioned steps:**  
a. Separating Categorical and Numerical Columns  
b. Applying OneHotEncoding on Categorical Columns  
c. Encoding Ordinal Columns  
d. Rescaling Numerical Columns (Standardization or z-transformation)  
e. Concatinating the Encoded Categorical Features and Scaled Numerical Features  
f. Splitting data into train and test

### a. Separating Categorical and Numerical Columns: 

In [9]:
df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [10]:
# Setting the target (y) and inputs (X)

target = df['price']
inputs = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

In [11]:
df_cat = inputs.select_dtypes(include=['object'])

df_cat.head()

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2


In [12]:
df_num = inputs.select_dtypes(include=['int64', 'float64'])

df_num.head()

Unnamed: 0,carat,depth,table,x,y,z
0,0.23,61.5,55.0,3.95,3.98,2.43
1,0.21,59.8,61.0,3.89,3.84,2.31
2,0.23,56.9,65.0,4.05,4.07,2.31
3,0.29,62.4,58.0,4.2,4.23,2.63
4,0.31,63.3,58.0,4.34,4.35,2.75


In [13]:
categorical = inputs.select_dtypes(include=['object'])

categorical.head()

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2


### b. Applying OneHotEncoding on Categorical Columns

In [14]:
# OneHotEncoding the categorical features

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)

categorical = pd.DataFrame(encoder.fit_transform(categorical), 
                      columns=encoder.get_feature_names(categorical.columns))

categorical.head()

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### c. Encoding Ordinal Columns:

In [15]:
df_cat.cut.unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [16]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

df_cat['cut'] = df_cat['cut'].apply(lambda x : cut_encoder[x])

df_cat.head()

Unnamed: 0,cut,color,clarity
0,4,E,SI2
1,5,E,SI1
2,2,E,VS1
3,5,I,VS2
4,2,J,SI2


In [17]:
df_cat.color.unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [18]:
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

df_cat['color'] = df_cat['color'].apply(lambda x : color_encoder[x])

df_cat.head()

Unnamed: 0,cut,color,clarity
0,4,6,SI2
1,5,6,SI1
2,2,6,VS1
3,5,2,VS2
4,2,1,SI2


In [19]:
df_cat.clarity.unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [20]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

df_cat['clarity'] = df_cat['clarity'].apply(lambda x : clarity_encoder[x])

df_cat.head()

Unnamed: 0,cut,color,clarity
0,4,6,2
1,5,6,3
2,2,6,5
3,5,2,4
4,2,1,2


### d. Scaling the Numerical Features

In [21]:
# scaling the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)

df_num = pd.DataFrame(scaler.fit_transform(df_num), 
                      columns = df_num.columns)

df_num.head()

Unnamed: 0,carat,depth,table,x,y,z
0,-1.198168,-0.174092,-1.099672,-1.587837,-1.536196,-1.571129
1,-1.240361,-1.360738,1.585529,-1.641325,-1.658774,-1.741175
2,-1.198168,-3.385019,3.375663,-1.498691,-1.457395,-1.741175
3,-1.071587,0.454133,0.242928,-1.364971,-1.317305,-1.28772
4,-1.029394,1.082358,0.242928,-1.240167,-1.212238,-1.117674


### e. Concatinating the Encoded Categorical Features and Scaled Numerical Features:

In [22]:
inputs = pd.concat([df_num, df_cat], axis=1)

inputs.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-1.198168,-0.174092,-1.099672,-1.587837,-1.536196,-1.571129,4,6,2
1,-1.240361,-1.360738,1.585529,-1.641325,-1.658774,-1.741175,5,6,3
2,-1.198168,-3.385019,3.375663,-1.498691,-1.457395,-1.741175,2,6,5
3,-1.071587,0.454133,0.242928,-1.364971,-1.317305,-1.28772,5,2,4
4,-1.029394,1.082358,0.242928,-1.240167,-1.212238,-1.117674,2,1,2


### f. Splitting data into Train and Test DataFrame

In [23]:
# split into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(inputs, target, train_size=0.7, random_state=100)

In [24]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(37758, 9) (37758,)
(16182, 9) (16182,)


## Linear Regression

In [25]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [26]:
y_test_pred = regressor.predict(X_test)

In [27]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,2363.83474
21073,9248,7469.644228
42161,1284,643.298938
35974,921,1516.80821
7641,4268,5721.128606


In [28]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  795.4560135879743
Mean Squared Error:  1471939.1560759977
Root Mean Squared Error:  1213.2349962295011


## KNN Regression

In [29]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
regressor.fit(X_train, y_train)

KNeighborsRegressor()

In [30]:
y_test_pred = regressor.predict(X_test)

In [31]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1779.6
21073,9248,9000.6
42161,1284,1136.6
35974,921,960.4
7641,4268,5510.6


In [32]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  383.5874057594859
Mean Squared Error:  564774.9778049685
Root Mean Squared Error:  751.5151214745905


## Decision Tree Regression

In [33]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

DecisionTreeRegressor()

In [34]:
y_test_pred = regressor.predict(X_test)

In [35]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1625.0
21073,9248,9625.0
42161,1284,1237.0
35974,921,1031.0
7641,4268,4844.0


In [36]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  363.6495694804927
Mean Squared Error:  551096.5729582938
Root Mean Squared Error:  742.3587899111143


## Random Forest Regression

In [37]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

RandomForestRegressor()

In [38]:
y_test_pred = regressor.predict(X_test)

In [39]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1827.08
21073,9248,8602.81
42161,1284,1239.7
35974,921,1029.92
7641,4268,4361.03


In [40]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  270.0958314929091
Mean Squared Error:  290077.25192051084
Root Mean Squared Error:  538.5882025448672
