In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split # module to split our data into train and test sets
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm
import statsmodels.tools 

%matplotlib inline

  import pandas.util.testing as tm


In [None]:
# Load the diamonds dataset from the hosted CSV into a DataFrame
diamonds = pd.read_csv(
    filepath_or_buffer="https://resources.digitalfutures.com/data-science/diamonds.csv"
)
# Preview the first five rows
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
# Distinct categories present in the `cut` column
diamonds['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [None]:
# Standardise the numeric columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on the full dataset, including rows that
# later end up in the test split, and the target `price` is scaled too, so
# downstream errors are expressed in standardised units rather than the
# original price units. Consider fitting the scaler on training rows only.

to_scale = ['carat','depth','table','price','x','y','z']

st_scaler = StandardScaler()
scaled_values = st_scaler.fit_transform(diamonds[to_scale])
# Carry the source index over explicitly: the one-hot frame is joined on the
# index later, so a fresh 0..n-1 index here would misalign rows whenever
# `diamonds` is not indexed 0..n-1.
diamonds_scaled = pd.DataFrame(scaled_values, columns=to_scale, index=diamonds.index)

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable, which then acts as the baseline category.
to_ohe = ['cut','color','clarity']

# dtype=int keeps the dummies as 0/1 integers. pandas >= 2.0 defaults to bool,
# and a frame mixing bool and float columns converts to an object array,
# which statsmodels' OLS refuses to fit.
one_hot_encoded = pd.get_dummies(
    diamonds[to_ohe],
    columns=to_ohe,
    prefix=to_ohe,
    drop_first=True,
    dtype=int,
)

In [None]:
# Attach the one-hot columns to the scaled numeric columns, matching rows on
# the index. The raw cut/color/clarity columns are absent because only the
# numeric columns were carried into the scaled frame.
diamonds = diamonds_scaled.join(one_hot_encoded, how='left')

In [None]:
# Preview the first five rows of the scaled + encoded frame
diamonds.head(n=5)

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Good,cut_Ideal,cut_Premium,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,-1.198168,-0.174092,-1.099672,-0.904095,-1.587837,-1.536196,-1.571129,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,-1.240361,-1.360738,1.585529,-0.904095,-1.641325,-1.658774,-1.741175,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,-1.198168,-3.385019,3.375663,-0.903844,-1.498691,-1.457395,-1.741175,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,-1.071587,0.454133,0.242928,-0.90209,-1.364971,-1.317305,-1.28772,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4,-1.029394,1.082358,0.242928,-0.901839,-1.240167,-1.212238,-1.117674,1,0,0,...,0,0,1,0,0,1,0,0,0,0


## Train & Test Splitting

In [None]:
# List the column names of the combined (scaled + one-hot encoded) frame so
# the target and feature columns can be identified.
diamonds.columns

Index(['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
       'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
       'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
       'clarity_VVS2'],
      dtype='object')

In [None]:
# Every column other than the target `price` is used as a feature
feature_cols = [name for name in diamonds.columns if name != 'price']
feature_cols

['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2']

In [None]:
# Separate the target (scaled price) from the design matrix (all feature columns)
y = diamonds['price']
X = diamonds[feature_cols]

In [None]:
# Split into train/test sets: 80% of the rows for training, the rest held out.
# `random_state` is just train_test_split's seed keyword (fixed so the split is
# reproducible); nothing needs to be imported for it.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=20)


## Model Building

In [None]:
# Fit ordinary least squares of the training target (scaled price) on every
# feature column and display the regression summary.
# NOTE(review): no constant column is added to the design matrix, so the model
# is fitted without an intercept (statsmodels does not add one automatically).
# Confirm this is intended.
linreg = sm.OLS(endog=y_train, exog=X_train[feature_cols])
results = linreg.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.91
Model:,OLS,Adj. R-squared (uncentered):,0.91
Method:,Least Squares,F-statistic:,18960.0
Date:,"Thu, 30 Jun 2022",Prob (F-statistic):,0.0
Time:,09:49:58,Log-Likelihood:,-9411.1
No. Observations:,43152,AIC:,18870.0
Df Residuals:,43129,BIC:,19070.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
carat,1.3119,0.007,194.661,0.000,1.299,1.325
depth,-0.0495,0.002,-26.182,0.000,-0.053,-0.046
table,-0.0405,0.002,-21.099,0.000,-0.044,-0.037
x,-0.2700,0.010,-25.904,0.000,-0.290,-0.250
y,0.0104,0.006,1.757,0.079,-0.001,0.022
z,-0.0097,0.007,-1.446,0.148,-0.023,0.003
cut_Good,-0.1405,0.009,-15.409,0.000,-0.158,-0.123
cut_Ideal,-0.1176,0.009,-13.368,0.000,-0.135,-0.100
cut_Premium,-0.1141,0.009,-13.356,0.000,-0.131,-0.097

0,1,2,3
Omnibus:,12505.666,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,970137.654
Skew:,-0.46,Prob(JB):,0.0
Kurtosis:,26.21,Cond. No.,37.6


In [None]:
# In-sample predictions: score the training rows with the fitted model and
# store them next to the features in a `y_pred` column
X_train['y_pred'] = results.predict(exog=X_train[feature_cols])


In [None]:
# Root-mean-squared error between the training target and its predictions
rmse = statsmodels.tools.eval_measures.rmse(
    x1=y_train,
    x2=X_train['y_pred'],
)

print(rmse)

In [None]:
## evaluate in Test

# NOTE(review): this fits a second, independent OLS model on the held-out rows.
# Its summary describes that separate model; it does not measure how the model
# fitted on the training data (`results`) performs out of sample.
linreg_test = sm.OLS(endog=y_test, exog=X_test[feature_cols])
results_t = linreg_test.fit()
results_t.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.911
Model:,OLS,Adj. R-squared (uncentered):,0.91
Method:,Least Squares,F-statistic:,4766.0
Date:,"Thu, 30 Jun 2022",Prob (F-statistic):,0.0
Time:,09:50:29,Log-Likelihood:,-2149.8
No. Observations:,10788,AIC:,4346.0
Df Residuals:,10765,BIC:,4513.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
carat,1.3907,0.015,94.797,0.000,1.362,1.420
depth,-0.0543,0.004,-14.010,0.000,-0.062,-0.047
table,-0.0237,0.004,-6.407,0.000,-0.031,-0.016
x,-0.8743,0.055,-15.757,0.000,-0.983,-0.766
y,0.5164,0.056,9.144,0.000,0.406,0.627
z,0.0101,0.018,0.574,0.566,-0.024,0.045
cut_Good,-0.1909,0.018,-10.899,0.000,-0.225,-0.157
cut_Ideal,-0.1357,0.017,-8.045,0.000,-0.169,-0.103
cut_Premium,-0.1247,0.016,-7.632,0.000,-0.157,-0.093

0,1,2,3
Omnibus:,2554.34,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,94300.023
Skew:,0.404,Prob(JB):,0.0
Kurtosis:,17.462,Cond. No.,54.0


In [None]:
## evaluate in Test

# Score the held-out rows with the model fitted on the TRAINING data
# (`results`). Predicting with a model that was itself fitted on the test rows
# would report an in-sample error rather than generalisation error.
X_test['y_pred'] = results.predict(X_test[feature_cols])

# RMSE is in units of the standardised price, because the target was scaled.
rmse = statsmodels.tools.eval_measures.rmse(y_test, X_test['y_pred'])

print(rmse)

0.29532891136448575
