# Multiple Linear Regression - Diamonds

### Importing the libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Importing the dataset

In [2]:
# Load the diamonds data from the CSV file next to this notebook.
dataset = pd.read_csv(filepath_or_buffer='Diamonds.csv')

In [3]:
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,carat,cut,color,clarity,table,price,length,width,depth
0,0.23,Ideal,E,SI2,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,58.0,335,4.34,4.35,2.75


### Split the dataset into independent and dependent variables

In [4]:
# List the column names so the feature columns and the target can be chosen.
dataset.columns

Index(['carat', 'cut', 'color', 'clarity', 'table', 'price', 'length', 'width',
       'depth'],
      dtype='object')

In [5]:
# Independent variables: every column except the target.
# `drop` must be called on the full frame; it returns a new DataFrame and
# leaves `dataset` untouched.
X = dataset.drop(columns=['price'])

# Dependent variable: the price, kept as a one-column DataFrame (2-D).
Y = dataset[['price']]

### Work with the categorical data

In [6]:
# Feature columns (everything except the target 'price').
feature_columns = ['carat', 'cut', 'color', 'clarity', 'table', 'length', 'width', 'depth']

# One-hot encode the categorical features; drop_first=True drops the first
# level of each one so it becomes the all-zeros baseline.
X = pd.get_dummies(dataset[feature_columns], drop_first=True)

In [7]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

Unnamed: 0,carat,table,length,width,depth,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,55.0,3.95,3.98,2.43,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.21,61.0,3.89,3.84,2.31,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,65.0,4.05,4.07,2.31,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.29,58.0,4.2,4.23,2.63,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0.31,58.0,4.34,4.35,2.75,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


## EXPERIMENT 1

### Splitting the dataset into the Training set, Test set, and Validation Set

In [8]:
from sklearn.model_selection import train_test_split

# Experiment 1: hold out 20% of the rows; the other 80% is the training set.
# random_state is fixed so the split is reproducible.
X_train, X_rest, Y_train, Y_rest = train_test_split(
    X, Y, test_size=0.2, random_state=50
)

In [9]:
# Split the held-out rows in half: one half for testing, one for validation.
X_test, X_val, Y_test, Y_val = train_test_split(
    X_rest, Y_rest, test_size=0.5, random_state=50
)

### Training the Multiple Linear Regression model on the Training set

In [10]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training rows. The features are passed
# as a plain NumPy array, matching how predict() is called later.
regressor = LinearRegression()
regressor.fit(X=X_train.to_numpy(), y=Y_train)

LinearRegression()

### Intercept and Coefficient

In [11]:
# Show the fitted parameters, one labelled line each.
for label, value in (
    ('Coefficients: ', regressor.coef_),
    ("Intercept: ", regressor.intercept_),
):
    print(label, value)

Coefficients:  [[11094.745265     -15.49460052  -761.38613887   154.51119774
   -578.22760163   633.50815934   928.67606831   840.12318761
    797.74150935  -215.57812995  -284.16010361  -486.37775246
   -979.17044522 -1463.87590708 -2380.46439866  5335.62630699
   3656.60365153  2697.64473009  4594.85787736  4260.01818025
   5002.59438188  4951.82217531]]
Intercept:  [-2706.90335638]


### Validation set

In [12]:
# Predict prices for the validation rows and display them.
y_predval = regressor.predict(X_val.to_numpy())
y_predval

array([[1121.47386992],
       [6367.26966873],
       [5071.4465916 ],
       ...,
       [2197.85027048],
       [6264.1573314 ],
       [2352.10886992]])

In [13]:
# Actual validation prices as a NumPy array, for side-by-side comparison.
Y_validate = np.asarray(Y_val)

In [14]:
# Column 0: predicted price, column 1: actual price (rounded to whole units).
np.concatenate((y_predval, Y_validate), axis=1).round()

array([[1121.,  828.],
       [6367., 5632.],
       [5071., 5129.],
       ...,
       [2198., 1621.],
       [6264., 5625.],
       [2352., 1754.]])

### Predicting the Test set results

In [15]:
# Predict prices for the test rows.
y_pred = regressor.predict(X_test.to_numpy())

### Calculate MSE, R-Square

In [16]:
from sklearn.metrics import mean_squared_error, r2_score
import math

# Score the test-set predictions against the actual prices.
test_r2 = r2_score(Y_test, y_pred)
test_mse = mean_squared_error(Y_test, y_pred)

print(f'R squared: {test_r2:.2f}')
print(f'MSE: {test_mse:.2f}')

R squared: 0.92
MSE: 1322135.88


#### CATEGORICAL VARIABLE PATTERNS
     - CUT
         Good =      1000
         Ideal =     0100
         Premium =   0010
         Very Good = 0001
         Fair =      0000
     - COLOR
         D = 000000
         E = 100000
         F = 010000
         G = 001000
         H = 000100
         I = 000010
         J = 000001
     - CLARITY
         I1 =   0000000
         IF =   1000000
         SI1 =  0100000
         SI2 =  0010000
         VS1 =  0001000
         VS2 =  0000100
         VVS1 = 0000010
         VVS2 = 0000001

## EXPERIMENT 2

### Splitting the dataset into the Training set, Test set, and Validation Set

In [17]:
from sklearn.model_selection import train_test_split

# Experiment 2: hold out 30% of the rows; the other 70% is the training set.
# random_state is fixed so the split is reproducible.
X_train, X_rest, Y_train, Y_rest = train_test_split(
    X, Y, test_size=0.3, random_state=50
)

In [18]:
# Split the held-out rows in half: one half for testing, one for validation.
X_test, X_val, Y_test, Y_val = train_test_split(
    X_rest, Y_rest, test_size=0.5, random_state=50
)

### Training the Multiple Linear Regression model on the Training set

In [19]:
from sklearn.linear_model import LinearRegression

# Fit a fresh model on this experiment's training rows. The features are
# passed as a plain NumPy array, matching how predict() is called later.
regressor = LinearRegression()
regressor.fit(X=X_train.to_numpy(), y=Y_train)

LinearRegression()

### Intercept and Coefficient

In [20]:
# Show the fitted parameters, one labelled line each.
for label, value in (
    ('Coefficients: ', regressor.coef_),
    ("Intercept: ", regressor.intercept_),
):
    print(label, value)

Coefficients:  [[11028.55832948   -15.44254771  -812.05080618   199.32159793
   -534.11110132   626.24865184   924.30611456   847.24100616
    791.44838188  -231.70109293  -295.16406283  -497.90493262
   -980.30171618 -1470.93354145 -2381.78276996  5395.04637709
   3709.87280211  2763.27427282  4642.03672715  4314.4265382
   5061.83203261  5008.08750941]]
Intercept:  [-2825.31983799]


### Validation set

In [21]:
# Predict prices for the validation rows and display them.
y_predval = regressor.predict(X_val.to_numpy())
y_predval

array([[ 7286.65765759],
       [15314.33342971],
       [ 3608.59627173],
       ...,
       [ 3690.79914446],
       [  582.19099964],
       [ 2500.91911297]])

In [22]:
# Actual validation prices as a NumPy array, for side-by-side comparison.
Y_validate = np.asarray(Y_val)

In [23]:
# Column 0: predicted price, column 1: actual price (rounded to whole units).
np.concatenate((y_predval, Y_validate), axis=1).round()

array([[ 7287.,  5318.],
       [15314., 13753.],
       [ 3609.,  3483.],
       ...,
       [ 3691.,  2784.],
       [  582.,   687.],
       [ 2501.,  1792.]])

### Predicting the Test set results

In [24]:
# Predict prices for the test rows.
y_pred = regressor.predict(X_test.to_numpy())

### Calculate MSE, R-Square

In [25]:
from sklearn.metrics import mean_squared_error, r2_score
import math

# Score the test-set predictions against the actual prices.
test_r2 = r2_score(Y_test, y_pred)
test_mse = mean_squared_error(Y_test, y_pred)

print(f'R squared: {test_r2:.2f}')
print(f'MSE: {test_mse:.2f}')

R squared: 0.92
MSE: 1314244.08


## EXPERIMENT 3

### Splitting the dataset into the Training set, Test set, and Validation Set

In [26]:
from sklearn.model_selection import train_test_split

# Experiment 3: hold out 10% of the rows; the other 90% is the training set.
# random_state is fixed so the split is reproducible.
X_train, X_rest, Y_train, Y_rest = train_test_split(
    X, Y, test_size=0.1, random_state=50
)

In [27]:
# Split the held-out rows in half: one half for testing, one for validation.
X_test, X_val, Y_test, Y_val = train_test_split(
    X_rest, Y_rest, test_size=0.5, random_state=50
)

### Training the Multiple Linear Regression model on the Training set

In [28]:
from sklearn.linear_model import LinearRegression

# Fit a fresh model on this experiment's training rows. The features are
# passed as a plain NumPy array, matching how predict() is called later.
regressor = LinearRegression()
regressor.fit(X=X_train.to_numpy(), y=Y_train)

LinearRegression()

### Intercept and Coefficient

In [29]:
# Show the fitted parameters, one labelled line each.
for label, value in (
    ('Coefficients: ', regressor.coef_),
    ("Intercept: ", regressor.intercept_),
):
    print(label, value)

Coefficients:  [[11120.47306928   -11.29610169  -973.38787436   154.47022315
   -244.54567365   657.156262     981.2090724    894.36349392
    837.76801929  -209.91844095  -274.02769822  -487.77258804
   -986.05228202 -1466.02728624 -2377.93070367  5364.740656
   3669.41899655  2713.88658902  4607.74694676  4279.41739264
   5034.73373074  4967.10331991]]
Intercept:  [-2997.78549962]


### Validation set

In [30]:
# Predict prices for the validation rows and display them.
y_predval = regressor.predict(X_val.to_numpy())
y_predval

array([[1593.25245448],
       [3967.2173702 ],
       [ 854.50694343],
       ...,
       [9183.44509518],
       [6282.92374129],
       [7489.51785106]])

In [31]:
# Actual validation prices as a NumPy array, for side-by-side comparison.
Y_validate = np.asarray(Y_val)

In [32]:
# Column 0: predicted price, column 1: actual price (rounded to whole units).
np.concatenate((y_predval, Y_validate), axis=1).round()

array([[1593.,  897.],
       [3967., 3171.],
       [ 855.,  960.],
       ...,
       [9183., 8245.],
       [6283., 5030.],
       [7490., 6175.]])

### Predicting the Test set results

In [33]:
# Predict prices for the test rows.
y_pred = regressor.predict(X_test.to_numpy())

### Calculate MSE, R-Square

In [34]:
from sklearn.metrics import mean_squared_error, r2_score
import math

# Score the test-set predictions against the actual prices.
test_r2 = r2_score(Y_test, y_pred)
test_mse = mean_squared_error(Y_test, y_pred)

print(f'R squared: {test_r2:.2f}')
print(f'MSE: {test_mse:.2f}')

R squared: 0.92
MSE: 1292091.16
