# Gradient Boosting Machine using Scikit-Learn

#### Python Imports

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

#### Load and display the Diamond Prices data set

Source: [R Data Sets - Diamond Pricing](https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/Diamond.csv)

Data Dictionary:

    - carat : weight of diamond stones in carat unit
    - colour: a factor with levels (D,E,F,G,H,I). D is the highest quality
    - clarity: a factor with levels (IF,VVS1,VVS2,VS1,VS2). IF is the highest quality
    - certification: certification body, a factor with levels ( GIA, IGI, HRD)
    - price: price in Singapore dollars

In [17]:
# Local copy of the Diamond Prices CSV; only the columns described in the
# data dictionary are read
url = './data/diamond-prices.csv'
diamond_prices_df = pd.read_csv(
    url,
    usecols=['carat', 'colour', 'clarity', 'certification', 'price'],
)
diamond_prices_df

Unnamed: 0,carat,colour,clarity,certification,price
0,0.30,D,VS2,GIA,1302
1,0.30,E,VS1,GIA,1510
2,0.30,G,VVS1,GIA,1510
3,0.30,G,VS1,GIA,1260
4,0.31,D,VS1,GIA,1641
...,...,...,...,...,...
303,1.01,I,VS1,HRD,8175
304,1.02,F,VVS2,HRD,10796
305,1.06,H,VVS2,HRD,9890
306,1.02,H,VS2,HRD,8959


#### Using the data dictionary, create a Python dict that maps each ordinal feature to a dictionary of its labels and their respective numerical values

In [18]:
# Labels are listed from highest to lowest quality (per the data dictionary),
# so the best grade gets the largest number and the lowest grade gets 1
ord_features_dict = {
    'colour': dict(zip(['D', 'E', 'F', 'G', 'H', 'I'], range(6, 0, -1))),
    'clarity': dict(zip(['IF', 'VVS1', 'VVS2', 'VS1', 'VS2'], range(5, 0, -1))),
}

#### Replace the ordinal feature labels with their numerical equivalent in the data set

In [19]:
# Replace the labels of each ordinal column with their numerical rank
for key, val in ord_features_dict.items():
    labels = diamond_prices_df[key]
    # Skip a column that is already numeric so that re-running this cell does
    # not turn the previously encoded values into NaN
    if pd.api.types.is_numeric_dtype(labels):
        continue
    encoded = labels.map(val)
    # Series.map yields NaN for any label missing from the mapping; fail loudly
    # rather than passing silently corrupted features on to the model
    unmapped = labels[encoded.isna() & labels.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped labels in column '{key}': {list(unmapped)}")
    diamond_prices_df[key] = encoded
diamond_prices_df

Unnamed: 0,carat,colour,clarity,certification,price
0,0.30,6,1,GIA,1302
1,0.30,5,2,GIA,1510
2,0.30,3,4,GIA,1510
3,0.30,3,2,GIA,1260
4,0.31,6,2,GIA,1641
...,...,...,...,...,...
303,1.01,1,2,HRD,8175
304,1.02,4,3,HRD,10796
305,1.06,2,3,HRD,9890
306,1.02,2,1,HRD,8959


#### Create dummy binary variables for the nominal feature `certification`

In [20]:
# One-hot encode the nominal `certification` column. The dtype is pinned
# because pandas >= 2.0 returns bool dummy columns by default, whereas older
# releases return uint8; pinning keeps the 0/1 columns identical across versions
cert_encoded_df = pd.get_dummies(
    diamond_prices_df[['certification']],
    prefix_sep='.',
    sparse=False,
    dtype='uint8',
)
cert_encoded_df

Unnamed: 0,certification.GIA,certification.HRD,certification.IGI
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
303,0,1,0
304,0,1,0
305,0,1,0
306,0,1,0


#### Drop the nominal feature and replace with dummy variables

In [21]:
# Swap the original `certification` column for its dummy columns, which are
# appended on the right of the remaining columns
diamond_prices_df = pd.concat(
    [diamond_prices_df.drop(columns='certification'), cert_encoded_df],
    axis='columns',
)
diamond_prices_df

Unnamed: 0,carat,colour,clarity,price,certification.GIA,certification.HRD,certification.IGI
0,0.30,6,1,1302,1,0,0
1,0.30,5,2,1510,1,0,0
2,0.30,3,4,1510,1,0,0
3,0.30,3,2,1260,1,0,0
4,0.31,6,2,1641,1,0,0
...,...,...,...,...,...,...,...
303,1.01,1,2,8175,0,1,0
304,1.02,4,3,10796,0,1,0
305,1.06,2,3,9890,0,1,0
306,1.02,2,1,8959,0,1,0


#### Display the feature information about the diamond prices data set

In [22]:
# Print pandas' summary of the frame: columns, non-null counts and dtypes
diamond_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   carat              308 non-null    float64
 1   colour             308 non-null    int64  
 2   clarity            308 non-null    int64  
 3   price              308 non-null    int64  
 4   certification.GIA  308 non-null    uint8  
 5   certification.HRD  308 non-null    uint8  
 6   certification.IGI  308 non-null    uint8  
dtypes: float64(1), int64(3), uint8(3)
memory usage: 10.7 KB


#### Create the training and testing data sets

In [23]:
# Hold out 25% of the rows for testing. The whole frame (still containing
# `price`) is passed as the feature matrix, with `price` alone as the target.
X_train, X_test, y_train, y_test = train_test_split(
    diamond_prices_df,
    diamond_prices_df['price'],
    test_size=0.25,
    random_state=101,
)

#### Drop the target `price` from the training and testing data sets

In [24]:
# Remove the target column from both feature frames so the model cannot see it
X_train, X_test = (
    features.drop(columns='price') for features in (X_train, X_test)
)

#### Initialize and fit the Gradient Boosting model for Regression

The hyperparameter `max_depth` controls the maximum depth of the Decision Tree
The hyperparameter `n_estimators` indicates the number of Decision Trees (boosting stages) to use - the default is 100
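The hyperparameter `learning_rate` controls the magnitude of the coefficient $\alpha$ that scales the contribution of each Decision Tree. The smaller the value, the smaller the weight given to each tree at each iteration, so more trees are typically needed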

In [25]:
# random_state is fixed so that repeated runs fit the same model
model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=5,
    random_state=101,
)
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=5, n_estimators=200,
                          random_state=101)

#### Predict the target `price` using the testing data set

In [26]:
# Predicted prices for the held-out test features
y_predict = model.predict(X=X_test)

#### Display the R2 score

In [27]:
# R2 (coefficient of determination) of the predictions against the true test prices
r2_score(y_true=y_test, y_pred=y_predict)

0.9631043761861598