<a href="https://colab.research.google.com/github/bhushanmandava/Gradient-Boosting/blob/main/xgboost_regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost Regressor

## Part 1 - Data Preprocessing

### Importing the dataset

In [73]:
import numpy as np
import pandas as pd

In [74]:
dataset = pd.read_csv('insurance.csv')

### Checking missing data

In [75]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Handling categorical variables

In [76]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Sex column

In [77]:
dataset['sex'].unique()

array(['female', 'male'], dtype=object)

In [78]:
dataset['sex']=dataset['sex'].apply(lambda x : 0 if x=="female" else 1)

In [79]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


Smoker column

In [80]:
dataset['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [81]:
dataset['smoker']=dataset['smoker'].apply(lambda x: 0 if x=="no" else 1)

In [82]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


Region column

In [83]:
dataset['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [84]:
region_dummies =pd.get_dummies(dataset['region'],drop_first=True)

In [85]:
region_dummies

Unnamed: 0,northwest,southeast,southwest
0,False,False,True
1,False,True,False
2,False,True,False
3,True,False,False
4,True,False,False
...,...,...,...
1333,True,False,False
1334,False,False,False
1335,False,True,False
1336,False,False,True


In [86]:
dataset =pd.concat([region_dummies,dataset],axis=1)

In [87]:
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,False,False,True,19,0,27.9,0,1,southwest,16884.924
1,False,True,False,18,1,33.77,1,0,southeast,1725.5523
2,False,True,False,28,1,33.0,3,0,southeast,4449.462
3,True,False,False,33,1,22.705,0,0,northwest,21984.47061
4,True,False,False,32,1,28.88,0,0,northwest,3866.8552


In [88]:
dataset = dataset.loc[:,~dataset.columns.duplicated()]


In [89]:
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,False,False,True,19,0,27.9,0,1,southwest,16884.924
1,False,True,False,18,1,33.77,1,0,southeast,1725.5523
2,False,True,False,28,1,33.0,3,0,southeast,4449.462
3,True,False,False,33,1,22.705,0,0,northwest,21984.47061
4,True,False,False,32,1,28.88,0,0,northwest,3866.8552


In [90]:
dataset = dataset.drop('region',axis=1)

In [91]:
dataset.head()

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,False,False,True,19,0,27.9,0,1,16884.924
1,False,True,False,18,1,33.77,1,0,1725.5523
2,False,True,False,28,1,33.0,3,0,4449.462
3,True,False,False,33,1,22.705,0,0,21984.47061
4,True,False,False,32,1,28.88,0,0,3866.8552


### Creating the Training Set and the Test Set

Getting the inputs and output

In [92]:
x=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1].values

Getting the Training Set and the Test Set

In [93]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

## Part 2 - Building and training the model

### Building the model

In [94]:
import xgboost as xgb

### Training the model

In [95]:
model =xgb.XGBRegressor(max_depth = 2, learning_rate = 0.1, n_estimators = 100)
model.fit(x_train,y_train)

### Inference

In [96]:
y_pred =model.predict(x_test)

## Part 3: Evaluating the model

### R-Squared

In [97]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.903036622417214

### Adjusted R-Squared

In [98]:
k=x.shape[1]
n=x.shape[0]
adjr2= 1-(1-r2_score(y_test,y_pred))*((n-1)/(n-k-1))
adjr2

0.9024529452007639

### k-Fold Cross Validation

In [99]:
from sklearn.model_selection import cross_val_score
r2s = cross_val_score(estimator = model,
                      X = x,
                      y = y,
                      scoring = 'r2',
                      cv = 10)
print("Average R-Squared: {:.3f}".format(r2s.mean()))
print("Standard Deviation: {:.3f}".format(r2s.std()))

Average R-Squared: 0.861
Standard Deviation: 0.043
