<a href="https://colab.research.google.com/github/Yogi-Puvvala/Machine_Learning/blob/main/Gradient_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Gradient Boost (Classifier)**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("drive/MyDrive/Colab_Projects/telecom.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.shape

(7043, 21)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [7]:
X = df.drop("Churn", axis = 1)
y = df["Churn"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
nominal_cols = [col for col in X.columns if df[col].dtype == "O"]
numerical_cols = [col for col in X.columns if (df[col].dtype == "int64" or df[col].dtype == "float64")]

In [10]:
preprocessor = ColumnTransformer([
    ("num", "passthrough", numerical_cols),
    ("nom", OneHotEncoder(handle_unknown="ignore", sparse_output=False), nominal_cols)
])

In [11]:
gbc = Pipeline([
    ("preprocess", preprocessor),
    ("model", GradientBoostingClassifier())
])

gbc.fit(X_train, y_train)

In [12]:
print("Training Score:", gbc.score(X_train, y_train))
print("Testing Score:", gbc.score(X_test, y_test))

Training Score: 0.8132765353212638
Testing Score: 0.8161816891412349


In [13]:
print("Classification report:", classification_report(y_test, gbc.predict(X_test)))

Classification report:               precision    recall  f1-score   support

          No       0.84      0.92      0.88      1036
         Yes       0.71      0.52      0.60       373

    accuracy                           0.82      1409
   macro avg       0.78      0.72      0.74      1409
weighted avg       0.81      0.82      0.81      1409



# **Gradient Boost (Regressor)**

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

In [15]:
df = pd.read_csv("drive/MyDrive/Colab_Projects/insurance.csv")

In [16]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
df.shape

(1338, 7)

In [18]:
df.isna().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [20]:
X = df.drop("charges", axis =1)
y = df["charges"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [22]:
nominal_cols = [col for col in X.columns if df[col].dtype == "O"]
numerical_cols = [col for col in X.columns if (df[col].dtype == "int64" or df[col].dtype == "float64")]

In [23]:
preprocessor = ColumnTransformer([
    ("num", "passthrough", numerical_cols),
    ("nom", OneHotEncoder(handle_unknown="ignore", sparse_output=False), nominal_cols)
])

In [24]:
gbr = Pipeline([
    ("preprocess", preprocessor),
    ("model", GradientBoostingRegressor())
])

gbr.fit(X_train, y_train)

In [25]:
print("Training Score:", gbr.score(X_train, y_train))
print("Testing Score:", gbr.score(X_test, y_test))

Training Score: 0.9205785682930007
Testing Score: 0.7844804336542883


In [26]:
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1],
    'model__max_depth': [3, 5],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__subsample': [0.8, 1.0],
    'model__max_features': ['sqrt', 'log2']
}

gscv = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, scoring='r2', verbose=2, n_jobs=-1)
gscv.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [27]:
print("Training Score:", gscv.score(X_train, y_train))
print("Testing Score:", gscv.score(X_test, y_test))

Training Score: 0.9058211639259431
Testing Score: 0.7875412332350964
