# AdaBoost and GradientBoost Classifier


# Input packages and datasets

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score

Using Kaggle datasets https://www.kaggle.com/benroshan/factors-affecting-campus-placement

In [4]:
df = pd.read_csv('Placement_Data_Full_Class.csv', header=None, sep=',')

# Preprocessing


Checking datasets

In [5]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
1,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55,Mkt&HR,58.8,Placed,270000
2,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000
3,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75,Mkt&Fin,57.8,Placed,250000
4,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66,Mkt&HR,59.43,Not Placed,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91,Mkt&Fin,74.49,Placed,400000
212,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74,Mkt&Fin,53.62,Placed,275000
213,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59,Mkt&Fin,69.72,Placed,295000
214,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70,Mkt&HR,60.23,Placed,204000


In [6]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,216,216,216.0,216,216.0,216,216,216.0,216,216,216,216,216.0,216,149
unique,216,3,104.0,3,98.0,3,4,90.0,4,3,101,3,206.0,3,46
top,41,M,62.0,Central,63.0,Others,Commerce,65.0,Comm&Mgmt,No,60,Mkt&Fin,56.7,Placed,300000
freq,1,139,11.0,116,14.0,131,113,20.0,145,141,14,120,3.0,148,22


Changing features name 

In [7]:
new_header = df.iloc[0] 
df = df[1:]
df.columns = new_header
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


Checking null

Pada dataset ini ditemukan null untuk bagian salary, namun untuk saat ini kami tidak melakukan klasifikasi pada salary karena AdaBoost dan GradientBoost classifier dinilai model kurang tepat untuk memperkirakan salary, sehingga hanya untuk mengklasifikasikan status 'placed / not placed'

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sl_no           215 non-null    object
 1   gender          215 non-null    object
 2   ssc_p           215 non-null    object
 3   ssc_b           215 non-null    object
 4   hsc_p           215 non-null    object
 5   hsc_b           215 non-null    object
 6   hsc_s           215 non-null    object
 7   degree_p        215 non-null    object
 8   degree_t        215 non-null    object
 9   workex          215 non-null    object
 10  etest_p         215 non-null    object
 11  specialisation  215 non-null    object
 12  mba_p           215 non-null    object
 13  status          215 non-null    object
 14  salary          148 non-null    object
dtypes: object(15)
memory usage: 25.3+ KB


Drop kolom "s1_no" yang merupakan index, dan "salary" karena tidak diperlukan

In [9]:
df = df.drop(columns='sl_no')

In [10]:
df = df.drop(columns='salary')

Changing categorical data

Data yang dirubah dapat dilihat pada kode dibawah

In [11]:
df['gender'] = df['gender'].replace(['M','F'],['1','0'])

In [12]:
df['hsc_s'] = df['hsc_s'].replace(['Commerce','Science','Arts'],['1','2','3'])

In [13]:
df['hsc_b'] = df['hsc_b'].replace(['Others','Central'],['0','1'])

In [14]:
df['ssc_b'] = df['ssc_b'].replace(['Others','Central'],['0','1'])

In [15]:
df['degree_t'] = df['degree_t'].replace(['Comm&Mgmt','Sci&Tech','Others'],['1','2','3'])

In [16]:
df['workex'] = df['workex'].replace(['Yes','No'],['1','0'])

In [17]:
df['specialisation'] = df['specialisation'].replace(['Mkt&HR','Mkt&Fin'],['1','2'])

Melihat data setelah diperbaiki

In [18]:
df

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,67.00,0,91.00,0,1,58.00,2,0,55,1,58.8,Placed
1,1,79.33,1,78.33,0,2,77.48,2,1,86.5,2,66.28,Placed
2,1,65.00,1,68.00,1,3,64.00,1,0,75,2,57.8,Placed
3,1,56.00,1,52.00,1,2,52.00,2,0,66,1,59.43,Not Placed
4,1,85.80,1,73.60,1,1,73.30,1,0,96.8,2,55.5,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,0,82.00,0,1,77.60,1,0,91,2,74.49,Placed
211,1,58.00,0,60.00,0,2,72.00,2,0,74,2,53.62,Placed
212,1,67.00,0,67.00,0,1,73.00,1,1,59,2,69.72,Placed
213,0,74.00,0,66.00,0,1,58.00,1,0,70,1,60.23,Placed


# Define X and Y

In [19]:
dfx = df.drop(columns='status')

In [20]:
dfy = df['status']

# Split Train and Test Data

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(dfx, dfy, test_size=0.30, random_state=0)

# Modeling 

AdaBoost Classifier default parameters


In [32]:
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)
y_pred = abc.predict(X_test)
accuracy_score(y_test, y_pred)

0.8153846153846154

In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Not Placed       0.73      0.58      0.65        19
      Placed       0.84      0.91      0.87        46

    accuracy                           0.82        65
   macro avg       0.79      0.75      0.76        65
weighted avg       0.81      0.82      0.81        65



Parameters tuning with gridsearchcv

In [34]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV

model = AdaBoostClassifier(random_state=100)
params = {
    'n_estimators':(50,100),
    'learning_rate':[0.5, 1]
}
gs = GridSearchCV(model, params)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_test)
print('Best parameter: ', gs.best_params_)

Best parameter:  {'learning_rate': 1, 'n_estimators': 100}


AdaBoost Classifier with best parameter

In [36]:
abc = AdaBoostClassifier(learning_rate=1, n_estimators=100)
abc.fit(X_train, y_train)
y_pred = abc.predict(X_test)
accuracy_score(y_test, y_pred)

0.8153846153846154

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Not Placed       0.73      0.58      0.65        19
      Placed       0.84      0.91      0.87        46

    accuracy                           0.82        65
   macro avg       0.79      0.75      0.76        65
weighted avg       0.81      0.82      0.81        65



GradientBoost with default parameters

In [38]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7846153846153846

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Not Placed       0.65      0.58      0.61        19
      Placed       0.83      0.87      0.85        46

    accuracy                           0.78        65
   macro avg       0.74      0.72      0.73        65
weighted avg       0.78      0.78      0.78        65



Parameters tuning

In [41]:
model = GradientBoostingClassifier(random_state=100)
params = {
        'n_estimators':range(1,130),
        'learning_rate':[0.001, 0.01, 0.1, 0.2, 0.5, 1]
}
gs = GridSearchCV(model, params)
gs.fit(X_train, y_train)
print('Best parameter: ', gs.best_params_)

Best parameter:  {'learning_rate': 0.5, 'n_estimators': 38}


GradientBoost with best parameters


In [28]:
gbc = GradientBoostingClassifier(learning_rate=0.5, n_estimators=38)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
accuracy_score(y_test, y_pred)

0.8

In [40]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Not Placed       0.65      0.58      0.61        19
      Placed       0.83      0.87      0.85        46

    accuracy                           0.78        65
   macro avg       0.74      0.72      0.73        65
weighted avg       0.78      0.78      0.78        65



# Summary

Dengan hasil diatas maka dapat diambil kesimpulan bahwa 81.5% data dapat dijelaskan oleh model Adaboost Classifier dan 80% data dapat dijelaskan oleh model GradientBoost Classifier. Sisanya merupakan error atau pengaruh lainnya.