# Ensemble Methods and Random Forests

Placement Data Full Class by Ben Roshan

https://www.kaggle.com/benroshan/factors-affecting-campus-placement

In [1]:
import pandas as pd
import os 

In [2]:
data = pd.read_csv("Placement_Data_Full_Class.csv")
data

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [5]:
data.nunique()

sl_no             215
gender              2
ssc_p             103
ssc_b               2
hsc_p              97
hsc_b               2
hsc_s               3
degree_p           89
degree_t            3
workex              2
etest_p           100
specialisation      2
mba_p             205
status              2
salary             45
dtype: int64

In [7]:
data.describe()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,62.209324,10.827205,10.897509,7.358743,13.275956,5.833385,93457.45242
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [8]:
data.dtypes

sl_no               int64
gender             object
ssc_p             float64
ssc_b              object
hsc_p             float64
hsc_b              object
hsc_s              object
degree_p          float64
degree_t           object
workex             object
etest_p           float64
specialisation     object
mba_p             float64
status             object
salary            float64
dtype: object

# Data Cleaning

In [13]:
dropped_columns = data.copy()

dropped_columns = dropped_columns.drop(["sl_no", "salary"], axis=1)
dropped_columns

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed


Label Encoder

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

label_encoded_data = dropped_columns.copy()

columns_to_label_encode = ["gender", "workex", "ssc_b", "hsc_b", "specialisation"]

label_encoded_data[columns_to_label_encode] = label_encoded_data[columns_to_label_encode].apply(le.fit_transform)

label_encoded_data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,67.00,1,91.00,1,Commerce,58.00,Sci&Tech,0,55.0,1,58.80,Placed
1,1,79.33,0,78.33,1,Science,77.48,Sci&Tech,1,86.5,0,66.28,Placed
2,1,65.00,0,68.00,0,Arts,64.00,Comm&Mgmt,0,75.0,0,57.80,Placed
3,1,56.00,0,52.00,0,Science,52.00,Sci&Tech,0,66.0,1,59.43,Not Placed
4,1,85.80,0,73.60,0,Commerce,73.30,Comm&Mgmt,0,96.8,0,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,1,82.00,1,Commerce,77.60,Comm&Mgmt,0,91.0,0,74.49,Placed
211,1,58.00,1,60.00,1,Science,72.00,Sci&Tech,0,74.0,0,53.62,Placed
212,1,67.00,1,67.00,1,Commerce,73.00,Comm&Mgmt,1,59.0,0,69.72,Placed
213,0,74.00,1,66.00,1,Commerce,58.00,Comm&Mgmt,0,70.0,1,60.23,Placed


In [15]:
hot_encoded_data = label_encoded_data.copy()

hot_encoded_data_y_placeholder = hot_encoded_data["status"]
hot_encoded_data = hot_encoded_data.drop("status", axis=1) # Removes the prediction column so that we don't encode it

hot_encoded_data = pd.get_dummies(hot_encoded_data,drop_first=True)

hot_encoded_data = pd.concat([hot_encoded_data, hot_encoded_data_y_placeholder], axis=1)
hot_encoded_data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech,status
0,1,67.00,1,91.00,1,58.00,0,55.0,1,58.80,1,0,0,1,Placed
1,1,79.33,0,78.33,1,77.48,1,86.5,0,66.28,0,1,0,1,Placed
2,1,65.00,0,68.00,0,64.00,0,75.0,0,57.80,0,0,0,0,Placed
3,1,56.00,0,52.00,0,52.00,0,66.0,1,59.43,0,1,0,1,Not Placed
4,1,85.80,0,73.60,0,73.30,0,96.8,0,55.50,1,0,0,0,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,1,82.00,1,77.60,0,91.0,0,74.49,1,0,0,0,Placed
211,1,58.00,1,60.00,1,72.00,0,74.0,0,53.62,0,1,0,1,Placed
212,1,67.00,1,67.00,1,73.00,1,59.0,0,69.72,1,0,0,0,Placed
213,0,74.00,1,66.00,1,58.00,0,70.0,1,60.23,1,0,0,0,Placed


Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_data = hot_encoded_data.copy()
scaled_data = scaled_data.iloc[:, :-1]
scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data),columns=scaled_data.columns)

scaled_data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech
0,0.739434,-0.028087,1.082459,2.268812,0.800763,-1.140102,-0.724446,-1.291091,1.123903,-0.597647,0.950082,-0.856663,-0.23221,1.626059
1,0.739434,1.113369,-0.923823,1.103448,0.800763,1.513267,1.380364,1.087157,-0.889757,0.687620,-1.052541,1.167321,-0.23221,1.626059
2,0.739434,-0.213238,-0.923823,0.153313,-1.248809,-0.322843,-0.724446,0.218908,-0.889757,-0.769474,-1.052541,-0.856663,-0.23221,-0.614984
3,0.739434,-1.046417,-0.923823,-1.318339,-1.248809,-1.957362,-0.724446,-0.460592,1.123903,-0.489396,-1.052541,1.167321,-0.23221,1.626059
4,0.739434,1.712332,-0.923823,0.668391,-1.248809,0.943909,-0.724446,1.864806,-0.889757,-1.164676,0.950082,-0.856663,-0.23221,-0.614984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0.739434,1.230940,1.082459,1.441008,0.800763,1.529612,-0.724446,1.426906,-0.889757,2.098321,0.950082,-0.856663,-0.23221,-0.614984
211,0.739434,-0.861266,1.082459,-0.582513,0.800763,0.766836,-0.724446,0.143408,-0.889757,-1.487711,-1.052541,1.167321,-0.23221,1.626059
212,0.739434,-0.028087,1.082459,0.061335,0.800763,0.903046,1.380364,-0.989091,-0.889757,1.278706,0.950082,-0.856663,-0.23221,-0.614984
213,-1.352386,0.619941,1.082459,-0.030644,0.800763,-1.140102,-0.724446,-0.158592,1.123903,-0.351934,0.950082,-0.856663,-0.23221,-0.614984


# Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

X = scaled_data
y = hot_encoded_data_y_placeholder


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
X_train

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech
122,-1.352386,-0.074375,-0.923823,0.042939,-1.248809,0.399070,1.380364,0.626607,-0.889757,1.498644,-1.052541,-0.856663,-0.23221,-0.614984
21,-1.352386,1.082819,1.082459,0.889139,0.800763,2.537565,-0.724446,1.728906,-0.889757,1.165300,0.950082,-0.856663,-0.23221,-0.614984
172,0.739434,0.527366,1.082459,-0.766470,0.800763,-1.412522,-0.724446,0.898407,1.123903,-1.656102,0.950082,-0.856663,-0.23221,-0.614984
184,-1.352386,-1.020496,1.082459,-0.322215,0.800763,-0.896287,-0.724446,-0.913591,1.123903,-0.857106,0.950082,-0.856663,-0.23221,-0.614984
7,0.739434,1.360545,-0.923823,-0.214600,-1.248809,-0.050423,1.380364,-0.385092,-0.889757,-0.023744,-1.052541,1.167321,-0.23221,1.626059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,0.739434,-0.490964,1.082459,-0.306578,0.800763,-0.322843,-0.724446,-0.385092,-0.889757,-0.901781,0.950082,-0.856663,-0.23221,-0.614984
16,0.739434,-0.398389,-0.923823,-0.012248,-1.248809,-0.104907,1.380364,-0.913591,-0.889757,0.044987,0.950082,-0.856663,-0.23221,-0.614984
214,0.739434,-0.490964,-0.923823,-0.766470,0.800763,-1.821152,-0.724446,1.275907,1.123903,-0.353652,-1.052541,1.167321,-0.23221,-0.614984
77,0.739434,-0.305813,1.082459,1.257052,0.800763,-0.186633,1.380364,-0.234092,-0.889757,-0.795248,-1.052541,1.167321,-0.23221,1.626059


## Run ML Algorithms

### Build Ensemble Predictor

In [19]:
# Instantiating and configuring our Ensemble Classifier

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_for_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(estimators = [('lr', log_clf), ('rf', rnd_for_clf), ('sc', svm_clf)], voting="hard")

In [20]:
# Training the Ensemble classifier

voting_clf.fit(X_train, y_train)

## Measure the accuracy of our classifier

In [21]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_for_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8169014084507042
RandomForestClassifier 0.8028169014084507
SVC 0.8169014084507042
VotingClassifier 0.8028169014084507


## Bagging and Pasting in skLearn

In [22]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1) # Bootstrap determines whether this is bagging or pasting
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

## Random Forests

In [23]:
## Create a random forest classifier and see the importance of each variable

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rf_clf.fit(X_train, y_train)

for name, score in zip(hot_encoded_data.iloc[:, :-1].columns, rf_clf.feature_importances_):
    print(name, ": ", score)


gender :  0.024117190987287727
ssc_p :  0.2780759496930317
ssc_b :  0.015521553480713862
hsc_p :  0.1901319502811814
hsc_b :  0.013140995910865066
degree_p :  0.17663465594771147
workex :  0.03777549408343003
etest_p :  0.07704888518088979
specialisation :  0.03561867387427166
mba_p :  0.10501615332696412
hsc_s_Commerce :  0.012011663394785514
hsc_s_Science :  0.013130973457615708
degree_t_Others :  0.006866777253564753
degree_t_Sci&Tech :  0.014909083127687295


## AdaBoost

In [24]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200, 
    algorithm="SAMME.R", learning_rate=0.5)

ada_clf.fit(X_train, y_train)

y_pred = ada_clf.predict(X_test)

accuracy_score(y_test, y_pred)

0.8591549295774648

## Gradient Boosting

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120)

gbrt.fit(X_train, y_train)

y_pred = gbrt.predict(X_test)

accuracy_score(y_test, y_pred)

0.8309859154929577