# Exercise - Ensemble

In this exercise, we will focus on underage drinking. The data set contains data about high school students. Each row represents a single student. The columns include the characteristics of deidentified students. This is a binary classification task: predict whether a student drinks alcohol or not (this is the **alc** column: 1=Yes, 0=No). This is an important prediction task to detect underage drinking and deploy intervention techniques. 

## Description of Variables

The descriptions of the variables are provided in "Alcohol - Data Dictionary.docx".

## Goal

Use the **alcohol.csv** data set and build a model to predict **alc**. 

# Read and Prepare the Data

In [1]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [2]:
# Load the student records; the target to predict is the binary "alc" column (1=Yes, 0=No).

alcohol = pd.read_csv("alcohol.csv")
alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc
0,18,2,1,4,2,0,5,4,2,5,2,M,1
1,18,4,3,1,0,0,4,4,2,3,9,M,1
2,15,4,3,2,3,0,5,3,4,5,0,F,0
3,15,3,3,1,4,0,4,3,3,3,10,F,0
4,17,3,2,1,2,0,5,3,5,5,2,M,1


# Split data (train/test)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. No random_state is passed, so the split
# presumably depends on the global np.random.seed(42) set in the first cell — TODO confirm.
train, test = train_test_split(alcohol, test_size=0.3)

# Data Prep

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Separate the target variable 

In [5]:
# Target vectors: the binary 'alc' label of each split.
train_y, test_y = train['alc'], test['alc']

# Feature frames: every column except the label.
train_inputs, test_inputs = (split.drop(columns=['alc']) for split in (train, test))

## Feature Engineering: Derive a new column

Examples:
- Ratio of study time to travel time
- Student is younger than 18 or not
- Average of father's and mother's level of education
- (etc.)

In [6]:
def new_col(df):
    """Derive the study-time to travel-time ratio as a one-column DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'studytime' and 'traveltime' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A single column, 'studytm_traveltm', on the same index as ``df``.
        A 0/0 result (NaN) becomes 0 and an x/0 result (+infinity) becomes 1.
    """
    ratio = df['studytime'] / df['traveltime']

    # 0/0 generates NaN: treat it as a ratio of 0.
    ratio = ratio.fillna(0)

    # A value divided by 0 generates +infinity: replace it with 1.
    # replace() returns a new Series; assigning the result avoids the chained
    # inplace call on a column selection, which is deprecated in pandas 2.x and
    # does not update the frame under copy-on-write.
    ratio = ratio.replace(np.inf, 1)

    # To inspect the inputs next to the ratio while debugging, return
    # df.assign(studytm_traveltm=ratio) instead.
    return ratio.to_frame('studytm_traveltm')
    

In [7]:
# Send train set to the function we created
new_col(train)

Unnamed: 0,studytm_traveltm
12759,1.0
4374,1.5
8561,1.0
10697,2.0
19424,4.0
...,...
16850,1.0
6265,2.0
11284,1.5
860,1.5


In [8]:
#Check the new distribution

new_col(train).value_counts()

studytm_traveltm
1.000000            10810
2.000000             5423
3.000000             2824
1.500000             2430
0.500000              656
4.000000              491
0.666667              478
0.000000              429
1.333333               85
0.333333               63
0.750000               43
0.400000               15
0.600000               13
5.000000               13
0.250000               12
2.500000                9
0.200000                2
0.285714                2
1.250000                1
1.666667                1
dtype: int64

##  Identify the numeric, binary, and categorical columns

In [9]:
# Columns with a numeric dtype are routed to the numeric pipeline.
numeric_columns = list(train_inputs.select_dtypes(include=[np.number]).columns)

# Columns stored with the object dtype are treated as categorical.
categorical_columns = list(train_inputs.select_dtypes(include=['object']).columns)

In [10]:
numeric_columns

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences']

In [11]:
categorical_columns

['gender']

In [13]:
# Input columns that new_col reads to build the engineered ratio feature.
feat_eng_columns = ['studytime', 'traveltime']

# Pipeline

In [14]:
# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [15]:
# Categorical features: replace missing entries with a placeholder category,
# then one-hot encode. The placeholder is a string because the categorical
# columns hold strings: a numeric fill value such as 0 would leave a column of
# mixed strings and numbers, which OneHotEncoder cannot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [16]:
# Engineered feature: compute the ratio with new_col, then standardize it.
my_new_column = Pipeline(
    steps=[
        ('my_new_column', FunctionTransformer(new_col)),
        ('scaler', StandardScaler()),
    ]
)



In [18]:
# Route each column group to its own pipeline: numeric columns, categorical
# columns, and the two columns that feed the engineered ratio feature.
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('trans', my_new_column, feat_eng_columns)],
        remainder='drop')

# remainder='drop' discards any column not listed above. Using 'passthrough' instead is optional; you don't have to use it.

# Transform: fit_transform() for TRAIN

In [19]:
# Fit the preprocessor on the training inputs and transform them in one step.
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.66643886,  0.96597412,  0.90362635, ...,  1.        ,
         0.        , -0.6645619 ],
       [ 0.66643886, -0.93881619, -1.68666277, ...,  0.        ,
         1.        , -0.0495349 ],
       [ 0.66643886,  0.33104402,  0.04019664, ...,  0.        ,
         1.        , -0.6645619 ],
       ...,
       [ 0.66643886, -2.20867639, -2.55009248, ...,  1.        ,
         0.        , -0.0495349 ],
       [ 1.6195814 , -0.30388608, -1.68666277, ...,  0.        ,
         1.        , -0.0495349 ],
       [ 1.6195814 , -0.30388608, -2.55009248, ...,  0.        ,
         1.        , -0.6645619 ]])

In [20]:
train_x.shape

(23800, 14)

# Transform: transform() for TEST

In [21]:
# Apply the already-fitted preprocessor to the test inputs (transform only, no refit).
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.23984621,  0.33104402,  1.76705606, ...,  1.        ,
         0.        ,  0.5654921 ],
       [-1.23984621, -0.30388608,  0.04019664, ...,  0.        ,
         1.        , -1.2795889 ],
       [-0.28670367,  0.33104402,  0.04019664, ...,  0.        ,
         1.        ,  0.5654921 ],
       ...,
       [ 0.66643886, -0.30388608,  0.04019664, ...,  1.        ,
         0.        , -0.6645619 ],
       [-1.23984621, -0.93881619,  0.04019664, ...,  0.        ,
         1.        , -0.0495349 ],
       [-1.23984621,  0.96597412,  0.04019664, ...,  1.        ,
         0.        , -0.6645619 ]])

In [22]:
test_x.shape

(10200, 14)

# Calculate the Baseline

In [23]:
from sklearn.dummy import DummyClassifier

# Baseline model: uses the "most_frequent" strategy, so it ignores the features.
dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_y)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
# Baseline accuracy on the training split.
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_y, y_pred=dummy_train_pred)
print('Baseline Train Accuracy: {}'.format(baseline_train_acc))

Baseline Train Accuracy: 0.5234873949579832


In [26]:
# Baseline accuracy on the held-out test split.
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_y, y_pred=dummy_test_pred)
print('Baseline Test Accuracy: {}'.format(baseline_test_acc))

Baseline Test Accuracy: 0.5194117647058824


# Train a voting classifier 

In [27]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


# Three different base learners for the ensemble.
dtree_clf = DecisionTreeClassifier(max_depth=20)
log_clf = LogisticRegression(C=10, solver='lbfgs', max_iter=1000)
sgd_clf = SGDClassifier(tol=1e-3, max_iter=10000)

# Combine the base learners with voting='hard'.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('dt', dtree_clf),
        ('lr', log_clf),
        ('sgd', sgd_clf),
    ],
)

voting_clf.fit(train_x, train_y)

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
# Voting ensemble: accuracy on the training split.
train_y_pred = voting_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8343697478991596


In [30]:
# Voting ensemble: accuracy on the held-out test split.
test_y_pred = voting_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8200980392156862


In [33]:
from sklearn.metrics import confusion_matrix

In [34]:
confusion_matrix(test_y, test_y_pred)

array([[4490,  808],
       [1038, 3864]])

In [35]:
# Test accuracy of each base classifier next to the voting ensemble.
for clf in (dtree_clf, log_clf, sgd_clf, voting_clf):
    # train_y is already one-dimensional, so it is passed as-is, exactly like
    # every other fit call in this notebook. Series.ravel() is deprecated in
    # recent pandas releases and is not needed here.
    clf.fit(train_x, train_y)
    test_y_pred = clf.predict(test_x)
    print(clf.__class__.__name__, 'Test acc=', accuracy_score(test_y, test_y_pred))

DecisionTreeClassifier Test acc= 0.7552941176470588
LogisticRegression Test acc= 0.8201960784313725
SGDClassifier Test acc= 0.8163725490196079
VotingClassifier Test acc= 0.8186274509803921


# Train a bagging classifier

In [36]:
from sklearn.ensemble import BaggingClassifier 


#If you want to do pasting, change "bootstrap=False"
#n_jobs=-1 means use all CPU cores
#NOTE(review): bagging only soft-votes when the base estimator exposes predict_proba;
#SGDClassifier() with its default loss presumably does not, in which case this
#ensemble falls back to majority (hard) voting — confirm against the BaggingClassifier docs

bag_clf = BaggingClassifier( 
            SGDClassifier(), n_estimators=50, 
            max_samples=1000, bootstrap=True, n_jobs=-1) 

bag_clf.fit(train_x, train_y)

In [37]:
# Bagging ensemble: accuracy on the training split.
train_y_pred = bag_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8222689075630252


In [38]:
# Bagging ensemble: accuracy on the held-out test split.
test_y_pred = bag_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8209803921568627


In [39]:
# Random Patches: max_samples limits the rows and max_features limits the
# columns given to each base estimator.
bag_clf = BaggingClassifier(
    SGDClassifier(),
    n_estimators=50,
    max_samples=1000,
    max_features=10,
    bootstrap=True,
    n_jobs=-1,
)

bag_clf.fit(train_x, train_y)

In [40]:
# Random-patches ensemble: accuracy on the training split.
train_y_pred = bag_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8171428571428572


In [41]:
# Random-patches ensemble: accuracy on the held-out test split.
test_y_pred = bag_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8168627450980392


# Train a random forest classifier

In [43]:
from sklearn.ensemble import RandomForestClassifier 

# Random forest: 500 trees, each capped at depth 10, using all CPU cores.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    n_jobs=-1,
)

rnd_clf.fit(train_x, train_y)

In [44]:
# Random forest: accuracy on the training split.
train_y_pred = rnd_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8452941176470589


In [45]:
# Random forest: accuracy on the held-out test split.
test_y_pred = rnd_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8132352941176471


In [46]:
# Feature importances from the random forest, used to check that the columns chosen
# for feature engineering are worthwhile. The values presumably follow the
# preprocessor's column order (numeric, then one-hot gender, then the engineered
# ratio last) — TODO confirm by pairing them with feature names.
rnd_clf.feature_importances_

array([0.11723697, 0.26788547, 0.09740898, 0.08554249, 0.12043984,
       0.00208346, 0.0523202 , 0.02319567, 0.01780943, 0.02621952,
       0.03240912, 0.02275273, 0.02265291, 0.11204322])

In [47]:
# Round to two decimals
np.round(rnd_clf.feature_importances_,2)

array([0.12, 0.27, 0.1 , 0.09, 0.12, 0.  , 0.05, 0.02, 0.02, 0.03, 0.03,
       0.02, 0.02, 0.11])

# Train an adaboost classifier

In [48]:
from sklearn.ensemble import AdaBoostClassifier 


# AdaBoost over depth-5 decision trees: 500 estimators, learning rate 0.1.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=5),
    n_estimators=500,
    learning_rate=0.1,
)

ada_clf.fit(train_x, train_y)

In [49]:
# AdaBoost: accuracy on the training split.
train_y_pred = ada_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8819747899159663


In [50]:
# AdaBoost: accuracy on the held-out test split.
test_y_pred = ada_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8174509803921569


# Train a gradient boosting classifier

In [51]:
#Use GradientBoosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 100 depth-2 trees with a learning rate of 0.1.
gbclf = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=2,
    learning_rate=0.1,
)

gbclf.fit(train_x, train_y)

In [52]:
# Gradient boosting: accuracy on the training split.
train_y_pred = gbclf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8191176470588235


In [53]:
# Gradient boosting: accuracy on the held-out test split.
test_y_pred = gbclf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8115686274509804
