# Exercise - Ensemble

In this exercise, we will focus on underage drinking. The data set contains data about high school students. Each row represents a single student. The columns include the characteristics of deidentified students. This is a binary classification task: predict whether a student drinks alcohol or not (this is the **alc** column: 1=Yes, 0=No). This is an important prediction task to detect underage drinking and deploy intervention techniques. 

## Description of Variables

The descriptions of the variables are provided in "Alcohol - Data Dictionary.docx".

## Goal

Use the **alcohol.csv** data set and build a model to predict **alc**. 

# Read and Prepare the Data

In [None]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [None]:
# Load the data set. The target to predict is the "alc" column (1 = Yes, 0 = No).

alcohol = pd.read_csv("alcohol.csv")
alcohol.head()

In [None]:
# List the distinct raw values of "gender" before encoding it.
alcohol["gender"].unique()

In [None]:
# (number of rows, number of columns) of the raw data.
alcohol.shape

In [None]:
# Count the missing values in each column.
alcohol.isna().sum()

In [None]:
# Encode gender as a 0/1 column. Series.map() turns any value that is not a
# key of gender_dict (including differently-cased labels) into NaN.
gender_dict = {"M": 0, "F": 1}
alcohol["gender_binary"] = alcohol["gender"].map(gender_dict)
# Alternative that normalises the case before mapping, kept for reference:
# alcohol["gender_binary"] = alcohol["gender"].str.upper().map(gender_dict)
# NOTE(review): the original remark here read "Lower preferred" - presumably
# meaning lower-case keys with .str.lower(); confirm before switching.
alcohol.head()

# Split data (train/test)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set.
# random_state pins the shuffle so that re-running this cell (or running the
# cells out of order) always yields the same train/test partition. Relying
# only on the global np.random.seed() would reshuffle the rows on every
# re-execution, because each call consumes the global generator.
train, test = train_test_split(alcohol, test_size=0.3, random_state=42)

# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Separate the target variable 

In [None]:
# The label is the 'alc' column of each split.
train_target, test_target = train['alc'], test['alc']

# Model inputs: everything except the label and the raw 'gender' text column
# (its 0/1 encoding, gender_binary, stays in).
train_inputs = train.drop(columns=['alc', 'gender'])
test_inputs = test.drop(columns=['alc', 'gender'])

In [None]:
# train_target.head()
# test_target.head()

# train_inputs.head()
# test_inputs.head()


## Feature Engineering: Derive a new column

Examples:
- Ratio of study time to travel time
- Student is younger than 18 or not
- Average of father's and mother's level of education
- (etc.)

In [None]:
# Ratio of absences to age
def _absences_to_age(frame):
    """Return absences / age for every row of *frame* as a float array.

    Rows whose age is missing or not strictly positive get 0 instead of a
    ratio. A missing 'absences' value with a valid age stays NaN so that the
    downstream imputer can handle it.

    This is computed column-wise rather than with a row-wise ``apply``:
    it gives the same values, avoids a Python-level call per row, and also
    works on an empty frame.
    """
    age = frame['age']
    # The division is evaluated for all rows, but np.where only keeps it
    # where age > 0, so zero or missing ages never reach the result.
    return np.where(age > 0, frame['absences'] / age, 0)


# Same derivation for both splits so train and test stay consistent.
train_inputs['absences_to_age'] = _absences_to_age(train_inputs)
test_inputs['absences_to_age'] = _absences_to_age(test_inputs)

In [None]:
# # Ratio of absences to age
# train_inputs['absences_to_age'] = 
# train_inputs.apply(
#     lambda row: #1
#     row['absences'] / row['age'] #3a
#     if row['age'] > 0 #2
#     else 0, #3b <=
#     axis=1
# )
# test_inputs['absences_to_age'] = test_inputs.apply(
#     lambda row: row['absences'] / row['age'] if row['age'] > 0 else 0, axis=1
# )

In [None]:
# Preview the training inputs, including the derived absences_to_age column.
train_inputs.head()

In [None]:
# Preview the test inputs; they should have the same columns as the training inputs.
test_inputs.head()

##  Identify the numeric, binary, and categorical columns

In [None]:
# Every column with a numeric dtype. This also picks up gender_binary and
# absences_to_age, which is why a hand-corrected list is used for the pipeline.
numeric_columns = list(train_inputs.select_dtypes(include=[np.number]).columns)

# A dtype-based lookup cannot find the binary column: gender_binary holds
# 0/1 numbers rather than booleans, so binary columns are listed by hand.

# Every column stored as object dtype is treated as categorical.
categorical_columns = list(train_inputs.select_dtypes(include='object').columns)

In [None]:
# 0/1 columns, listed by hand; they are imputed but not scaled.
binary_columns = ["gender_binary"]

In [None]:
# Inspect what the dtype-based selection returned.
numeric_columns

In [None]:
# Numeric columns that go through the numeric pipeline. This is the
# dtype-based selection minus gender_binary and absences_to_age, which are
# handled by their own pipelines. The order is kept because it determines
# the column order of the transformed matrix.
numeric_corrected = [
    'age',
    'Medu', 'Fedu',
    'traveltime', 'studytime', 'failures',
    'famrel', 'freetime', 'goout',
    'health', 'absences',
]

In [None]:
# Inspect which columns were detected as categorical (object dtype).
categorical_columns

In [None]:
# Derived (feature-engineered) columns; they get their own pipeline.
feat_eng_columns = ["absences_to_age"]

# Pipeline

In [None]:
# Numeric columns: fill missing values with the column median, then
# standardise each column with StandardScaler.
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [None]:
# Categorical columns: replace missing values with an explicit placeholder
# category, then one-hot encode.
# The placeholder must be a string: filling text columns with the integer 0
# produces a column that mixes str and int, which OneHotEncoder refuses to
# encode. Categories first seen at transform time are encoded as all zeros
# (handle_unknown='ignore').
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Binary (0/1) columns: only fill missing values with the most frequent
# value; no scaling or encoding is applied.
binary_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'most_frequent'))
])

In [None]:
# Engineered columns: fill missing values with the column mean, then
# standardise with StandardScaler.
feature_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())])

In [None]:
# Route each group of columns to its own pipeline and concatenate the results
# in the order listed here.
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_corrected),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
        ('feat', feature_transformer, feat_eng_columns)
        ],
        remainder='passthrough')

# remainder='passthrough' is optional: it keeps any column not listed above unchanged instead of dropping it.

# Transform: fit_transform() for TRAIN

In [None]:
# Fit the preprocessor on the training inputs only (so that imputation and
# scaling statistics come from the training data) and transform them.
train_x = preprocessor.fit_transform(train_inputs)

train_x

In [None]:
# (rows, features) of the transformed training data.
train_x.shape

# Transform: transform() for TEST

In [None]:
# Transform the test data with the already-fitted preprocessor.
# transform() (not fit_transform()) is used so nothing is re-learned from the test set.
test_x = preprocessor.transform(test_inputs)

test_x

In [None]:
# (rows, features) of the transformed test data; the feature count should match train_x.
test_x.shape

# Calculate the Baseline

In [None]:
# Create and fit the dummy classifier.
# With strategy="most_frequent" it always predicts the most common class in
# train_target, which gives the accuracy any real model has to beat.
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_x, train_target)

In [None]:
from sklearn.metrics import accuracy_score

# Baseline predictions on the test data; they are scored in the next cell.
baseline_predictions = dummy_clf.predict(test_x)

In [None]:
# Calculate the baseline accuracy: share of test rows whose label equals the
# dummy classifier's prediction.
baseline_accuracy = accuracy_score(test_target, baseline_predictions)
print("Baseline test accuracy: ", baseline_accuracy)

# Train a voting classifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


# Three different kinds of base learners for the ensemble.
dtree = DecisionTreeClassifier(max_depth=20)
# `multi_class` is deliberately left at its default. The target is binary, and
# the parameter is deprecated in recent scikit-learn releases, where passing
# 'multinomial' only triggers a warning before the parameter is removed.
log_regress = LogisticRegression(solver='lbfgs', C=10, max_iter=1000)
sgd = SGDClassifier(max_iter=10000, tol=1e-3)

# Hard voting: each fitted model casts one vote with its predicted class
# label and the majority label wins.
voting_ensemble = VotingClassifier(
    estimators=[
        ('dt', dtree),
        ('lr', log_regress),
        ('sgd', sgd),
    ],
    voting='hard',
)

voting_ensemble.fit(train_x, train_target)

In [None]:
# Train accuracy of the voting ensemble (on data it was fitted on).

train_y_pred = voting_ensemble.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# Test accuracy of the voting ensemble (on the held-out rows).

test_y_pred = voting_ensemble.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print(f'Test acc: {test_acc}')

# Train a bagging classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging: 50 L1-penalised SGD classifiers, each fitted on a bootstrap sample
# (drawn with replacement) of the training rows, spread over 3 parallel jobs.
# Each sample holds up to 1000 rows. The cap at the number of training rows
# is needed because BaggingClassifier raises ValueError when an integer
# max_samples exceeds the number of rows available.
bag_clf = BaggingClassifier(
    SGDClassifier(penalty='l1'),
    n_estimators=50,
    max_samples=min(1000, train_x.shape[0]),
    bootstrap=True,
    n_jobs=3,
)

bag_clf.fit(train_x, train_target)

In [None]:
# Train accuracy of the bagging classifier.
# Reuses train_y_pred / train_acc, overwriting the previous model's values.

train_y_pred = bag_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# Test accuracy of the bagging classifier.
# Reuses test_y_pred / test_acc, overwriting the previous model's values.

test_y_pred = bag_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print(f'Test acc: {test_acc}')

# Train a random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier 

# Random forest of 500 trees, each limited to depth 10, built with 3 parallel jobs.
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=3) 

rnd_clf.fit(train_x, train_target)

In [None]:
# Train accuracy of the random forest.
# Reuses train_y_pred / train_acc, overwriting the previous model's values.

train_y_pred = rnd_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# Test accuracy of the random forest.
# Reuses test_y_pred / test_acc, overwriting the previous model's values.

test_y_pred = rnd_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print(f'Test acc: {test_acc}')

In [None]:
# Importance score of each transformed feature according to the fitted forest.
# The entries follow the column order of train_x (i.e. the preprocessor's output order).
rnd_clf.feature_importances_

In [None]:
# Same importances rounded to two decimals for easier reading.
np.round(rnd_clf.feature_importances_,2)

# Train an adaboost classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier 

# AdaBoost with depth-5 decision trees as the base learner:
# up to 500 boosting rounds, each contribution shrunk by learning_rate=0.1.
ada_clf = AdaBoostClassifier( 
            DecisionTreeClassifier(max_depth=5), n_estimators=500, 
            learning_rate=0.1) 

ada_clf.fit(train_x, train_target)

In [None]:
# Train accuracy of the AdaBoost classifier.
# Reuses train_y_pred / train_acc, overwriting the previous model's values.

train_y_pred = ada_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# Test accuracy of the AdaBoost classifier.
# Reuses test_y_pred / test_acc, overwriting the previous model's values.

test_y_pred = ada_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print(f'Test acc: {test_acc}')

# Train a gradient boosting classifier

In [None]:
# Gradient boosting: 100 shallow trees (depth 2), each contribution shrunk by learning_rate=0.1.

from sklearn.ensemble import GradientBoostingClassifier

gbclf = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=0.1) 

gbclf.fit(train_x, train_target)

In [None]:
# Train accuracy of the gradient boosting classifier.
# Reuses train_y_pred / train_acc, overwriting the previous model's values.

train_y_pred = gbclf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print(f'Train acc: {train_acc}')