# Q6
#### Boosting, Bagging, and Stacking— Ensemble Methods with sklearn and mlens

#### 0. Imports of libraries that allow the Python code to work

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy.stats import pearsonr 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
# from pydataset import data
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools
from mlxtend.classifier import EnsembleVoteClassifier
import warnings
from xgboost import XGBClassifier, plot_importance
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
from itertools import combinations

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk import word_tokenize        
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

[MLENS] backend: threading
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bharathkarumudi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### 1. Obtaining and cleaning data

In [2]:
dat = pd.read_csv("data/spam.csv", encoding='latin-1')
dat.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Let's start our cleaning:

In [3]:
dat = dat.loc[:, ['v1', 'v2']]
dat.rename(columns={'v1': 'y', 'v2': 'sms'}, inplace=True)
dat.y = dat.y.replace({'ham': 0, 'spam': 1})
dat.head()

Unnamed: 0,y,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


#### Onehot Encoding
Removing the stop words which do not add value, create ngrams and also filter the words that appears in almost all or very less apperance.

Some words in the English language, while necessary, don't contribute much to the meaning of a phrase. These words, such as "when", "had", "those" or "before", are called stop words and should be filtered out.

We can tokenize individual terms and generate what's called a bag of words model. You may notice this model has a glaring pitfall: it fails to capture the innate structure of human language. Under this model, the following sentences have the same feature vector although they convey dramatically different meanings.

Does steak taste delicious?
Steak does taste delicious.

Alternatively, we can tokenize every sequence of n terms called n-grams. For example, tokenizing adjacent pairs of words yields bigrams. The n

-gram model preserves word order and can potentially capture more information than the bag of words model.

To get the best of both worlds, let's tokenize unigrams and bigrams. As an example, unigrams and bigrams for "The quick brown fox" are "The", "quick", "brown", "fox", "The quick", "quick brown" and "brown fox".


In [4]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]


def preprocess_txt(raw_text):
    
    processed = raw_text.str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr')
    processed = processed.str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr')
    processed = processed.str.replace(r'£|\$', 'moneysymb')    
    processed = processed.str.replace(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr')
    processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

    processed = processed.str.replace(r'[^\w\d\s]', ' ')
    processed = processed.str.replace(r'\s+', ' ')
    processed = processed.str.replace(r'^\s+|\s+?$', '')

    processed = processed.str.lower()
    
    return processed
processed = preprocess_txt(dat.sms)

In [5]:
# One hot encode words
vec = TfidfVectorizer(tokenizer=LemmaTokenizer(),
                      stop_words='english', # Remove stop words like a, an, the, etc that do not add much value.
                      ngram_range=(1, 2),   # create unigrams and bigrams.
                      min_df=0.01,          # filter words that appear in less than 1% of records
                      max_df=0.99)          # filter words that appear in more than 99% of records.

X = vec.fit_transform(processed)

In [6]:
message_len = dat.sms.apply(len).values
message_len = message_len / max(message_len)

message_n_words = np.array([len(x) for x in dat.sms.str.split()])
message_n_words = message_n_words / max(message_n_words)

all_caps_freq = np.array([sum(1 for c in message if c.isupper()) for message in dat.sms]) / message_len

avg_word_len = []
for message in dat.sms.str.split():
    message_word_lens = []
    for word in message:
        message_word_lens.append(len(word))
    avg_word_len.append(np.mean(message_word_lens))
avg_word_len = np.array(avg_word_len)


corr_coef, p = pearsonr(message_len, message_n_words) # highly correlated


X = np.column_stack((X.todense(), message_n_words, all_caps_freq, avg_word_len))
X_column_names = vec.get_feature_names() + ['message_n_words', 'all_caps_freq', 'avg_word_len']
X = pd.DataFrame(X, columns=X_column_names)

y = dat.y

### 2. Split into training and test dataset

In [7]:
# Get index of message_n_words
message_n_words_index = [i for i, name in enumerate(X) if name == 'message_n_words']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1234, test_size=0.2)

In [9]:
message_n_words_train = X_train['message_n_words']
message_n_words_test = X_test['message_n_words']

### 3. Create Classifiers

In [10]:
seed = 1234
np.random.seed(seed)

# Create classifiers
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()

clf_array = [rf, et, knn, svc, rg]

for clf in clf_array:
    vanilla_scores = cross_val_score(clf, X_train, y_train, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, max_samples=0.4, max_features=10, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf, X_train, y_train, cv=10, n_jobs=-1)
    
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(clf.__class__.__name__, 
                                                              vanilla_scores.mean(), vanilla_scores.std()))
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(clf.__class__.__name__, 
                                                                        bagging_scores.mean(), bagging_scores.std()))

Mean of: 0.982, std: (+/-) 0.004 [RandomForestClassifier]
Mean of: 0.888, std: (+/-) 0.006 [Bagging RandomForestClassifier]

Mean of: 0.982, std: (+/-) 0.004 [ExtraTreesClassifier]
Mean of: 0.890, std: (+/-) 0.007 [Bagging ExtraTreesClassifier]

Mean of: 0.916, std: (+/-) 0.012 [KNeighborsClassifier]
Mean of: 0.888, std: (+/-) 0.007 [Bagging KNeighborsClassifier]

Mean of: 0.891, std: (+/-) 0.017 [SVC]
Mean of: 0.866, std: (+/-) 0.001 [Bagging SVC]

Mean of: 0.979, std: (+/-) 0.005 [RidgeClassifier]
Mean of: 0.866, std: (+/-) 0.001 [Bagging RidgeClassifier]



### Voting

In [11]:
# Set up voting
eclf = VotingClassifier(estimators=[('Random Forests', rf), ('Extra Trees', et), 
                                    ('KNeighbors', knn), ('SVC', svc), ('Ridge Classifier', rg)], voting='hard')

for clf, label in zip([rf, et, knn, svc, rg, eclf], ['Random Forest', 'Extra Trees', 
                                                     'KNeighbors', 'SVC', 'Ridge Classifier', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Mean: 0.983, std: (+/-) 0.004 [Random Forest]
Mean: 0.983, std: (+/-) 0.004 [Extra Trees]
Mean: 0.916, std: (+/-) 0.012 [KNeighbors]
Mean: 0.891, std: (+/-) 0.017 [SVC]
Mean: 0.979, std: (+/-) 0.005 [Ridge Classifier]
Mean: 0.984, std: (+/-) 0.004 [Ensemble]


#### Ensemble voting for bagging

In [12]:
# Set up ensemble voting for bagging


ebclf_array = [BaggingClassifier(clf, max_samples=0.25, max_features=10, random_state=seed) for clf in clf_array]

# ebclf_array = []
# for clf in clf_array:
#     ebclf_array.append(
#         BaggingClassifier(clf, max_samples=0.25, max_features=10, random_state=seed)
#     )

v_eclf = VotingClassifier(
    estimators=list(zip(
        [
            "Bagging Random Forest",
            "Bagging Extra Trees",
            "Bagging KNeighbors",
            "Bagging SVC",
            "Bagging Ridge Classifier",
            "Bagging Ensemble",
        ],
        ebclf_array,
    )),
    voting="hard",
)

ebclf_array.append(v_eclf)

for clf, label in zip(
    ebclf_array,
    [
        "Bagging Random Forest",
        "Bagging Extra Trees",
        "Bagging KNeighbors",
        "Bagging SVC",
        "Bagging Ridge Classifier",
        "Bagging Ensemble",
    ],
):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring="accuracy")
    print(
        "Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(
            scores.mean(), scores.std(), label
        )
    )

Mean: 0.886, std: (+/-) 0.005 [Bagging Random Forest]
Mean: 0.888, std: (+/-) 0.004 [Bagging Extra Trees]
Mean: 0.886, std: (+/-) 0.008 [Bagging KNeighbors]
Mean: 0.866, std: (+/-) 0.001 [Bagging SVC]
Mean: 0.866, std: (+/-) 0.001 [Bagging Ridge Classifier]
Mean: 0.876, std: (+/-) 0.005 [Bagging Ensemble]


#### Boosting classifiers

In [13]:
# Create boosting classifiers
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()
xgb_boost = XGBClassifier()

boost_array = [ada_boost, grad_boost, xgb_boost]

eclf = EnsembleVoteClassifier(clfs=[ada_boost, grad_boost, xgb_boost], voting='hard')

labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']

for clf, label in zip([ada_boost, grad_boost, xgb_boost, eclf], labels):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Mean: 0.982, std: (+/-) 0.005 [Ada Boost]
Mean: 0.985, std: (+/-) 0.005 [Grad Boost]
Mean: 0.983, std: (+/-) 0.006 [XG Boost]
Mean: 0.985, std: (+/-) 0.006 [Ensemble]


In [14]:
lr = LogisticRegression()
seed = 1075
ensemble = SuperLearner(scorer = accuracy_score, 
                        random_state=seed, 
                        folds=10,
                        verbose = 2)

# Build the first layer
ensemble.add([rf, et, knn, rg])
# Attach the final meta estimator
ensemble.add_meta(lr)

ensemble.fit(X_train, y_train)
preds = ensemble.predict(X_test)
print("Fit data:\n%r" % ensemble.data)
print("Accuracy score: {:.3f}".format(accuracy_score(preds, y_test)))


Fitting 2 layers
Processing layer-1             done | 00:00:04
Processing layer-2             done | 00:00:00
Fit complete                        | 00:00:04

Predicting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Predict complete                    | 00:00:00
Fit data:
                                   score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  extratreesclassifier         0.98     0.00  0.68  0.14  0.02  0.02
layer-1  kneighborsclassifier         0.92     0.02  0.08  0.05  0.35  0.08
layer-1  randomforestclassifier       0.98     0.01  0.31  0.03  0.01  0.01
layer-1  ridgeclassifier              0.98     0.01  0.05  0.01  0.00  0.00

Accuracy score: 0.984


### Conclusions

In [15]:
from itertools import combinations

names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']

def zip_stacked_classifiers(*args):
    to_zip = []
    for arg in args:
        combined_items = sum([list(map(list, combinations(arg, i))) for i in range(len(arg) + 1)], [])
        combined_items = filter(lambda x: len(x) > 0, combined_items)
        to_zip.append(combined_items)
    
    return zip(to_zip[0], to_zip[1])

stacked_clf_list = zip_stacked_classifiers(clf_array, names)

best_combination = [0.00, ""]

for clf in stacked_clf_list:
    
    ensemble = SuperLearner(scorer = accuracy_score, 
                            random_state = seed, 
                            folds = 10)
    ensemble.add(clf[0])
    ensemble.add_meta(lr)
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_test)
    accuracy = accuracy_score(preds, y_test)
    
    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = clf[1]
    
    print("Accuracy score: {:.3f} {}".format(accuracy, clf[1]))

print("\nBest stacking model is {} with accuracy of: {:.3f}".format(best_combination[1], best_combination[0]))

Accuracy score: 0.981 ['Random Forest']
Accuracy score: 0.980 ['Extra Trees']
Accuracy score: 0.922 ['KNeighbors']
Accuracy score: 0.880 ['SVC']
Accuracy score: 0.976 ['Ridge Classifier']
Accuracy score: 0.984 ['Random Forest', 'Extra Trees']
Accuracy score: 0.982 ['Random Forest', 'KNeighbors']
Accuracy score: 0.979 ['Random Forest', 'SVC']
Accuracy score: 0.981 ['Random Forest', 'Ridge Classifier']
Accuracy score: 0.978 ['Extra Trees', 'KNeighbors']
Accuracy score: 0.977 ['Extra Trees', 'SVC']
Accuracy score: 0.979 ['Extra Trees', 'Ridge Classifier']
Accuracy score: 0.922 ['KNeighbors', 'SVC']
Accuracy score: 0.976 ['KNeighbors', 'Ridge Classifier']
Accuracy score: 0.976 ['SVC', 'Ridge Classifier']
Accuracy score: 0.981 ['Random Forest', 'Extra Trees', 'KNeighbors']
Accuracy score: 0.981 ['Random Forest', 'Extra Trees', 'SVC']
Accuracy score: 0.978 ['Random Forest', 'Extra Trees', 'Ridge Classifier']
Accuracy score: 0.978 ['Random Forest', 'KNeighbors', 'SVC']
Accuracy score: 0.982 [

### References:
[1] https://medium.com/@rrfd/boosting-bagging-and-stacking-ensemble-methods-with-sklearn-and-mlens-a455c0c982de  
[2] Dataset: https://www.kaggle.com/uciml/sms-spam-collection-dataset/  