### Gradient Boosting

Ensemble learning method that takes an iterative approach to combining weak learners to create a strong learner by focusing on mistakes of prior iterations.

+ Also uses Decision Trees

+ Boosting samples with an increased weight on the examples it got wrong previously

+ Training done iteratively

+ Weighted voting for final prediction

+ Harder to tune, easier to overfit than Random Forests. This is because it obsesses over the examples it got wrong and it can get lost pursuing those outliers

Gradient boosting is more powerful and better performing if tuned properly. 

<!-- ![](MLModels.PNG) -->

<img src="MLModels.PNG" style="width:100px;height:100px"/>

![](GradientBoosting.PNG)

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
from sklearn.ensemble import GradientBoostingClassifier

print(dir(GradientBoostingClassifier))
print(help(GradientBoostingClassifier()))

['_SUPPORTED_LOSS', '__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_initialized', '_check_n_features', '_check_params', '_clear_state', '_compute_partial_dependence_recursion', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_get_tags', '_init_state', '_is_initialized', '_make_estimator', '_more_tags', '_raw_predict', '_raw_predict_init', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_resize_state', '_staged_raw_predict', '_validate_data', '_validate_estimator', '_validate_y', '_warn_mae_for_criterion', 'apply

#### Grid Search 

In [3]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

In [4]:
def train_GB(n_est,max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print("estimators: {} / Depth: {} / Learning rate: {} ----- / Precision: {} / Recall: {} / Accuracy:{}".format
         (n_est, max_depth, lr, round(precision,3), round(recall,3), round((y_pred==y_test).sum() / len(y_pred),3)))

In [5]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

  _warn_prf(average, modifier, msg_start, len(result))


estimators: 50 / Depth: 3 / Learning rate: 0.01 ----- / Precision: 0.0 / Recall: 0.0 / Accuracy:0.882
estimators: 50 / Depth: 3 / Learning rate: 0.1 ----- / Precision: 0.941 / Recall: 0.727 / Accuracy:0.962
estimators: 50 / Depth: 3 / Learning rate: 1 ----- / Precision: 0.88 / Recall: 0.886 / Accuracy:0.972
estimators: 50 / Depth: 7 / Learning rate: 0.01 ----- / Precision: 1.0 / Recall: 0.015 / Accuracy:0.883
estimators: 50 / Depth: 7 / Learning rate: 0.1 ----- / Precision: 0.883 / Recall: 0.856 / Accuracy:0.969
estimators: 50 / Depth: 7 / Learning rate: 1 ----- / Precision: 0.882 / Recall: 0.848 / Accuracy:0.969
estimators: 50 / Depth: 11 / Learning rate: 0.01 ----- / Precision: 1.0 / Recall: 0.015 / Accuracy:0.883
estimators: 50 / Depth: 11 / Learning rate: 0.1 ----- / Precision: 0.867 / Recall: 0.841 / Accuracy:0.966
estimators: 50 / Depth: 11 / Learning rate: 1 ----- / Precision: 0.898 / Recall: 0.864 / Accuracy:0.972
estimators: 50 / Depth: 15 / Learning rate: 0.01 ----- / Precisi

#### GradientBooster with GridSearchCV 

In [6]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_tfidf_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,410.000619,46.4529,0.218822,0.01504,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.965889,0.976661,0.971249,0.969452,0.969452,0.97054,0.00352,1
1,333.790758,1.099178,0.336874,0.030618,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.965889,0.979354,0.969452,0.96496,0.965858,0.969102,0.005352,2
3,484.656573,7.052641,0.264104,0.025317,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.962298,0.977558,0.966757,0.967655,0.968553,0.968564,0.004985,3
4,394.730912,24.799926,0.26558,0.039827,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.963196,0.973968,0.968553,0.967655,0.968553,0.968385,0.003426,4
0,218.16354,0.722731,0.385361,0.024286,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.964093,0.976661,0.969452,0.963163,0.96496,0.967666,0.004989,5


In [8]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [50, 100, 150], 
    j
    
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_count_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,448.122111,34.65827,0.159174,0.012591,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.963196,0.977558,0.968553,0.96496,0.971249,0.969103,0.005069,1
5,467.480928,1.669536,0.275208,0.006235,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964093,0.977558,0.968553,0.963163,0.971249,0.968923,0.005227,2
2,316.641597,4.780169,0.254718,0.00397,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.963196,0.979354,0.971249,0.964061,0.966757,0.968923,0.005922,3
4,293.547056,1.042354,0.265251,0.005926,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.965889,0.975763,0.968553,0.963163,0.97035,0.968744,0.004271,4
7,392.934939,3.738721,0.25811,0.023688,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.964093,0.976661,0.965858,0.963163,0.971249,0.968205,0.005072,5
