## Machine Learning

### Supervised vs unsupervised learning
### Cross validation and evaluation metrics

1. Holdout test set - how model will perform on unseen real world data
2. K- fold cross validation: Data is divided into k subsets and , one of k subsets is used as holdout set and k-1 are clubbed together for training
3. Accuracy = # predicted correctly/ total # of ovservations
4. Precision = # predicted as spam that are actually spam(true positives)/# perdicted as spam(ture positives+ false positives)
5. Recall = # predicted as spam that are actually spam(true positives)/# actually spam(true positives + false negatives)

### Random forest model

Ensemble learning model, creates multiple decision trees and combines their outcomes to produce beter results than any single model

#### Benefits
1. Can be used for classification or regression
2. Easily handles outliers and missing values, etc
3. Accepts various types of inputs(continuous, ordinal etc)
4. Less Likely to overfit
5. Outputs feature importance

### Building the basic model

In [2]:
# Read clean and vectorize the data
import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer


# Reading the raw file

pd.set_option('display.max_colwidth',100)
data = pd.read_csv("SMSSpamCollection.tsv", sep = '\t', header = None)
data.columns = ['label','body_text']
data.head()


# Cleaning the text

wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = ([word for word in tokens if word not in nltk.corpus.stopwords.words("English")])
    #text = [wn.lemmatize(word) for word in text]   
    text = [ps.stem(word) for word in text] 
    return(text)


# Counting the percennt of punctuations

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round((count/(len(text)-text.count(" "))),3)*100

# Counting the length
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))

# Adding punctuation percent column
data['body_per_punct'] = data['body_text'].apply(lambda x: count_punct(x)) 

data.head()

# Creating the TfIdf term document matrix
Tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_count = Tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'],data['body_per_punct'], pd.DataFrame(X_count.toarray())], axis = 1)


X_features.head()


Unnamed: 0,body_len,body_per_punct,0,1,2,3,4,5,6,7,...,8327,8328,8329,8330,8331,8332,8333,8334,8335,8336
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Explore Random forest classifier Attributes and Hyperparameters

In [3]:
from sklearn.ensemble import RandomForestClassifier


In [4]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0

### Important features
feature_importance, fit , predict

### Hyperparameters
max_depth, n_estimators

### Explore random forest classifier through cross validation


In [5]:
from sklearn.model_selection import KFold, cross_val_score

In [6]:
rf =  RandomForestClassifier(n_jobs = -1) # parallel processing: building the individual jobs in parallel
k_fold = KFold(n_splits=5)
cross_val_score(rf, X_features, data['label'], cv = k_fold, scoring = 'accuracy', n_jobs = -1)

array([0.97486535, 0.96768402, 0.97486535, 0.95867026, 0.97124888])

### Explore random forest classifier through Holdout set

In [7]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X_features, data['label'], test_size = 0.2) 

In [9]:
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, Y_train)


In [10]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse = True)

[(0.06327530214330236, 2017),
 (0.03878606408453409, 7577),
 (0.02959121840078095, 3349),
 (0.029156006732044264, 'body_len'),
 (0.022292717985557924, 6503),
 (0.021189055965959425, 6965),
 (0.018915676865765438, 392),
 (0.018653226124106387, 7689),
 (0.018434064115584816, 5294),
 (0.016880909378177246, 2384),
 (0.01669367914778382, 7247),
 (0.0165520706356305, 2244),
 (0.01525221238828935, 1112),
 (0.015084139607667104, 8013),
 (0.01422020842910521, 5009),
 (0.014063696776728689, 5945),
 (0.013278346634162462, 2511),
 (0.012551869404204606, 6207),
 (0.010740175502087912, 'body_per_punct'),
 (0.00995598298213791, 695),
 (0.009445533143461249, 3658),
 (0.009292329684735872, 4511),
 (0.008854100588432588, 1572),
 (0.008499358520544087, 294),
 (0.008324557126902112, 397),
 (0.007521493126467247, 6269),
 (0.007306754466829286, 7445),
 (0.00662561432947275, 2294),
 (0.0065115264153367625, 4485),
 (0.006319488517861582, 878),
 (0.006318302811504844, 7733),
 (0.006117122795669002, 4594),
 (0.

In [11]:
Y_pred = rf_model.predict(X_test)

precision,recall,fscore,support = score(Y_test, Y_pred, pos_label = 'spam', average = 'binary')

print('Precision:{} / Recall:{} / Accuracy: {} / fscore: {}'.format(round(precision,3), round(recall,3), 
                                                                    (Y_pred==Y_test).sum()/len(Y_pred), round(fscore,3)))

Precision:1.0 / Recall:0.573 / Accuracy: 0.9344703770197487 / fscore: 0.729


### Random Forest Model with grid search

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [13]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 1.0 / Recall: 0.263 / Accuracy: 0.89
Est: 10 / Depth: 20 ---- Precision: 1.0 / Recall: 0.623 / Accuracy: 0.943
Est: 10 / Depth: 30 ---- Precision: 0.985 / Recall: 0.766 / Accuracy: 0.963
Est: 10 / Depth: None ---- Precision: 0.985 / Recall: 0.796 / Accuracy: 0.968
Est: 50 / Depth: 10 ---- Precision: 1.0 / Recall: 0.305 / Accuracy: 0.896
Est: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.647 / Accuracy: 0.947
Est: 50 / Depth: 30 ---- Precision: 0.984 / Recall: 0.76 / Accuracy: 0.962
Est: 50 / Depth: None ---- Precision: 0.986 / Recall: 0.856 / Accuracy: 0.977
Est: 100 / Depth: 10 ---- Precision: 1.0 / Recall: 0.216 / Accuracy: 0.882
Est: 100 / Depth: 20 ---- Precision: 1.0 / Recall: 0.593 / Accuracy: 0.939
Est: 100 / Depth: 30 ---- Precision: 1.0 / Recall: 0.772 / Accuracy: 0.966
Est: 100 / Depth: None ---- Precision: 1.0 / Recall: 0.856 / Accuracy: 0.978


### Exploring parameter setting using grid search CV

In [14]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [16]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_features, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,33.382433,1.005753,0.419795,0.056072,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.975785,0.979354,0.975741,0.969452,0.972147,0.974497,0.003399,1
8,63.525799,1.492204,0.498793,0.06476,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978475,0.976661,0.973944,0.968553,0.973046,0.974138,0.003394,2
11,58.625582,0.772039,0.434049,0.053354,,300,"{'max_depth': None, 'n_estimators': 300}",0.977578,0.976661,0.975741,0.967655,0.973046,0.974138,0.003577,2
10,34.001541,1.952088,0.367624,0.029668,,150,"{'max_depth': None, 'n_estimators': 150}",0.975785,0.979354,0.974843,0.963163,0.972147,0.97306,0.005458,4
5,54.717784,1.637598,0.468078,0.060223,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.974888,0.974865,0.973944,0.966757,0.972147,0.972522,0.003049,5


### Gradient boosting In Python

1. Gradient boosting uses boosting whereas Randoom forest uses bagging
2. Bagging samples randomly whereas boosting samples based on an increase in weight of what it got wrong previously
3. Since all trees of random forest are trained independently they can be parallalized, whereas boosting is iterative
4. Random forest uses unweighted scoring, whereas gradient boosting does weighted voting for final prediction
5. Easier to tune, harder to overfit in case of random forest whereas gradient boosting is easier to overfit and harder to train, longer to train

Then why gradient boosting, more powerful when trained properly

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

print(GradientBoostingClassifier())
print(dir(GradientBoostingClassifier))


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
['_SUPPORTED_LOSS', '__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '

In [21]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [23]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        est, max_depth, lr, round(precision, 3), round(recall, 3), 
        round((y_pred==y_test).sum()/len(y_pred), 3)))

In [None]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

### Gradient Boostin with CV and parameters setting

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_tfidf_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

## Final Evaluation of Models

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [None]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

In [None]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))