In [None]:
# Building Machine Learning Classifiers: Building a basic Random Forest Model

In [None]:
# Read and clean text

In [None]:
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Download the NLTK stop-word corpus so the lookup on the next line can find it.
nltk.download('stopwords')
# English stop words; clean_text drops every token that appears in this list.
stopwords = nltk.corpus.stopwords.words('english')
# Porter stemmer; clean_text uses it to reduce each remaining token to its stem.
ps = nltk.PorterStemmer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the tab-separated SMS corpus. The file is read without a header row,
# so the two columns are named explicitly.
data = pd.read_csv(
    'SMSSpamCollection.tsv',
    sep='\t',
    header=None,
    names=['label', 'body_text'],
)

In [None]:
def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Args:
        text: the message to inspect.

    Returns:
        A float rounded to three decimals. ``0.0`` is returned for a message
        that is empty or consists only of spaces, instead of dividing by zero.
    """
    non_space_len = len(text) - text.count(' ')
    if non_space_len == 0:
        # Nothing but spaces (or nothing at all): there is no punctuation to measure.
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    return round((punct_count / non_space_len) * 100, 3)

In [None]:
# Message length with spaces excluded.
data['body_len'] = [len(msg) - msg.count(' ') for msg in data['body_text']]
# Share of the non-space characters that are punctuation, as a percentage.
data['punct'] = data['body_text'].map(count_punct)

In [None]:
# Preview the first rows, now including the engineered body_len and punct columns.
data.head()

Unnamed: 0,label,body_text,body_len,punct
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.688
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.082
3,ham,Even my brother is not like to speak with me. ...,62,3.226
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.143


In [None]:
def clean_text(text):
    """Normalise a raw message into a list of stemmed tokens.

    Steps: drop punctuation characters, lower-case the rest, split on runs of
    non-word characters, discard stop words and empty tokens, then stem what
    is left with the Porter stemmer.

    Args:
        text: the raw message.

    Returns:
        A list of stemmed tokens (possibly empty).
    """
    no_punct = ''.join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' when the text starts or ends with a separator; skip
    # those so the empty string does not become a feature of its own.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

# Vectorise the messages with TF-IDF, using clean_text as the analyzer.
tfidd_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidd_vect.fit_transform(data['body_text'])

# Combine the two engineered columns with the dense TF-IDF matrix.
x_features = pd.concat(
    [data['body_len'], data['punct'], pd.DataFrame(X_tfidf.toarray())],
    axis=1,
)
# The TF-IDF columns are labelled 0..n-1 (ints) while the engineered columns
# are strings; scikit-learn refuses to fit on mixed-type column labels, so
# make every label a string.
x_features.columns = x_features.columns.astype(str)

In [None]:
# Preview the combined feature matrix (body_len, punct, then one column per TF-IDF term).
x_features.head()

In [None]:
# Explore RandomForest Classifier Attributes and Hyperparameters

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Random forest with default hyperparameters, using every available core.
rf = RandomForestClassifier(n_jobs=-1)
# Five consecutive folds; each one serves as the hold-out set once.
k_fold = KFold(n_splits=5)
# Pass the splitter explicitly -- without cv=k_fold the object above is never
# used and cross_val_score falls back to its own default splitting strategy.
crossvalscore = cross_val_score(
    rf,
    x_features,
    data['label'],
    cv=k_fold,
    scoring='accuracy',
    n_jobs=-1,
)


In [None]:
# Accuracy score from each cross-validation fold.
crossvalscore

In [None]:
# Explore RandomForest Classifier through holdout set

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the messages for evaluation. The seed is fixed so that the
# split -- and therefore every metric reported below -- is the same on every
# Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_features,
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# 50 trees of depth <= 20, trained on all cores. random_state pins the
# bootstrap samples and feature sub-sampling so the fitted model (and the
# metrics derived from it) are reproducible.
RF = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1, random_state=42)
rf_model = RF.fit(X_train, Y_train)

In [None]:
# Ten most important features as (importance, column label) pairs.
# Sort on the importance only: sorting the whole tuple would fall back to
# comparing column labels whenever two importances tie, and str and int
# labels cannot be ordered against each other (TypeError).
sorted(
    zip(rf_model.feature_importances_, X_train.columns),
    key=lambda pair: pair[0],
    reverse=True,
)[0:10]

In [None]:
# Predict labels for the hold-out messages.
y_pred = rf_model.predict(X_test)

# Precision / recall / F-score / support, with 'spam' as the positive class.
precision, recall, fscore, support = score(
    Y_test,
    y_pred,
    pos_label='spam',
    average='binary',
)

In [None]:
# Report precision, recall and accuracy, each rounded to three decimals.
# Accuracy is the fraction of hold-out predictions that equal the true label.
print('Precision: {} / Recall:{} /Accuracy: {} '.format(
    *(
        round(metric, 3)
        for metric in (
            precision,
            recall,
            (y_pred == Y_test).sum() / len(y_pred),
        )
    )
))

In [None]:
# Key takeaways from the model's predictions: a precision of 1.0 means that every message placed in the spam folder is actually spam.
# A recall of 56% means that 56% of all the spam that arrived was correctly placed in the spam folder.
# An accuracy of 94.4% means that 94.4% of all incoming messages were correctly identified as spam or ham.


In [None]:
# Building Machine Learning classifiers: Explore Random Forest model with grid-search

In [None]:
# Grid-Search = Exhaustively search all parameter combinations in a given grid to determine the best model.

In [None]:
#Build our Grid-Search


In [None]:
def train_RF(n_eat, depth):
    """Fit a random forest on the training split and print its hold-out metrics.

    Reads X_train, Y_train, X_test and Y_test from the notebook namespace.

    Args:
        n_eat: number of trees, passed to the classifier as ``n_estimators``.
        depth: passed to the classifier as ``max_depth`` (may be ``None``).

    Returns:
        None. Precision, recall and accuracy are printed, rounded to three
        decimals, together with the two hyperparameters.
    """
    forest = RandomForestClassifier(n_estimators=n_eat, max_depth=depth, n_jobs=-1)
    predictions = forest.fit(X_train, Y_train).predict(X_test)

    # 'spam' is treated as the positive class; F-score and support are not reported.
    precision, recall, _fscore, _support = score(
        Y_test, predictions, pos_label='spam', average='binary'
    )
    accuracy = (predictions == Y_test).sum() / len(predictions)

    template = ' Est: {} / Depth: {}     ================Precision: {} / Recall:{} /Accuracy: {} '
    print(template.format(
        n_eat,
        depth,
        round(precision, 3),
        round(recall, 3),
        round(accuracy, 3),
    ))

In [None]:
# Try every (n_estimators, max_depth) combination, varying depth fastest.
for n_eat, depth in [(n, d) for n in [10, 50, 100] for d in [10, 20, 30, None]]:
    train_RF(n_eat, depth)

In [None]:
# Building Machine Learning Classifiers: Evaluate Random Forest with Grid search CV

In [None]:
#Cross validation: Divide a dataset into k-subsets and repeat the holdout method k times where a different subset is used as the holdout set in each iteration

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [None]:
# TF-IDF features: engineered columns plus the dense TF-IDF matrix.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct'], pd.DataFrame(X_tfidf.toarray())], axis=1)
# scikit-learn refuses to fit on a frame whose column labels mix str and int.
X_tfidf_feat.columns = X_tfidf_feat.columns.astype(str)

# Count Vectorizer features: engineered columns plus the dense term-count matrix.
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
# Build from X_count -- using X_tfidf here would make this frame a copy of
# X_tfidf_feat and the count vectorizer would never actually be evaluated.
X_count_feat = pd.concat([data['body_len'], data['punct'], pd.DataFrame(X_count.toarray())], axis=1)
X_count_feat.columns = X_count_feat.columns.astype(str)

In [None]:
# Preview the first rows of the count-vectorizer feature frame.
X_count_feat.head()

In [None]:
# Exploring parameter settings using Grid Search CV



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid to search: 3 forest sizes x 4 depth limits = 12 combinations.
param = {
    'n_estimators': [10, 150, 300],
    'max_depth': [30, 60, 90, None],
}
# Base estimator; the grid search overrides the two parameters above per candidate.
rf = RandomForestClassifier()
            

In [None]:
# Exhaustive search over `param` with 5-fold cross-validation on all cores.
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data['label'])
# Fitted attributes carry a trailing underscore: the results table is
# `cv_results_` (`cv_results` does not exist and raises AttributeError).
# Show the five best candidates by mean test score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

AttributeError: 'GridSearchCV' object has no attribute 'cv_results'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c110350c-d27f-445e-a2fe-f71c16e85564' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>