# Building Machine Learning Classifiers: Model selection

### Read in & clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['Label', 'Text']
data.head()

Unnamed: 0,Label,Text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,ham,"Nah I don't think he goes to usf, he lives aro..."
2,ham,Even my brother is not like to speak with me. ...
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnamin...


0

In [11]:
data.shape

(5567, 5)

In [2]:
# User defined function to get percentages of punctuations used
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

In [3]:
# Creating additional features
data['body_len'] = data['Text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['Text'].apply(lambda x: count_punct(x))

In [4]:
data.head()

Unnamed: 0,Label,Text,body_len,punct%
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
1,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
2,ham,Even my brother is not like to speak with me. ...,62,3.2
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1
4,ham,As per your request 'Melle Melle (Oru Minnamin...,135,4.4


In [8]:
# Creating a user defined function to perform all the above text cleaning in single fucntion
def clean_text(text):
    # Stripping white spaces before and after the text
    text = text.strip(" ")
    # Replacing multiple spaces with a single space
    text = re.sub("\s+"," ", text)
    # Replacing punctuations
    text = "".join([char for char in text if char not in string.punctuation])
    # Creating tokens
    tokens = re.split('\W+', text)
    # removing stopwords and stemming - snowball stemming
    text_final = [ps.stem(word) for word in tokens if word not in stopwords]
    # creating a list of tokens
    text_final = " ".join(text_final)
    return text_final

In [9]:
data['cleaned_text'] = data['Text'].apply(lambda x: clean_text(x.lower()))
data.head()

Unnamed: 0,Label,Text,body_len,punct%,cleaned_text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7,free entri 2 wkli comp win fa cup final tkt 21...
1,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1,nah dont think goe usf live around though
2,ham,Even my brother is not like to speak with me. ...,62,3.2,even brother like speak treat like aid patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1,date sunday
4,ham,As per your request 'Melle Melle (Oru Minnamin...,135,4.4,per request mell mell oru minnaminungint nurun...


In [14]:
x = data.drop(['Label','Text'], axis = 1)
# Keeping new added features and cleaned text only
y = data['Label']

- Here we are first splitting to training and test set
- Then vectorizing the training set
- Use this training set vectorizer to transform the test set

### Split into train/test

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state = 0)

In [17]:
x_train.shape

(4453, 3)

In [18]:
x_test.shape

(1114, 3)

### Vectorize text
- Here using unigrams.
- We can try wih bigrams and trigrams as well and compare the accuracy

In [70]:
tfidf_vect = TfidfVectorizer(max_features = 5000)
# Taking top 5000 most frequent words based on the frequency
tfidf = tfidf_vect.fit(x_train['cleaned_text'])
# Fitting the vectrizer only on the training data
tfidf
# Fit will help to extract the features

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [71]:
# Now using this data to transform the training data and the test data
tfidf_train = tfidf.transform(x_train['cleaned_text'])
tfidf_test = tfidf.transform(x_test['cleaned_text'])

tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns = tfidf_vect.get_feature_names())
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns = tfidf_vect.get_feature_names())

In [72]:
tfidf_train_df.head()

Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zed,zero,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,üll
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
tfidf_test_df.head()

Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zed,zero,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,üll
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Here the index of X_train and tfidf_trainidf do not match and concat function matches with the index only
- But the body_text is still in the same order, so we have to ignore the indices by rest_index(drop=True) give axis =1 along which we have to concatenate, ie joining side by side

In [74]:
x_train_df = pd.concat([x_train[['body_len','punct%']].reset_index(drop=True), tfidf_train_df], axis=1)
x_train_df.head()

Unnamed: 0,body_len,punct%,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,...,zed,zero,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,üll
0,28,14.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,56,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,70,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,130,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
x_test_df = pd.concat([x_test[['body_len','punct%']].reset_index(drop=True),tfidf_test_df], axis=1)
x_test_df.head()

Unnamed: 0,body_len,punct%,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,...,zed,zero,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,üll
0,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,72,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35,2.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,63,6.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- tfidf_train_df and tfidf_test_df contains same number of columns as it is transformed based on the tfidf which takes the same words(words are columns) matching in the training data set

In [76]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_train_enc

array([0, 0, 0, ..., 0, 1, 0])

In [77]:
y_test_enc = le.transform(y_test)
y_test_enc

array([0, 0, 0, ..., 0, 0, 0])

### Final evaluation of models

In [78]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

## Using Random Forest Algorithm

In [102]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1) # passing best parameters as seen before

start = time.time()
rf_model = rf.fit(tfidf_train_df, y_train)
end = time.time()
fit_time = end-start # gives the training time

start = time.time()
y_pred = rf_model.predict(tfidf_test_df)
end = time.time()
pred_time = end-start # gives the prediction time

end=time.time()

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit_time:{}, Pred_time:{}, Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit_time:25.987, Pred_time:0.489, Precision: 1.0 / Recall: 0.858 / Accuracy: 0.98


## Using MultiNomial Naive Bayes

In [143]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha = 0.2)

start = time.time()
mnb_model = mnb.fit(tfidf_train_df, y_train_enc)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = mnb_model.predict(tfidf_test_df)
end = time.time()
pred_time = end-start

# precision, recall, fscore, train_support = score(y_test_enc, y_pred, pos_label=, average = 'binary')
# print('Fit_time:{}, Pred_time:{}, Precision: {} / Recall: {} / Accuracy: {}'.format(
#     round(fit_time,3),round(pred_time,3),round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

In [144]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_enc, y_pred)

0.9892280071813285

In [145]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_enc, y_pred)
cm

array([[958,   1],
       [ 11, 144]], dtype=int64)

## Using Gradient Boosting

In [146]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(tfidf_train_df, y_train)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = gb_model.predict(tfidf_test_df)
end = time.time()
pred_time = end-start

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit_time:{}, Pred_time:{}, Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time,3),round(pred_time,3),round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit_time:800.014, Pred_time:0.125, Precision: 0.955 / Recall: 0.826 / Accuracy: 0.97


- - Here we can see that Multinomial Naive Bayes is giving the best accuracy compared to Random Forest and Gradient Boosting.
- Here we will have to see the predict time as we will not every time train the model
- We will save this best model and then use this for predictions, so this is why we want low predict time
- We can see that random forest is taking more time to predict and less time to train because of paralleization
- Similarly, gradient boosting is taking more time to train because of sequential process and less time to predict
- So while selecting a model we have to do trade offs and see which model is required accordingly
- If we are focussing on False Positives then we have to see that Precision is high
- If we are focussing on False Negatives then we have to see that  Recall is high 

In [120]:
# Now making a single prediction
inp_text1 = 'Free Free Free!! Flat 20% off on all Shirts!! Avail this offer and get upto 3500 cashback'
inp_text2 = 'Hi Yash! I will be coming to India on 25th October. Hope you are free at that time. Lets meet its been long!'
inp_text3 = 'Hurray! Flat--20%+30% discount--coupons available!!! Grab this chance to get upto--70% profits on SBI credit cards'
inp_text4 = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
inp_df = pd.DataFrame([inp_text1,inp_text2,inp_text3,inp_text4], columns = ['Text'])

inp_df

Unnamed: 0,Text
0,Free Free Free!! Flat 20% off on all Shirts!! ...
1,Hi Yash! I will be coming to India on 25th Oct...
2,Hurray! Flat--20%+30% discount--coupons availa...
3,Free entry in 2 a wkly comp to win FA Cup fina...


In [82]:
# # Creating additional features
# inp_df['body_len'] = inp_df['Text'].apply(lambda x: len(x) - x.count(" "))
# inp_df['punct%'] = inp_df['Text'].apply(lambda x: count_punct(x))
# inp_df.head()

Unnamed: 0,Text,body_len,punct%
0,Free Free Free!! Flat 20% off on all Shirts!! ...,73,6.8
1,Hi Yash! I will be coming to India on 25th Oct...,86,4.7
2,Hurray! Flat--20%+30% discount--coupons availa...,121,11.6


In [121]:
inp_df['cleaned_text'] = inp_df['Text'].apply(lambda x: clean_text(x.lower()))
inp_df.head()

Unnamed: 0,Text,cleaned_text
0,Free Free Free!! Flat 20% off on all Shirts!! ...,free free free flat 20 shirt avail offer get u...
1,Hi Yash! I will be coming to India on 25th Oct...,hi yash come india 25th octob hope free time l...
2,Hurray! Flat--20%+30% discount--coupons availa...,hurray flat2030 discountcoupon avail miss chan...
3,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...


In [122]:
x_inp = inp_df.drop('Text', axis = 1)
x_inp

Unnamed: 0,cleaned_text
0,free free free flat 20 shirt avail offer get u...
1,hi yash come india 25th octob hope free time l...
2,hurray flat2030 discountcoupon avail miss chan...
3,free entri 2 wkli comp win fa cup final tkt 21...


In [123]:
# Tansforming the input text using Tfidf vectorizer which was already trained
x_inp_tfidf = tfidf.transform(x_inp['cleaned_text'])
x_inp_tfidf_df = pd.DataFrame(x_inp_tfidf.toarray())
x_inp_tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
# x_inp_df = pd.concat([x_inp[['body_len','punct%']].reset_index(drop=True), x_inp_tfidf_df], axis=1)
# x_inp_df.head()

In [128]:
rf_model.predict(x_inp_tfidf_df)

array(['ham', 'ham', 'ham', 'spam'], dtype=object)