In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
messages = pd.read_csv("SMSSpamCollection", sep='\t',names=['label','message'])

In [3]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
import re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [5]:
corpus = []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z0-9]'," ",messages['message'][i])
    review = review.lower().split()
    review = [w for w in review if w not in stop_words]
    corpus.append(" ".join(review))

y = pd.get_dummies(messages['label'])
y = y.iloc[:,1].values

## Making the model using BOW

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
pipeline_bow = Pipeline([
    ('cv', CountVectorizer()),
    ('nb', MultinomialNB())
])

param_grid_bow = {
    'cv__max_features': [1000, 2500, 5000],
    'cv__ngram_range': [(1,1), (1,2)],
    'cv__binary': [True, False],
    'nb__alpha': [0.1, 0.5, 1.0]
}

from sklearn.model_selection import GridSearchCV
grid_bow = GridSearchCV(
    pipeline_bow,
    param_grid_bow,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.20, random_state = 0)

grid_bow.fit(x_train,y_train)

0,1,2
,estimator,Pipeline(step...inomialNB())])
,param_grid,"{'cv__binary': [True, False], 'cv__max_features': [1000, 2500, ...], 'cv__ngram_range': [(1, ...), (1, ...)], 'nb__alpha': [0.1, 0.5, ...]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,0.5
,force_alpha,True
,fit_prior,True
,class_prior,


In [15]:
x_bow_pred = grid_bow.predict(x_test)
from sklearn.metrics import accuracy_score,classification_report
score_bow=accuracy_score(y_test,x_bow_pred)
print("BOW")
print("accuracy_score:",score_bow)
print(classification_report(y_test, x_bow_pred))

BOW
accuracy_score: 0.9865470852017937
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       955
        True       0.97      0.94      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



## Making the model using TF-IDF

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
pipeline_tfidf = Pipeline([
    ("tfidf",TfidfVectorizer()),
    ('nb',MultinomialNB())
])

param_grid_tfidf = {
    'tfidf__max_features': [1000, 2500, 5000],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__min_df': [1, 2],
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__sublinear_tf': [True, False],
    'nb__alpha': [0.1, 0.5, 1.0]
}

from sklearn.model_selection import GridSearchCV
grid_tfidf = GridSearchCV(
    pipeline_tfidf,
    param_grid_tfidf,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.20, random_state = 0)


grid_tfidf.fit(x_train,y_train)

0,1,2
,estimator,Pipeline(step...inomialNB())])
,param_grid,"{'nb__alpha': [0.1, 0.5, ...], 'tfidf__max_df': [0.9, 1.0], 'tfidf__max_features': [1000, 2500, ...], 'tfidf__min_df': [1, 2], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,True
,class_prior,


In [17]:
x_tfidf_pred = grid_tfidf.predict(x_test)
from sklearn.metrics import accuracy_score,classification_report
score_tfidf=accuracy_score(y_test,x_tfidf_pred)
print("TF-IDF:")
print("accuracy_score:",score_tfidf)
print(classification_report(y_test, x_tfidf_pred))

TF-IDF:
accuracy_score: 0.9865470852017937
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       955
        True       0.97      0.93      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



## Making the model using Word2vec

In [18]:
texts = corpus
labels = y

In [19]:
from sklearn.model_selection import train_test_split
x_train_text, x_test_text, y_train, y_test = train_test_split(texts,labels,test_size=0.20,
random_state=0
)

In [20]:
x_train_tokens = [text.split() for text in x_train_text]
x_test_tokens  = [text.split() for text in x_test_text]

In [21]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(
    sentences=x_train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

In [22]:
w2v_model.wv['free']

array([-4.2394608e-01,  4.1403919e-01,  1.6893286e-01, -1.3138114e-02,
        8.0497794e-02, -8.4007359e-01,  1.7287078e-01,  1.0787777e+00,
       -4.4820175e-01, -1.0549315e-01, -1.8637601e-01, -8.2650954e-01,
       -1.1745408e-01,  3.2410285e-01,  2.3800650e-01, -5.3685439e-01,
        8.8007329e-03, -8.0884242e-01, -7.6257959e-02, -9.9006009e-01,
        3.2691869e-01,  3.4602013e-01,  4.4323832e-01, -2.4113764e-01,
       -1.3380310e-01,  2.3233068e-01, -4.4354549e-01, -4.6879265e-01,
       -5.3219998e-01,  1.1592452e-01,  7.5670367e-01, -1.1466800e-01,
        2.2176942e-01, -4.0640685e-01, -6.2124047e-02,  6.3766837e-01,
       -6.4093841e-04, -4.4624105e-01, -3.9366123e-01, -1.0316622e+00,
        9.5143840e-02, -6.5371424e-01, -3.6449647e-01, -1.9915780e-02,
        2.7448437e-01, -1.7964233e-01, -4.8868865e-01, -7.1235426e-02,
        5.2818823e-01,  5.0249374e-01,  3.0329031e-01, -6.1824471e-01,
       -3.0666071e-01, -1.3279673e-01, -2.5868982e-01,  1.9546841e-01,
      

In [23]:
def sentence_vector(tokens, model, vector_size):
    vectors = [model.wv[w] for w in tokens if w in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

x_train_w2v = np.array([
    sentence_vector(tokens, w2v_model, 100)
    for tokens in x_train_tokens
])

x_test_w2v = np.array([
    sentence_vector(tokens, w2v_model, 100)
    for tokens in x_test_tokens
])


In [24]:
from sklearn.linear_model import LogisticRegression
clf_w2v = LogisticRegression()

param_grid_w2v = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
    'class_weight': [None, 'balanced']
}

from sklearn.model_selection import GridSearchCV
grid_w2v = GridSearchCV(clf_w2v,param_grid=param_grid_w2v,cv=5,scoring='f1',n_jobs=-1)

grid_w2v.fit(x_train_w2v,y_train)

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'C': [0.01, 0.1, ...], 'class_weight': [None, 'balanced'], 'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear']}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,100


In [25]:
x_w2v_pred = grid_w2v.predict(x_test_w2v)
from sklearn.metrics import accuracy_score,classification_report
score_w2v=accuracy_score(y_test,x_w2v_pred)
print("W2V:")
print("accuracy_score:",score_w2v)
print(classification_report(y_test,x_w2v_pred))


W2V:
accuracy_score: 0.9040358744394619
              precision    recall  f1-score   support

       False       0.98      0.91      0.94       955
        True       0.61      0.89      0.73       160

    accuracy                           0.90      1115
   macro avg       0.80      0.90      0.83      1115
weighted avg       0.93      0.90      0.91      1115



In [26]:
## Performance Analysis of Text Embeddings

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_metrics(y_true, y_pred):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-score": f1_score(y_true, y_pred)
    }

bow_metrics   = get_metrics(y_test, x_bow_pred)
tfidf_metrics = get_metrics(y_test, x_tfidf_pred)
w2v_metrics   = get_metrics(y_test, x_w2v_pred)

import pandas as pd

results = pd.DataFrame([
    {
        "Embedding": "BoW",
        "Model": "MultinomialNB",
        **bow_metrics
    },
    {
        "Embedding": "TF-IDF",
        "Model": "MultinomialNB",
        **tfidf_metrics
    },
    {
        "Embedding": "Word2Vec",
        "Model": "Logistic Regression",
        **w2v_metrics
    }
])

results

Unnamed: 0,Embedding,Model,Accuracy,Precision,Recall,F1-score
0,BoW,MultinomialNB,0.986547,0.967742,0.9375,0.952381
1,TF-IDF,MultinomialNB,0.986547,0.973856,0.93125,0.952077
2,Word2Vec,Logistic Regression,0.904036,0.613734,0.89375,0.727735
