In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC

In [2]:
train = pd.read_csv('data/inbound_messages.csv')
train.head()

Unnamed: 0,text,conversation_type,spam,manual_class
0,Camila here are you still looking for hookup ?...,1,False,Other Site
1,hi,1,False,Short Greeting
2,"Heya, you responded to my CityXGuide ad a whil...",1,False,Other Site
3,Do you know that law enforcement are using Cit...,1,False,Other Site
4,<img>https://api.twilio.com/2010-04-01/Account...,1,False,Image


In [3]:
train.spam.value_counts()

False    206
True     108
Name: spam, dtype: int64

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 4 columns):
text                 310 non-null object
conversation_type    314 non-null int64
spam                 314 non-null bool
manual_class         287 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 7.7+ KB


In [5]:
columns = ['text', 'spam']
train = train[columns].dropna()
X = train.text
y = train.spam
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.3, random_state=2)

## TD-IDF with NaiveBayes

In [12]:
nb_tf = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', MultinomialNB())])

tf_params = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english'],
    'clf__alpha': [0.01, 0.1, 0.5, 1]
}

model = GridSearchCV(nb_tf, param_grid=tf_params, cv = 5, return_train_score=True)
model.fit(Xtrain, ytrain)
#model.cv_results_



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [13]:
model.best_params_

{'clf__alpha': 1, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}

In [14]:
params = {'clf__alpha': 0.5}

In [15]:
kfold = StratifiedKFold(n_splits=5, random_state=0)
cv_results = cross_validate(nb_tf, X.ravel(), y, cv=kfold,return_train_score=True)
print("Train accuracy: {}".format(cv_results['train_score'].mean()))
print("Test accuracy: {}".format(cv_results['test_score'].mean()))

Train accuracy: 0.882373637163254
Test accuracy: 0.6825589886933091


In [16]:
tfidf_best = model.best_estimator_
print('Train accuracy: {}'.format(model.score(Xtrain, ytrain)))
print('Test accuracy: {}'.format(model.score(Xtest, ytest)))

Train accuracy: 0.8847926267281107
Test accuracy: 0.7419354838709677


In [9]:
train.spam.value_counts()

False    202
True     108
Name: spam, dtype: int64

## TD-IDF with NaiveBayes (Upsampled data)

In [10]:
count_class_0, count_class_1 = train.spam.value_counts()

class_0 = train[train.spam == False]
class_1 = train[train.spam == True]

class_1_plus = class_1.sample(count_class_0, replace=True)
train_upsampled = pd.concat([class_0, class_1_plus], axis=0)

train_upsampled.spam.value_counts()

True     202
False    202
Name: spam, dtype: int64

In [11]:
X = train_upsampled.text
y = train_upsampled.spam
Xtrain_, Xtest_, ytrain_, ytest_ = train_test_split(X, y, stratify=y, test_size=0.3, random_state=2)
model = GridSearchCV(nb_tf, param_grid=tf_params, cv = 4, return_train_score=True)
model.fit(Xtrain_, ytrain_)
#model.cv_results_



GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [12]:
model.best_params_

{'clf__alpha': 0.01,
 'tfidf__ngram_range': (1, 3),
 'tfidf__stop_words': 'english'}

### On upsampled test data

In [13]:
tfidf_best = model.best_estimator_
print('Train accuracy_: {}'.format(tfidf_best.score(Xtrain_, ytrain_)))
print('Test accuracy_: {}'.format(tfidf_best.score(Xtest_, ytest_)))

Train accuracy_: 0.9858156028368794
Test accuracy_: 0.7868852459016393


In [14]:
print(classification_report(ytest, tfidf_best.predict(Xtest)))

              precision    recall  f1-score   support

       False       0.89      0.69      0.78        61
        True       0.59      0.84      0.69        32

    accuracy                           0.74        93
   macro avg       0.74      0.77      0.74        93
weighted avg       0.79      0.74      0.75        93



### On original test data

In [15]:
print('Test accuracy: {}'.format(tfidf_best.score(Xtest, ytest)))

Test accuracy: 0.7419354838709677


### Dimensionality Reduction on TD-IDF Features

In [16]:
nb_tf = Pipeline([('tfidf', TfidfVectorizer()),
                  ('pca', TruncatedSVD()),
                ('clf', GaussianNB())])

tf_params = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english'],
    'pca__n_components': [800],
    #'clf__alpha': [0.01, 0.1, 0.5, 1]
}

model = GridSearchCV(nb_tf, param_grid=tf_params, cv = 4, return_train_score=True)
model.fit(Xtrain, ytrain)
#model.cv_results_



GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [17]:
model.best_params_

{'pca__n_components': 800,
 'tfidf__ngram_range': (1, 3),
 'tfidf__stop_words': None}

In [18]:
tfidf_best = model.best_estimator_
print('Train accuracy: {}'.format(model.score(Xtrain, ytrain)))
print('Test accuracy: {}'.format(model.score(Xtest, ytest)))

Train accuracy: 0.695852534562212
Test accuracy: 0.4946236559139785


### Support Vector Machines

In [19]:
nb_tf = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', SVC())])

tf_params = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english'],
    'clf__kernel': ['linear', 'rbf'],
    'clf__C': [0.01, 0.1, 1, 10]
}

model = GridSearchCV(nb_tf, param_grid=tf_params, cv = 4, return_train_score=True)
model.fit(Xtrain, ytrain)
#model.cv_results_



GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [20]:
model.best_params_

{'clf__C': 1,
 'clf__kernel': 'linear',
 'tfidf__ngram_range': (1, 3),
 'tfidf__stop_words': None}

In [21]:
tfidf_best = model.best_estimator_
print('Train accuracy: {}'.format(model.score(Xtrain, ytrain)))
print('Test accuracy: {}'.format(model.score(Xtest, ytest)))

Train accuracy: 0.9585253456221198
Test accuracy: 0.7634408602150538


In [22]:
print(classification_report(ytest, model.predict(Xtest)))

              precision    recall  f1-score   support

       False       0.87      0.75      0.81        61
        True       0.62      0.78      0.69        32

    accuracy                           0.76        93
   macro avg       0.75      0.77      0.75        93
weighted avg       0.78      0.76      0.77        93



In [32]:
best

{'clf__alpha': 0.5, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': None}

In [33]:
nb_tf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

Train accuracy: 0.908446438156583
Test accuracy: 0.7137195121951219
