In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from profanity_check import predict as prof_predict
from profanity_check import predict_prob as prof_predict_prob




In [2]:
dataset_path = "../dataset/spam.csv"
df = pd.read_csv(dataset_path, sep=",", encoding="latin-1")
df = df.rename(columns={"v1": "class", "v2": "text"})
df = df.drop(df.columns[2:], axis=1)

In [3]:
import string

from nltk.corpus import stopwords

def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

df['text'].apply(text_process)

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, å£750, Po...
5568                   [Ì, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: text, Length: 5572, dtype: object

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(df['text'],df['class'],test_size=0.2, random_state = 42)

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

cv = CountVectorizer(max_features = 1500)
cv.fit(X_train)

X_train_cv = cv.transform(X_train)

X_test_cv = cv.transform(X_test)

mnb = MultinomialNB(alpha = 0.5)
mnb.fit(X_train_cv,y_train)

y_mnb = mnb.predict(X_test_cv)

print('Naive Bayes Accuracy: ', accuracy_score( y_mnb , y_test))

print('Naive Bayes classification report: ', classification_report(y_mnb, y_test))

Naive Bayes Accuracy:  0.9829596412556054
Naive Bayes classification report:                precision    recall  f1-score   support

         ham       0.99      0.99      0.99       974
        spam       0.91      0.96      0.93       141

    accuracy                           0.98      1115
   macro avg       0.95      0.98      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [6]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
   ( 'bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',mnb),
])


pipeline.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x000002989511EA60>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))],
         verbose=False)

In [7]:
predictions = pipeline.predict(X_test)
print('Pipelined Naive Bayes accuracy score: ', accuracy_score(y_test,predictions))
print('Pipelined Naive Bayes classification report: ', classification_report(y_test,predictions))

Pipelined Naive Bayes accuracy score:  0.9739910313901345
Pipelined Naive Bayes classification report:                precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.81      0.89       150

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [8]:
tweet = "Claim a free fat cock for your prize"

sample_spam_predict = pipeline.predict([tweet])

sample_prof_predict = prof_predict([tweet])

sample_prof_predict_prob = prof_predict_prob([tweet])

print(sample_spam_predict, sample_prof_predict, sample_prof_predict_prob)

['spam'] [1] [0.74044736]


In [9]:
def moderatemytweet(tweet):
    sample_spam_predict = pipeline.predict([tweet])

    sample_prof_predict_prob = prof_predict_prob([tweet])

    my_mod = {'label' : str(sample_spam_predict), 'profanity_score' : float(sample_prof_predict_prob) }
    
    return my_mod
    

In [10]:
moderatemytweet("suck my fat cock")

{'label': "['ham']", 'profanity_score': 0.9721670938546879}

In [11]:
import joblib
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']

In [12]:
pipeline = joblib.load('pipeline.pkl')

In [17]:
joblib.dump(mnb, 'model.pkl')

['model.pkl']

In [18]:
joblib.dump(X_train, 'X_train.pkl')

['X_train.pkl']

In [19]:
joblib.dump(y_train, 'y_train.pkl')

['y_train.pkl']

In [16]:
joblib.dump(prof_predict_prob, 'prof_predict_prob.pkl')

['prof_predict_prob.pkl']

In [21]:
text_process('Helllo my name is Matthieu.')

['Helllo', 'name', 'Matthieu']