In [1]:
%%time
from os import listdir, getcwd, chdir
from os.path import isfile, join, dirname, realpath
import pandas as pd
import re
from bs4 import BeautifulSoup as bs
import nltk
from nltk.corpus import wordnet
from nltk import word_tokenize
import numpy as np
import string
from nltk.corpus import stopwords
import os
import email
from html.parser import HTMLParser
import matplotlib.pyplot as plt
import seaborn as sns

Wall time: 1.66 s


In [2]:
%%time
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

Wall time: 2.67 ms


In [3]:
df = pd.read_csv('C:/Users/bodie/Documents/Spreadsheets/emails_not_normalized.csv')

In [4]:

df['full_text'] = df['full_text'].apply(normalize_document)

In [5]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.calibration import CalibratedClassifierCV


clf = LinearSVC(random_state=0, tol=1e-5)

calibrated_clf = CalibratedClassifierCV(base_estimator=clf, cv=3)

text_clf = Pipeline([
  ('vect', CountVectorizer()),
   ('tfidf', TfidfTransformer()),
('calibrated_clf',calibrated_clf)])

Wall time: 7.36 ms


In [6]:
print(text_clf)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('calibrated_clf',
                 CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0,
                                                                 tol=1e-05),
                                        cv=3))])


In [7]:
%%time

y = df['is_spam'].values
   
X = df['full_text'].tolist()


Wall time: 0 ns


In [8]:
X

['amailbotwebde thu aug returnpath amailbotwebde deliveredto zzzzlocalhostspamassassintaintorg received localhost localhost phoboslabsspamassassintaintorg postfix esmtp id bc thu aug edt received mailwebnotenet localhost pop fetchmail zzzzlocalhost singledrop thu aug ist received ddit webnotenet esmtp id naa thu aug amailbotwebde received rsmtpkoreacom ddit microsoft smtpsvc sat aug 2 : 10 + 0900 : subject : life insurance - pay ? date : wed , 21 aug 2002 20 : 31 : 57 - 1600 mime - version : 1 . 0 message - id : < 0103c1042001882dd_it7 @ dd_it7 > content - type : text / html ; charset =" iso - 8859 - 1 " content - transfer - encoding : quoted - printable <= / tr > save 70 % life insurance . spend ? life quote savings ensurin = g family \' financial security important . life quote savings = kes buying life insurance simple affordable . provide free access = best companies lowest rates . life quote savings fast , eas = saves money ! let us help get started best val = ues country new cove

In [9]:
%%time
text_clf.fit(X,y)

Wall time: 2.88 s


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('calibrated_clf',
                 CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0,
                                                                 tol=1e-05),
                                        cv=3))])

In [10]:
%%time
text_clf.predict_proba(X)[:5, :]

Wall time: 3.97 s


array([[6.67032143e-04, 9.99332968e-01],
       [8.46451173e-02, 9.15354883e-01],
       [2.53029788e-03, 9.97469702e-01],
       [6.95580440e-03, 9.93044196e-01],
       [1.07002678e-01, 8.92997322e-01]])

In [11]:
df_idf = pd.DataFrame(text_clf['tfidf'].idf_, index=text_clf['vect'].get_feature_names(),columns=["idf_weights"]) 

df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
date,1.000535
subject,1.001498
id,1.024019
received,1.025444
returnpath,1.026651
...,...
hovels,9.450412
57d10910d1f977334df9f1b25c6a2d08,9.450412
57c6,9.450412
howd,9.450412


In [12]:
%%time
probabilities = text_clf.predict_proba(X)

Wall time: 2.2 s


In [13]:
%%time
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

Wall time: 3.99 ms


In [14]:
%%time
text_clf.fit(X_train,y_train)

Wall time: 2.26 s


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('calibrated_clf',
                 CalibratedClassifierCV(base_estimator=LinearSVC(random_state=0,
                                                                 tol=1e-05),
                                        cv=3))])

In [15]:
%%time
predictions = text_clf.predict(X_test)

Wall time: 565 ms


In [16]:
%%time
from sklearn.metrics import classification_report

target_names = ['class 0', 'class 1']

print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      1722
     class 1       1.00      0.98      0.99       617

    accuracy                           0.99      2339
   macro avg       0.99      0.99      0.99      2339
weighted avg       0.99      0.99      0.99      2339

Wall time: 9.98 ms


In [17]:
df_idf.to_excel("weights.xlsx")

In [18]:
import pickle

filename = 'finalized_model.sav'
pickle.dump(text_clf, open(filename, 'wb'))

In [19]:
test_prediction = X_test[0]

In [20]:
loaded_model = pickle.load(open('finalized_model.sav', 'rb'))

In [21]:

test_prediction = [test_prediction]

loaded_model.predict(test_prediction)

array([1], dtype=int64)