In [2]:
import string
import pandas as pd

Read the CSV file into a pandas `DataFrame`.

In [3]:
# Load the labelled e-mail data set and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='emails.csv')
df.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


Check the number of spam and non-spam ("ham") entries.

In [4]:
# Class balance: number of rows for each value of the `spam` label.
df.loc[:, 'spam'].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [5]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess(text, stop_words=None):
    """Normalise a piece of text before it is vectorised.

    Every character listed in ``string.punctuation`` is removed, the text is
    lowercased, split on any run of whitespace (spaces, tabs, newlines) and
    stripped of stop words.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, ``ENGLISH_STOP_WORDS`` is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # remove punctuation and lowercase
    text = text.translate(str.maketrans("", "", string.punctuation)).lower()

    # tokenize -- split() with no argument breaks on *any* whitespace and never
    # yields empty tokens, so words separated by newlines/tabs are not glued
    # together (which would let stop words slip through the filter below)
    tokens = text.split()

    # remove stopwords
    return " ".join(t for t in tokens if t not in stop_words)


# Clean every e-mail body; the `text` column is overwritten with the result.
df['text'] = df['text'].apply(preprocess)

In [6]:
# Preview the first five rows after preprocessing.
df.head(n=5)

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merril...,1
2,subject unbelievable new homes easy im wantin...,1
3,subject 4 color printing special request addi...,1
4,subject money software cds software compati...,1


Split data into X (features) and y (target labels), and learn vectors for the X data using the TF-IDF technique.

In [7]:
# Features are the cleaned text; the target is the spam label.
X, y = df['text'], df['spam']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights and vectorise the corpus in one pass
# (fit_transform avoids analysing every document twice).
# NOTE(review): the vectoriser is fitted on the whole corpus before the
# train/test split, so test documents contribute to the IDF statistics.
# Consider fitting on the training portion only when reporting metrics.
tfidf = TfidfVectorizer()
X_vectors = tfidf.fit_transform(X)

Split the data into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of the training features and labels.
for training_part in (X_train, y_train):
    print(training_part.shape)

(4582, 37023)
(4582,)


Train a K-Nearest Neighbours classifier on the training data

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Fit a k-nearest-neighbours classifier (constructor defaults) on the training
# split, then predict labels for the held-out test split.
knn_classifier = KNeighborsClassifier().fit(X_train, y_train)
y_pred1 = knn_classifier.predict(X_test)

Let's evaluate the model on the held-out test set using accuracy, precision and recall.

In [11]:
# Score the test-set predictions with each metric and print one line per metric.
metric_table = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(y_test, y_pred1))

Accuracy: 0.9729493891797557
Precision: 0.9814126394052045
Recall: 0.9103448275862069


Serialize the trained model and the fitted TF-IDF vectorizer with `joblib`, so we can load both in our Django application and run text through them to get predictions.

In [12]:
from joblib import dump

# Persist the classifier and the fitted vectoriser as separate files.
# The vectoriser is needed at inference time to turn new text into features.
dump(value=knn_classifier, filename='spam_model.joblib')
dump(value=tfidf, filename='tfidf.joblib')

['tfidf.joblib']

In [13]:
# Smoke test: run one hand-written message through the same
# preprocess -> vectorise -> predict steps used for the training data.
sample_message = "Hi, I tried phoning you but no answer - give me a call back."
sample_vector = tfidf.transform([preprocess(sample_message)])
knn_classifier.predict(sample_vector)

array([0], dtype=int64)

In [14]:
from collections import Counter 

# Tokenise each cleaned e-mail on whitespace and keep the lists in a new column.
df['words'] = df['text'].str.split()

# Tally how often every token appears across the e-mails labelled as spam.
spam_token_lists = df.loc[df['spam'] == 1, 'words']

counts = Counter()
for tokens in spam_token_lists:
    # update() counts the elements of an iterable directly; an empty list
    # simply adds nothing.
    counts.update(tokens)
    

In [15]:
# The 25 most frequent tokens in spam e-mails, most common first.
counts.most_common(n=25)

[('subject', 1574),
 ('s', 1333),
 ('com', 998),
 ('1', 952),
 ('business', 844),
 ('company', 805),
 ('email', 804),
 ('information', 740),
 ('e', 698),
 ('5', 687),
 ('money', 662),
 ('2', 613),
 ('free', 606),
 ('3', 604),
 ('http', 600),
 ('mail', 586),
 ('t', 577),
 ('000', 560),
 ('click', 531),
 ('just', 524),
 ('time', 521),
 ('new', 504),
 ('make', 496),
 ('website', 465),
 ('adobe', 462)]