In [3]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [4]:
# Load the raw e-mail dataset; the file is decoded as latin-1.
DATA_PATH = "completeSpamAssassin.csv"
df = pd.read_csv(DATA_PATH, encoding="latin-1")

In [5]:
# Peek at the first rows of the freshly loaded frame to see which columns the CSV provides.
df.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [6]:
# Keep only the message text and its class label, under lower-case column names.
df = df[['Body', 'Label']].rename(columns={'Body': 'body', 'Label': 'label'})

In [7]:
# Preview again to confirm that only the renamed body/label columns remain.
df.head()

Unnamed: 0,body,label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1


In [8]:
# Class balance: number of rows per label value.
# NOTE(review): the sample bodies suggest label 1 is spam and 0 is ham; confirm against the dataset docs.
df['label'].value_counts()

label
0    4150
1    1896
Name: count, dtype: int64

In [9]:
# First ten message bodies carrying label 0.
df.loc[df['label'] == 0, 'body'].head(10)

1896        Date:        Wed, 21 Aug 2002 10:54:46 -05...
1897    Martin A posted:\nTassos Papadopoulos, the Gre...
1898    Man Threatens Explosion In Moscow Thursday Aug...
1899    Klez: The Virus That Won't Die\n \nAlready the...
1900    >  in adding cream to spaghetti carbonara, whi...
1901    \n> I just had to jump in here as Carbonara is...
1902    The Scotsman - 22 August 2002 Playboy wants to...
1903    Martin Adamson wrote:\n> \n> Isn't it just bas...
1904    The Scotsman Thu 22 Aug 2002  Meaningful sente...
1905    I have been trying to research via SA mirrors ...
Name: body, dtype: object

In [10]:
# First ten message bodies carrying label 1.
df.loc[df['label'] == 1, 'body'].head(10)

0    \nSave up to 70% on Life Insurance.\nWhy Spend...
1    1) Fight The Risk of Cancer!\nhttp://www.adcli...
2    1) Fight The Risk of Cancer!\nhttp://www.adcli...
3    ##############################################...
4    I thought you might like these:\n1) Slim Down ...
5    A POWERHOUSE GIFTING PROGRAM You Don't Want To...
6    Help wanted.  We are a 14 year old fortune 500...
7    ReliaQuote - Save Up To 70% On Life Insurance\...
8    TIRED OF THE BULL OUT THERE?\nWant To Stop Los...
9    Dear ricardo1 ,\nCOST EFFECTIVE Direct Email A...
Name: body, dtype: object

In [19]:
# Look at the first few message bodies on their own.
df['body'].head()

0    \nSave up to 70% on Life Insurance.\nWhy Spend...
1    1) Fight The Risk of Cancer!\nhttp://www.adcli...
2    1) Fight The Risk of Cancer!\nhttp://www.adcli...
3    ##############################################...
4    I thought you might like these:\n1) Slim Down ...
Name: body, dtype: object

In [11]:
# Normalise the text column so every body is a real string.
# Order matters: missing values must be replaced with "" *before* the cast,
# because astype(str) on an object column turns NaN into the literal text "nan",
# which a later fillna() can no longer detect (the token "nan" would then leak
# into the TF-IDF vocabulary).
df['body'] = df['body'].fillna("").astype(str)

# Sanity check: number of missing bodies left (expected 0).
df['body'].isna().sum()

np.int64(0)

In [13]:
# Character count of each message body, stored as a new 'length' column.
# Missing bodies are treated as empty text so len() always receives a string.
body_text = df['body'].fillna("")
df['length'] = body_text.apply(len)


In [14]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible
# and stratification preserves the label ratio in both parts.
# NOTE(review): the previews show bodies that look duplicated (e.g. rows 1 and 2);
# confirm whether de-duplication is needed before splitting to avoid train/test leakage.
texts = df['body']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts, targets, test_size=0.2, stratify=targets, random_state=42
)

In [32]:
# Text classifier: TF-IDF over uni- and bi-grams (vocabulary capped at 8000 terms)
# feeding a logistic regression that is allowed up to 1000 solver iterations.
tfidf = TfidfVectorizer(max_features=8000, ngram_range=(1, 2))
classifier = LogisticRegression(max_iter=1000)
model = make_pipeline(tfidf, classifier)

In [33]:
# Fit the whole pipeline (vectorizer and classifier) on the training split only.
model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [34]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

In [35]:
# Evaluate on the held-out split; precision/recall/F1 treat label 1 as the positive class.
print("Accuracy :", accuracy_score(y_test, y_pred))
for caption, scorer in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
):
    print(caption, scorer(y_test, y_pred, pos_label=1))

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

Accuracy : 0.952892561983471
Precision: 0.8985148514851485
Recall   : 0.9577836411609498
F1 Score : 0.9272030651340997


array([[790,  41],
       [ 16, 363]])

In [36]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be reloaded
# for inference without retraining. joblib is imported in the notebook's top import
# cell so that every dependency is visible up front.
# NOTE: joblib files are pickle-based; only load this artifact from trusted sources.
joblib.dump(model, "email_spam_classifier.pkl")

['email_spam_classifier.pkl']