In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
df = pd.read_csv('E:/emails.csv')

In [61]:
from sklearn.utils import resample

In [63]:
max_count = df['spam'].value_counts().max()

balanced_data = []
for spm in df['spam'].unique():
    category_data = df[df['spam'] == spm]
    if len(category_data) < max_count:
        balanced_category_data = resample(category_data,replace=True,n_samples=max_count,random_state=42)
    else:
        balanced_category_data = resample(category_data,replace=True,n_samples=max_count,random_state=42)
    balanced_data.append(balanced_category_data)
balanced_df = pd.concat(balanced_data)

In [65]:
balanced_df['spam'].value_counts()

spam
1    4360
0    4360
Name: count, dtype: int64

In [6]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [7]:
df.shape

(5728, 2)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [9]:
df.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


In [12]:
import re
stop_words = stopwords.words('english')

In [13]:
ps = PorterStemmer()

In [15]:
def stemming(content):
    cleaned_text = re.sub('[^a-zA-Z]', ' ',content)
    cleaned_text = cleaned_text.lower()
    cleaned_text = cleaned_text.split()
    cleaned_text = [ps.stem(txt) for txt in cleaned_text if not txt in stop_words]
    cleaned_text = ' '.join(cleaned_text)
    return cleaned_text

In [66]:
balanced_df['text'] = df['text'].apply(stemming)

In [67]:
balanced_df

Unnamed: 0,text,spam
1126,subject save money get oem softwar need softwa...,1
860,subject perfect visual solut busi work compani...,1
1294,subject new peni enlarg patch new peni enlarg ...,1
1130,subject med girl happi girl unsatisfi potenc w...,1
1095,subject natur irresist corpor ident lt realli ...,1
...,...,...
3544,subject rice seminar hello fyi jone graduat sc...,0
2254,subject paper request inform meet san antonio ...,0
1708,subject site john griebl optic network engin a...,0
3419,subject summari spreadsheet data vendor resear...,0


In [24]:
df.head()

Unnamed: 0,text,spam
0,subject natur irresist corpor ident lt realli ...,1
1,subject stock trade gunsling fanni merril muzo...,1
2,subject unbeliev new home made easi im want sh...,1
3,subject color print special request addit info...,1
4,subject money get softwar cd softwar compat gr...,1


In [68]:
X = balanced_df['text'].values

In [69]:
y = balanced_df['spam'].values

In [70]:
X_train ,X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
vect = TfidfVectorizer()

In [73]:
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [74]:
X_train.shape

(6976, 21583)

In [75]:
model = LogisticRegression()

In [76]:
model.fit(X_train, y_train)

In [77]:
prediction = model.predict(X_test)

In [78]:
print('Accuracy of the model is:', accuracy_score(y_test, prediction))

Accuracy of the model is: 0.9931192660550459


In [79]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       879
           1       0.99      1.00      0.99       865

    accuracy                           0.99      1744
   macro avg       0.99      0.99      0.99      1744
weighted avg       0.99      0.99      0.99      1744



In [80]:
print(confusion_matrix(y_test,prediction))

[[868  11]
 [  1 864]]


In [81]:
import pickle

In [82]:
pickle.dump(model,open('model.pkl','wb'))
pickle.dump(vect,open('vector.pkl','wb'))