In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from torch import nn, optim
import pandas as pd
import pickle
import torch
import nltk

device = 'cuda' if torch.cuda.is_available() else 'cpu'

df = pd.read_csv('C:/Users/Arsh/Downloads/enron_spam_data.csv', usecols=[2, 3], names=['text', 'label'], skiprows=1)

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

df.head()

Unnamed: 0,text,label
0,,ham
1,"gary , production from the high island larger ...",ham
2,- calpine daily gas nomination 1 . doc,ham
3,fyi - see note below - already done .\nstella\...,ham
4,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham


In [2]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

df['label'].value_counts()

1    17171
0    16545
Name: label, dtype: int64

In [3]:
df.isnull().sum()

text     371
label      0
dtype: int64

In [4]:
df.dropna(inplace=True)

df.duplicated().sum()

3565

In [5]:
df.drop_duplicates(inplace=True)

df.reset_index(drop=True, inplace=True)

In [6]:
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply(lambda x: word_tokenize(x))
df['text'] = df['text'].apply(lambda x: [y for y in x if not y in stop_words])
df['text'] = df['text'].apply(lambda x: [stemmer.stem(y) for y in x])
df['text'] = df['text'].apply(lambda x: [y for y in x if y.isalpha()])
df['text'] = df['text'].apply(lambda x: ' '.join(x))

In [7]:
tfidf = TfidfVectorizer()

tfidf.fit(df['text'])

X = tfidf.transform(df['text'])
y = df['label']

X_train, y_train = RandomUnderSampler().fit_resample(X, y)

y_train.value_counts()

0    13987
1    13987
Name: label, dtype: int64

In [8]:
X_train = X_train.todense()

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)

dataset = TensorDataset(X_train, y_train)

dataloader = DataLoader(dataset, batch_size=64)

In [9]:
class SpamDetector(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(117022, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        x = nn.ReLU()(x)
        x = self.fc3(x)
        x = nn.ReLU()(x)
        x = self.fc4(x)
        x = nn.ReLU()(x)
        x = self.fc5(x)
        x = self.sigmoid(x)
        return x

model = SpamDetector().to(device)

model.load_state_dict(torch.load('C:/Users/Arsh/Desktop/SpamDetection_Save_last.pth'))

model.train()

loss_function = nn.BCELoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
for epoch in range(5):
    for (inputs, labels) in dataloader:
        
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()

        outputs = model(inputs)

        loss = loss_function(outputs, labels.unsqueeze(1))

        loss.backward()

        optimizer.step()

    print(f"Epoch: {epoch}, Loss: {loss.item()}")
    
#     if epoch % 50 == 0:
#         torch.save(model.state_dict(), f'C:/Users/Arsh/Desktop/SpamDetection_Save_{epoch}.pth')

Epoch: 0, Loss: 3.576282665562758e-07
Epoch: 1, Loss: 0.0
Epoch: 2, Loss: 1.9868217293605994e-08
Epoch: 3, Loss: 0.0
Epoch: 4, Loss: 0.0


In [11]:
model.eval()

my_text = ['Congratulations! You have won a free vacation to Hawaii! Click here to claim your prize.',
           "I'm so sorry, but I won't be able to make it to dinner tonight. I have to work late.",
           'Your credit card has been declined. Please update your payment information immediately.',
           'Can you please pick up some milk on your way home from work?',
           'You have been selected to receive a free sample of our new product. Click here to claim your gift.',
           "I'm so excited for the new season of Game of Thrones!",
           'Stop ignoring our calls! We need to collect your outstanding balance.',
           "I just got a new job! I'm so happy.",
          'Your computer has been infected with a virus. Click here to download our antivirus software.',
          "I'm going to the doctor tomorrow. I'm not feeling well.",
          "Congratulations! You have won a free trip to the Bahamas.",
           "Hi John, I hope you're doing well. Did you get a chance to look over the report I sent you earlier this week?",
          "Get rich quick! Invest in our amazing money-making scheme today.",
           "Hey Sarah, what time are we meeting for lunch today?",
          "Act now and receive a special discount on our miracle weight-loss pills.",
           "The weather is beautiful today, isn't it? I'm thinking of going for a walk in the park later.",
          "You have been selected to participate in a once-in-a-lifetime opportunity to earn millions of dollars.",
           "Thank you for your email. I appreciate your prompt response.",
          "Click here to claim your prize and join the thousands of satisfied customers who have already benefited from our product.",
          "Hi there, just wanted to remind you that our meeting is scheduled for tomorrow at 10am."]

# with open('C:/Users/Arsh/Desktop/tfidf.pkl', 'rb') as f:
#     tfidf = pickle.load(f)

prediction = None

for text in my_text:
    text1 = text.lower()
    text1 = word_tokenize(text1)
    text1 = [i for i in text1 if not i in stop_words]
    text1 = [stemmer.stem(i) for i in text1]
    text1 = [i for i in text1 if i.isalpha()]
    
    print(text, '\n')
    
    text1 = tfidf.transform([' '.join(text1)])

    text1 = text1.todense()

    text1 = torch.tensor(text1, dtype=torch.float32).to(device)
    
    with torch.no_grad():
        pred = model(text1)
        
    if pred[0][0].item() > 0.5:
        prediction = 'Spam'
    else:
        prediction = 'Non-Spam'

    print('Prediction:', prediction, '\n', 100 * '-', '\n')

Congratulations! You have won a free vacation to Hawaii! Click here to claim your prize. 

Prediction: Spam 
 ---------------------------------------------------------------------------------------------------- 

I'm so sorry, but I won't be able to make it to dinner tonight. I have to work late. 

Prediction: Non-Spam 
 ---------------------------------------------------------------------------------------------------- 

Your credit card has been declined. Please update your payment information immediately. 

Prediction: Spam 
 ---------------------------------------------------------------------------------------------------- 

Can you please pick up some milk on your way home from work? 

Prediction: Non-Spam 
 ---------------------------------------------------------------------------------------------------- 

You have been selected to receive a free sample of our new product. Click here to claim your gift. 

Prediction: Spam 
 -----------------------------------------------------