In [162]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [163]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [164]:
read_csv = pd.read_csv("data.csv")
read_csv

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [165]:
read_csv.dropna(subset=["email"],inplace = True)

text = read_csv["email"]
vectorizer = CountVectorizer()
bow_data = vectorizer.fit_transform(text).toarray()


X = torch.tensor(bow_data, dtype=torch.float)
y = torch.tensor(read_csv["label"], dtype = torch.float)

print(X[:1])

X.shape

tensor([[0., 0., 0.,  ..., 0., 0., 0.]])


torch.Size([2999, 34116])

In [166]:
import pickle

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [167]:
X_train, X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42)
len(X_train),len(X_test),len(y_train),len(y_test)

(2099, 900, 2099, 900)

In [168]:
#Defining Accuracy function

def accuracy_fn(y_true,y_pred):
    correct = torch.eq(y_true,y_pred).sum().item()
    acc = (correct/len(y_pred))*100
    return acc


In [169]:
# Making the Model
class SpamClassifierV0(nn.Module):
  def __init__(self):
    super().__init__()
    self.layer_1 = nn.Linear(in_features = 34116,out_features = 10)
    self.layer_2 = nn.Linear(in_features = 10,out_features = 10)
    self.layer_3 = nn.Linear(in_features = 10,out_features = 1)


  def forward(self,x):
    return self.layer_3(self.layer_2((self.layer_1(x))))

In [170]:
model_0 = SpamClassifierV0().to(device)
model_0

SpamClassifierV0(
  (layer_1): Linear(in_features=34116, out_features=10, bias=True)
  (layer_2): Linear(in_features=10, out_features=10, bias=True)
  (layer_3): Linear(in_features=10, out_features=1, bias=True)
)

In [171]:
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params = model_0.parameters(),lr = 0.01)

In [172]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
epochs = 300

X_train , X_test = X_train.to(device),X_test.to(device)
y_train , y_test = y_train.to(device),y_test.to(device)

for epoch in range(epochs):
  model_0.train()
  y_logits = model_0(X_train).squeeze()
  y_pred = torch.round(torch.sigmoid(y_logits))
  loss = loss_fn(y_logits,y_train)
  acc = accuracy_fn(y_true = y_train , y_pred=y_pred)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  model_0.eval()

  with torch.inference_mode():
    test_logits = model_0(X_test).squeeze()
    test_pred = torch.round(torch.sigmoid(test_logits))
    test_loss = loss_fn(test_logits,y_test)
    test_acc = accuracy_fn(y_true = y_test , y_pred=test_pred)

  if epoch % 10 == 0:
    print(f"Epoch: {epoch} | Loss: {loss:.5f} | Acc: {acc:.2f}% | Test Loss: {test_loss:.5f} | Test Acc : {test_acc:0.2f}")

Epoch: 0 | Loss: 0.83420 | Acc: 16.15% | Test Loss: 0.63676 | Test Acc : 61.22
Epoch: 10 | Loss: 0.03950 | Acc: 99.67% | Test Loss: 0.08610 | Test Acc : 98.44
Epoch: 20 | Loss: 0.00718 | Acc: 99.81% | Test Loss: 0.11648 | Test Acc : 98.33
Epoch: 30 | Loss: 0.00439 | Acc: 99.81% | Test Loss: 0.13814 | Test Acc : 98.44
Epoch: 40 | Loss: 0.00273 | Acc: 99.90% | Test Loss: 0.13284 | Test Acc : 98.67
Epoch: 50 | Loss: 0.00248 | Acc: 99.90% | Test Loss: 0.12343 | Test Acc : 98.78
Epoch: 60 | Loss: 0.00227 | Acc: 99.90% | Test Loss: 0.11398 | Test Acc : 99.00
Epoch: 70 | Loss: 0.00204 | Acc: 99.90% | Test Loss: 0.10422 | Test Acc : 99.11
Epoch: 80 | Loss: 0.00183 | Acc: 99.90% | Test Loss: 0.09491 | Test Acc : 99.00
Epoch: 90 | Loss: 0.00163 | Acc: 99.90% | Test Loss: 0.08665 | Test Acc : 99.00
Epoch: 100 | Loss: 0.00146 | Acc: 99.90% | Test Loss: 0.08011 | Test Acc : 99.11
Epoch: 110 | Loss: 0.00132 | Acc: 99.90% | Test Loss: 0.07527 | Test Acc : 99.11
Epoch: 120 | Loss: 0.00121 | Acc: 99.90

In [173]:
y_pred = model_0(X_test[:1])
print(y_pred)
y_test[:10]



tensor([[-56.7098]], device='cuda:0', grad_fn=<AddmmBackward0>)


tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], device='cuda:0')

In [174]:
from pathlib import Path

MODEL_PATH = Path("spam_classifier_v0.pth")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
torch.save(obj=model_0.state_dict(),f=MODEL_PATH)

In [175]:
model_1 = SpamClassifierV0()
model_1.load_state_dict(torch.load(f=MODEL_PATH))

<All keys matched successfully>

In [206]:
texts = [

"""have lost my bicycle on 10th August.I have bought it Last week. Last parked at infront of BH4. Please send me 1000 money """




]
with open('vectorizer.pkl', 'rb') as f:
  loaded_vectorizer = pickle.load(f)

bow_data = loaded_vectorizer.transform(texts).toarray()


X_pred = torch.tensor(bow_data, dtype=torch.float)

X_pred

tensor([[0., 0., 0.,  ..., 0., 0., 0.]])

In [207]:
model_1.eval()
with torch.inference_mode():
  y_logits = model_1(X_pred).squeeze()
  y_pred = torch.round(torch.sigmoid(y_logits))
y_pred = "It is Spam" if y_pred == 1 else "It is not Spam"

In [208]:
print(y_pred)

It is Spam
