In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import random_split
from torch.utils.data import DataLoader
from torch import nn
from torchsummary import summary

In [2]:
class Email(Dataset):
    
    def __init__(self, file_path):
        self.file_path = pd.read_csv(file_path)
        
    def __len__(self):
        return len(self.file_path)
    
    def __getitem__(self, index):
        word_count = torch.tensor([self.file_path.iloc[index, 1:-1]], dtype=torch.float)
        label = torch.tensor([self.file_path.iloc[index, -1]], dtype=torch.float)
        return word_count, label

In [3]:
class Network(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(3000, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.sigmoid(x)
        return x

In [4]:
def train_single_epoch(model, data_loader, loss_fn, optimiser):

    for sample, target in data_loader:

        # calculate loss
        target = target.flatten()
        prediction = model(sample)
        prediction = prediction.flatten()
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    return loss.item()

In [5]:
url='https://drive.google.com/file/d/15lHo3g3MacKROIxPXoKb5BKJlnkZ0KsT/view?usp=sharing'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]

em_set = Email(url)
print("length of the dataset is:", len(em_set))

length of the dataset is: 5172


In [6]:
rate = 0.8
LEN_em = len(em_set)
LEN_train = int(rate * LEN_em)
LEN_test = LEN_em - LEN_train

In [7]:
train_data, test_data = random_split(em_set, [LEN_train, LEN_test])
train_loader = DataLoader(train_data, batch_size = 128, shuffle = True)
test_loader = DataLoader(test_data, batch_size = 128, shuffle = False)

In [8]:
model = Network()
summary(model, (1, 3000))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 1]           3,001
           Sigmoid-2                 [-1, 1, 1]               0
Total params: 3,001
Trainable params: 3,001
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.00
Params size (MB): 0.01
Estimated Total Size (MB): 0.02
----------------------------------------------------------------


In [9]:
loss_fn = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
EPOCHS = 5
for i in range(EPOCHS):
    print(f"Epoch {i+1}")
    loss = train_single_epoch(model, train_loader, loss_fn, optimiser)
    print(f"tranning loss: {loss}")
    print("---------------------------")
print("Finished training")

Epoch 1
tranning loss: 0.12637469172477722
---------------------------
Epoch 2
tranning loss: 0.06488882750272751
---------------------------
Epoch 3
tranning loss: 0.056798841804265976
---------------------------
Epoch 4
tranning loss: 0.06736987084150314
---------------------------
Epoch 5
tranning loss: 0.05660796910524368
---------------------------
Finished training


In [11]:
correct_sample = 0.0
num_sample = 0.0

model.eval()
with torch.no_grad():
    for sample, target in test_loader:
        
        target = target.flatten()
        prediction = model(sample)
        prediction = prediction.flatten()
        y_pred = torch.round(prediction)
        num_sample += len(target)
        correct_sample += (y_pred == target).sum().float()
        
    acc = correct_sample / num_sample
    acc = acc.item()

print(f"validation accuracy: {acc}")

validation accuracy: 0.9671497344970703
