# MLP with TF-IDF
---

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch import Tensor

import numpy as np
from scipy.sparse import csr_matrix

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Choose the compute device once: the GPU when CUDA is usable, else the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

cuda


In [None]:
# Training hyperparameters used by the cells below.
EPOCHS = 30           # number of full passes over the training loader
BATCH_SIZE = 64       # mini-batch size of the training DataLoader
LEARNING_RATE = 1e-3  # learning rate handed to the Adam optimizer

### This notebook uses TF-IDF features. To run the same analysis with BOW or bigram features, reuse this code and change only the data-loading cell.

e.g. `train_TF_IDF.p` -> `train_BOW.p`




In [None]:
# Load data
#
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must only ever come from a trusted source.

import pickle


def _load_split(path):
    """Read one pickled split and return ``(X, y, wordlist)``.

    Each file is read as three consecutive pickles, in this order: the
    feature matrix, the labels and the filtered word list.
    """
    with open(path, 'rb') as handle:
        features = pickle.load(handle)
        labels = pickle.load(handle)
        wordlist = pickle.load(handle)
    return features, labels, wordlist


# wordlist_filtered is overwritten by every file, as in the original cell; the
# value that survives is the one stored in the last file read.
X_train, y_train, wordlist_filtered = _load_split('/train_TF_IDF.p')
X_test_K1, y_test_K1, wordlist_filtered = _load_split('/test_K1_TF_IDF.p')
X_test_K2, y_test_K2, wordlist_filtered = _load_split('/test_K2_TF_IDF.p')
X_test_K3, y_test_K3, wordlist_filtered = _load_split('/test_K3_TF_IDF.p')
X_test_L, y_test_L, wordlist_filtered = _load_split('/test_L_TF_IDF.p')

# Include top 1000 frequent words only
# NOTE(review): nothing is filtered here; presumably the pickled matrices were
# already restricted to 1000 columns upstream -- TODO confirm.

# Densify the training features and show their shape and contents.
X_train = X_train.todense()
print(X_train.shape)
print(X_train)

# The K1 split is the one used for evaluation; point X_test / y_test at
# another loaded split (K2, K3, L) to evaluate on that one instead.
X_test = X_test_K1.todense()
print(X_test.shape)
print(X_test)

y_test = y_test_K1

(26939, 1000)
[[0.         0.         0.         ... 0.02843602 0.04265403 0.07582938]
 [0.         0.         0.         ... 0.02389078 0.03412969 0.0887372 ]
 [0.         0.         0.00326797 ... 0.05882353 0.06535948 0.08496732]
 ...
 [0.         0.         0.         ... 0.06341463 0.02926829 0.10243902]
 [0.         0.         0.         ... 0.05555556 0.04166667 0.08333333]
 [0.         0.         0.         ... 0.04958678 0.07438017 0.04132231]]
(17959, 1000)
[[0.         0.         0.         ... 0.01812689 0.04229607 0.03927492]
 [0.         0.         0.         ... 0.02840909 0.04829545 0.05113636]
 [0.         0.00452489 0.         ... 0.05429864 0.04524887 0.04977376]
 ...
 [0.         0.         0.         ... 0.03061224 0.05510204 0.10612245]
 [0.         0.         0.         ... 0.02439024 0.03658537 0.11890244]
 [0.         0.         0.         ... 0.02446483 0.05810398 0.0733945 ]]


In [None]:
# Replace each label container with whatever its ``.values`` attribute holds
# (presumably pandas Series -> NumPy arrays; TODO confirm) before the labels
# are handed to torch.FloatTensor further down.
y_train = y_train.values
y_test = y_test.values

In [None]:
## train data
class TrainData(Dataset):
    """Map-style dataset that pairs each training feature row with its label."""

    def __init__(self, X_data, y_data):
        # Only references are stored; nothing is copied or converted here.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The sample count is taken from the feature container alone.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

# Convert the training features and labels to float tensors and wrap them.
train_data = TrainData(
    X_data=torch.FloatTensor(X_train),
    y_data=torch.FloatTensor(y_train),
)

## test data    
class TestData(Dataset):
    """Map-style dataset yielding ``(features, label)`` pairs of the test split."""

    def __init__(self, X_data, y_data):
        # Keep the two containers as given; indexing happens lazily on access.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # Length follows the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        sample = self.X_data[index]
        target = self.y_data[index]
        return sample, target
    

# Convert the test features and labels to float tensors and wrap them.
test_data = TestData(
    X_data=torch.FloatTensor(X_test),
    y_data=torch.FloatTensor(y_test),
)

In [None]:
# Training batches are shuffled; the test set is served one sample per batch
# without shuffling.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
class BinaryClassification(nn.Module):
    """Two-hidden-layer MLP that emits one raw logit per sample.

    The forward pass is Linear -> ReLU -> BatchNorm -> Linear -> ReLU ->
    BatchNorm -> Dropout -> Linear. No sigmoid is applied to the output.

    Args:
        input_dim: Number of input features. Defaults to 1000, the width used
            by this notebook.
        hidden_dim: Width of both hidden layers. Defaults to 64.
        dropout: Drop probability of the dropout layer placed before the
            output layer. Defaults to 0.1.
    """

    def __init__(self, input_dim: int = 1000, hidden_dim: int = 64,
                 dropout: float = 0.1) -> None:
        super().__init__()
        # Attribute names and creation order are kept unchanged so printed
        # summaries and state_dict keys stay the same.
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        self.layer_2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer_out = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_dim)
        self.batchnorm2 = nn.BatchNorm1d(hidden_dim)

    def forward(self, inputs):
        """Return the logits for ``inputs``, one output column per row."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Build the network, move it to the selected device and print its layers.
model = BinaryClassification().to(DEVICE)
print(model)

# The loss takes raw logits, which is what the model's forward pass returns.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

BinaryClassification(
  (layer_1): Linear(in_features=1000, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [None]:
def binary_acc(y_pred, y_test):
    """Return the accuracy of binary logits as a whole-number percentage.

    Args:
        y_pred: Raw logits; sigmoid followed by rounding turns them into 0/1
            predictions.
        y_test: Ground-truth labels holding as many elements as ``y_pred``.
            Any shape with a matching element count is accepted, e.g. ``(N,)``
            labels against ``(N, 1)`` logits.

    Returns:
        A scalar tensor: the percentage of correct predictions, rounded to
        the nearest integer.

    Raises:
        RuntimeError: If ``y_test`` cannot be reshaped to the shape of
            ``y_pred`` (element counts differ).
    """
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    # Align the labels with the predictions before comparing: (N, 1) == (N,)
    # would broadcast to (N, N) and inflate the count of "correct" results.
    y_test = y_test.reshape(y_pred_tag.shape)

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    acc = torch.round(acc * 100)

    return acc

In [None]:
def train(model, train_loader, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Relies on the module-level ``DEVICE``, ``criterion`` and ``binary_acc``.

    Args:
        model: Network to optimise; it is switched to training mode.
        train_loader: Iterable of ``(X_batch, y_batch)`` tensor pairs.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        ``(train_loss, train_accuracy)``: the per-sample average of the batch
        losses and of the batch accuracies reported by ``binary_acc``.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    accuracy_sum = 0.0
    sample_count = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
        # The model emits one column per sample, so give the labels the same
        # trailing dimension.
        targets = y_batch.unsqueeze(1)
        batch_size = targets.shape[0]

        optimizer.zero_grad()
        y_pred = model(X_batch)

        loss = criterion(y_pred, targets)
        acc = binary_acc(y_pred, targets)

        loss.backward()
        optimizer.step()

        # Weight every batch by its size: a plain mean over batches would
        # over-weight a smaller final batch.
        # NOTE(review): assumes ``criterion`` returns the batch mean (the
        # default reduction of BCEWithLogitsLoss) -- confirm if it changes.
        loss_sum += loss.item() * batch_size
        accuracy_sum += acc.item() * batch_size
        sample_count += batch_size
    train_loss = loss_sum / sample_count
    train_accuracy = accuracy_sum / sample_count
    return train_loss, train_accuracy

In [None]:
def evaluate(model, test_loader):
    """Score ``model`` on ``test_loader`` without updating any weights.

    Relies on the module-level ``DEVICE``, ``criterion`` and ``binary_acc``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        test_loader: Iterable of ``(X_batch, y_batch)`` tensor pairs.

    Returns:
        ``(test_loss, test_accuracy)``: the per-sample average of the batch
        losses and of the batch accuracies reported by ``binary_acc``.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    accuracy_sum = 0.0
    sample_count = 0
    # Gradients are not needed for scoring.
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            # Give the labels the same trailing dimension as the model output.
            targets = y_batch.unsqueeze(1)
            batch_size = targets.shape[0]

            y_pred = model(X_batch)

            loss = criterion(y_pred, targets)
            acc = binary_acc(y_pred, targets)

            # Weight every batch by its size so the result is a per-sample
            # average whatever batch size the loader uses.
            # NOTE(review): assumes ``criterion`` returns the batch mean (the
            # default reduction of BCEWithLogitsLoss) -- confirm if it changes.
            loss_sum += loss.item() * batch_size
            accuracy_sum += acc.item() * batch_size
            sample_count += batch_size
    test_loss = loss_sum / sample_count
    test_accuracy = accuracy_sum / sample_count
    return test_loss, test_accuracy

In [None]:
# Train for EPOCHS epochs and score the test loader after every epoch,
# printing both sets of metrics each time.
for epoch in range(1, EPOCHS + 1):
    train_loss, train_accuracy = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)
    print('[{}] Train Loss: {:.4f}, Train Accuracy: {:.2f}%'.format(
      epoch, train_loss, train_accuracy))
    print('[{}] Test Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(
          epoch, test_loss, test_accuracy))
  

In [None]:
# Train for EPOCHS epochs, printing only the training metrics per epoch.
# This reuses the same ``model`` and ``optimizer`` objects, so running it
# after another training cell continues from the already-trained weights.
for epoch in range(1, EPOCHS + 1):
    train_loss, train_accuracy = train(model, train_loader, optimizer)
    print('[{}] Train Loss: {:.4f}, Train Accuracy: {:.2f}%'.format(
      epoch, train_loss, train_accuracy))



# Score the test loader once, after training; ``epoch`` still holds the last
# value assigned by the loop above.
test_loss, test_accuracy = evaluate(model, test_loader)
print('[{}] Test Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(
          epoch, test_loss, test_accuracy))

[1] Train Loss: 0.1278, Train Accuracy: 95.28%
[2] Train Loss: 0.0809, Train Accuracy: 96.99%
[3] Train Loss: 0.0664, Train Accuracy: 97.49%
[4] Train Loss: 0.0562, Train Accuracy: 97.81%
[5] Train Loss: 0.0446, Train Accuracy: 98.29%
[6] Train Loss: 0.0415, Train Accuracy: 98.40%
[7] Train Loss: 0.0326, Train Accuracy: 98.63%
[8] Train Loss: 0.0288, Train Accuracy: 98.87%
[9] Train Loss: 0.0236, Train Accuracy: 99.04%
[10] Train Loss: 0.0220, Train Accuracy: 99.13%
[11] Train Loss: 0.0204, Train Accuracy: 99.21%
[12] Train Loss: 0.0263, Train Accuracy: 98.95%
[13] Train Loss: 0.0194, Train Accuracy: 99.23%
[14] Train Loss: 0.0163, Train Accuracy: 99.38%
[15] Train Loss: 0.0148, Train Accuracy: 99.41%
[16] Train Loss: 0.0137, Train Accuracy: 99.47%
[17] Train Loss: 0.0106, Train Accuracy: 99.59%
[18] Train Loss: 0.0121, Train Accuracy: 99.52%
[19] Train Loss: 0.0104, Train Accuracy: 99.58%
[20] Train Loss: 0.0102, Train Accuracy: 99.65%
[21] Train Loss: 0.0092, Train Accuracy: 99.65%
[

In [None]:
# Re-score the current model on the test loader and print the result, labelled
# with the value ``epoch`` was left at by the last training loop that ran.
test_loss, test_accuracy = evaluate(model, test_loader)
print('[{}] Test Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(
          epoch, test_loss, test_accuracy))

[30] Test Loss: 0.1490, Test Accuracy: 97.16%
