In [1]:
import re
import string
import torch
from torch import nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from torch import optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# from nltk.stem import WordNetLemmatizer
# import nltk

In [2]:
device = "cpu"

In [3]:
df = pd.read_csv('twitter_cleaned.csv')
df = df.dropna()

In [4]:
df.head()

Unnamed: 0,label,message,category
0,neutral,prenatal move to wednesday at pm starting toni...,1
1,positive,happy nd birthday to prince george i cant beli...,2
2,positive,do not be afraid to be saint be open to the lo...,2
3,neutral,dst is saturday nightsunday morning got ta be ...,1
4,negative,sony reward app is like a lot of yo female sin...,0


In [60]:
x_train, x_test, y_train, y_test = train_test_split(
    df['message'], df['category'], test_size=.2, stratify=df['label'], random_state=0)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(39740,) (9935,) (39740,) (9935,)


In [61]:
vectorizer = TfidfVectorizer(max_features=2000)
# vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')

# Learn vocabulary from training texts and vectorize training texts.
x_train = vectorizer.fit_transform(x_train)

# Vectorize test texts.
x_test = vectorizer.transform(x_test)

In [62]:
x_train = torch.tensor(scipy.sparse.csr_matrix.todense(x_train)).float()
x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()

In [63]:
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

In [9]:
def topk_encoding(nd_array):
    """
    Function to flatten the predicted category
    """
    
    predictions = nd_array
    
    ps = torch.exp(predictions)
    top_p, top_class  = ps.topk(1, dim=1)
    

    return top_class

In [10]:
class NeuralNetwork(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        
        self.hidden_layer_1 = nn.Linear(x_train.shape[1], 64) # input to first hidden layer
        self.output_layer = nn.Linear(64, self.out_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x):
        x = self.hidden_layer_1(x)
        x = self.activation(x)
        x = self.dropout(x)
        
        y = self.output_layer(x)
        y = self.activation(y)
        y = self.softmax(y)
        
        return y

In [64]:
model = NeuralNetwork(x_train.shape[1], df['category'].nunique())
# model = NeuralNetwork(x_train.shape[1], 5)


# Define the loss
criterion = nn.NLLLoss()


# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.002)

#setting up scheduler
# scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 10)

In [59]:
model = nn.Sequential(nn.Linear(x_train.shape[1], 64),
                      nn.ReLU(),
                      nn.Dropout(0.1),
                      nn.Linear(64, df['category'].nunique()),
                      nn.LogSoftmax(dim=1))

# Define the loss
criterion = nn.NLLLoss()

# Forward pass, get our logits
logps = model(x_train)
# Calculate the loss with the logits and the labels
loss = criterion(logps, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.002)

TypeError: nll_loss_nd(): argument 'target' (position 2) must be Tensor, not Series

In [13]:
%%time
train_losses = []
test_losses = []
test_accuracies = []

epochs = 200
for e in range(epochs):
    optimizer.zero_grad()

    output = model.forward(x_train) #Forward pass, get the logits
    loss = criterion(output, y_train) # Calculate the loss with the logits and the labels
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()

    # Turn off gradients for validation, saves memory and computations
    with torch.no_grad():
        model.eval()
        log_ps = model.forward(x_test)
        test_loss = criterion(log_ps, y_test)
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class  = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
        test_accuracies.append(test_accuracy)

    model.train()

    print(f"Epoch: {e+1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")
#     scheduler.step(test_loss/len(y_test))

Epoch: 1/200..  Training Loss: 1.132..  Test Loss: 1.125..  Test Accuracy: 0.156
Epoch: 2/200..  Training Loss: 1.125..  Test Loss: 1.119..  Test Accuracy: 0.156
Epoch: 3/200..  Training Loss: 1.119..  Test Loss: 1.112..  Test Accuracy: 0.156
Epoch: 4/200..  Training Loss: 1.112..  Test Loss: 1.105..  Test Accuracy: 0.156
Epoch: 5/200..  Training Loss: 1.105..  Test Loss: 1.098..  Test Accuracy: 0.264
Epoch: 6/200..  Training Loss: 1.097..  Test Loss: 1.090..  Test Accuracy: 0.492
Epoch: 7/200..  Training Loss: 1.090..  Test Loss: 1.083..  Test Accuracy: 0.537
Epoch: 8/200..  Training Loss: 1.082..  Test Loss: 1.075..  Test Accuracy: 0.536
Epoch: 9/200..  Training Loss: 1.074..  Test Loss: 1.067..  Test Accuracy: 0.534
Epoch: 10/200..  Training Loss: 1.065..  Test Loss: 1.059..  Test Accuracy: 0.533
Epoch: 11/200..  Training Loss: 1.057..  Test Loss: 1.051..  Test Accuracy: 0.533
Epoch: 12/200..  Training Loss: 1.049..  Test Loss: 1.043..  Test Accuracy: 0.535
Epoch: 13/200..  Training

Epoch: 102/200..  Training Loss: 0.714..  Test Loss: 0.800..  Test Accuracy: 0.626
Epoch: 103/200..  Training Loss: 0.714..  Test Loss: 0.800..  Test Accuracy: 0.626
Epoch: 104/200..  Training Loss: 0.712..  Test Loss: 0.799..  Test Accuracy: 0.627
Epoch: 105/200..  Training Loss: 0.711..  Test Loss: 0.799..  Test Accuracy: 0.627
Epoch: 106/200..  Training Loss: 0.710..  Test Loss: 0.799..  Test Accuracy: 0.628
Epoch: 107/200..  Training Loss: 0.710..  Test Loss: 0.798..  Test Accuracy: 0.628
Epoch: 108/200..  Training Loss: 0.709..  Test Loss: 0.798..  Test Accuracy: 0.628
Epoch: 109/200..  Training Loss: 0.706..  Test Loss: 0.798..  Test Accuracy: 0.629
Epoch: 110/200..  Training Loss: 0.706..  Test Loss: 0.798..  Test Accuracy: 0.629
Epoch: 111/200..  Training Loss: 0.705..  Test Loss: 0.797..  Test Accuracy: 0.629
Epoch: 112/200..  Training Loss: 0.705..  Test Loss: 0.797..  Test Accuracy: 0.629
Epoch: 113/200..  Training Loss: 0.703..  Test Loss: 0.797..  Test Accuracy: 0.630
Epoc

In [14]:
preds = model.forward(x_test)
preds = topk_encoding(preds)
preds[1]

tensor([0])

In [15]:
acc_score = accuracy_score(y_test, preds)
acc_score

0.6362355309511827

In [16]:
f1_score = f1_score(y_test, preds, average='macro')
f1_score

0.5975424613807815

In [87]:
def input_vectorizer(message):
    """
    Function to predict the category of inputted message
    """
    
    vec = vectorizer.transform(pd.Series(message))
    vec = torch.tensor(scipy.sparse.csr_matrix.todense(vec)).float()
    preds = model_1.forward(vec)
    category = topk_encoding(preds).detach().cpu().numpy()
    
    return int(category[0])

In [90]:
a = input_vectorizer("do not be afraid to be saint be open")
a


2

In [89]:
a

2