In [21]:
import re
import string
import torch
from torch import nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from torch import optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import joblib
from joblib import load
# from nltk.stem import WordNetLemmatizer
# import nltk

In [2]:
device = "cpu"

In [3]:
df = pd.read_csv('twitter_cleaned.csv')
df = df.dropna()

### Data Cleanup

In [4]:
def to_lower(message):
    result = message.lower()
    return result

def remove_num(message):
    result = re.sub(r'\d+','',message)
    return result

def contractions(message):
     result = re.sub(r"won't", "will not",message)
     result = re.sub(r"would't", "would not",message)
     result = re.sub(r"could't", "could not",message)
     result = re.sub(r"\'d", " would",message)
     result = re.sub(r"can\'t", "can not",message)
     result = re.sub(r"n\'t", " not", message)
     result = re.sub(r"\'re", " are", message)
     result = re.sub(r"\'s", " is", message)
     result = re.sub(r"\'ll", " will", message)
     result = re.sub(r"\'t", " not", message)
     result = re.sub(r"\'ve", " have", message)
     result = re.sub(r"\'m", " am", message)
     return result
    
def remove_punctuation(message):
    result = message.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result

def remove_whitespace(message):
    result = message.strip()
    result = re.sub(' +',' ',message)
    return result

def replace_newline(message):
    result = message.replace('\n','')
    return result

def data_cleanup(message):
    cleaning_utils = [to_lower, remove_num, contractions, remove_punctuation, remove_whitespace, replace_newline]
    for util in cleaning_utils:
        message = util(message)
    return message


In [5]:
df.head()

Unnamed: 0,label,message,category
0,neutral,prenatal move to wednesday at pm starting toni...,1
1,positive,happy nd birthday to prince george i cant beli...,2
2,positive,do not be afraid to be saint be open to the lo...,2
3,neutral,dst is saturday nightsunday morning got ta be ...,1
4,negative,sony reward app is like a lot of yo female sin...,0


In [6]:
x_train, x_test, y_train, y_test = train_test_split(
    df['message'], df['category'], test_size=.2, stratify=df['label'])

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(39740,) (9935,) (39740,) (9935,)


### Building Model

In [7]:
vectorizer = TfidfVectorizer(max_features=2000)
# vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')

# Learn vocabulary from training texts and vectorize training texts.
x_train = vectorizer.fit_transform(x_train)

# Vectorize test texts.
x_test = vectorizer.transform(x_test)

In [8]:
x_train = torch.tensor(scipy.sparse.csr_matrix.todense(x_train)).float()
x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()

In [9]:
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

In [10]:
def topk_encoding(nd_array):
    """
    Function to flatten the predicted category
    """
    
    predictions = nd_array
    
    ps = torch.exp(predictions)
    top_p, top_class  = ps.topk(1, dim=1)
    

    return top_class

In [11]:
class NeuralNetwork(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        
        self.hidden_layer_1 = nn.Linear(x_train.shape[1], 64) # input to first hidden layer
        self.output_layer = nn.Linear(64, self.out_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x):
        x = self.hidden_layer_1(x)
        x = self.activation(x)
        x = self.dropout(x)
        
        y = self.output_layer(x)
        y = self.activation(y)
        y = self.softmax(y)
        
        return y

In [12]:
model = NeuralNetwork(x_train.shape[1], df['category'].nunique())
# model = NeuralNetwork(x_train.shape[1], 5)


# Define the loss
criterion = nn.NLLLoss()


# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.002)

#setting up scheduler
# scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 10)

### Model Fine-Tuning

In [13]:
%%time
train_losses = []
test_losses = []
test_accuracies = []

epochs = 200
for e in range(epochs):
    optimizer.zero_grad()

    output = model.forward(x_train) #Forward pass, get the logits
    loss = criterion(output, y_train) # Calculate the loss with the logits and the labels
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()

    # Turn off gradients for validation, saves memory and computations
    with torch.no_grad():
        model.eval()
        log_ps = model.forward(x_test)
        test_loss = criterion(log_ps, y_test)
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class  = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
        test_accuracies.append(test_accuracy)

    model.train()

    print(f"Epoch: {e+1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")
#     scheduler.step(test_loss/len(y_test))

Epoch: 1/200..  Training Loss: 1.096..  Test Loss: 1.092..  Test Accuracy: 0.432
Epoch: 2/200..  Training Loss: 1.091..  Test Loss: 1.088..  Test Accuracy: 0.537
Epoch: 3/200..  Training Loss: 1.087..  Test Loss: 1.084..  Test Accuracy: 0.567
Epoch: 4/200..  Training Loss: 1.083..  Test Loss: 1.079..  Test Accuracy: 0.567
Epoch: 5/200..  Training Loss: 1.079..  Test Loss: 1.075..  Test Accuracy: 0.567
Epoch: 6/200..  Training Loss: 1.074..  Test Loss: 1.071..  Test Accuracy: 0.567
Epoch: 7/200..  Training Loss: 1.070..  Test Loss: 1.066..  Test Accuracy: 0.566
Epoch: 8/200..  Training Loss: 1.065..  Test Loss: 1.061..  Test Accuracy: 0.569
Epoch: 9/200..  Training Loss: 1.060..  Test Loss: 1.056..  Test Accuracy: 0.572
Epoch: 10/200..  Training Loss: 1.054..  Test Loss: 1.051..  Test Accuracy: 0.572
Epoch: 11/200..  Training Loss: 1.049..  Test Loss: 1.046..  Test Accuracy: 0.573
Epoch: 12/200..  Training Loss: 1.044..  Test Loss: 1.041..  Test Accuracy: 0.575
Epoch: 13/200..  Training

Epoch: 102/200..  Training Loss: 0.765..  Test Loss: 0.826..  Test Accuracy: 0.618
Epoch: 103/200..  Training Loss: 0.764..  Test Loss: 0.825..  Test Accuracy: 0.618
Epoch: 104/200..  Training Loss: 0.763..  Test Loss: 0.825..  Test Accuracy: 0.618
Epoch: 105/200..  Training Loss: 0.762..  Test Loss: 0.824..  Test Accuracy: 0.620
Epoch: 106/200..  Training Loss: 0.762..  Test Loss: 0.824..  Test Accuracy: 0.620
Epoch: 107/200..  Training Loss: 0.761..  Test Loss: 0.824..  Test Accuracy: 0.620
Epoch: 108/200..  Training Loss: 0.760..  Test Loss: 0.823..  Test Accuracy: 0.620
Epoch: 109/200..  Training Loss: 0.759..  Test Loss: 0.823..  Test Accuracy: 0.620
Epoch: 110/200..  Training Loss: 0.758..  Test Loss: 0.822..  Test Accuracy: 0.619
Epoch: 111/200..  Training Loss: 0.757..  Test Loss: 0.822..  Test Accuracy: 0.620
Epoch: 112/200..  Training Loss: 0.757..  Test Loss: 0.822..  Test Accuracy: 0.621
Epoch: 113/200..  Training Loss: 0.755..  Test Loss: 0.821..  Test Accuracy: 0.621
Epoc

In [14]:
preds = model.forward(x_test)
preds = topk_encoding(preds)
preds[1]

tensor([1])

In [15]:
acc_score = accuracy_score(y_test, preds)
acc_score

0.6331152491192753

In [16]:
f1_score = f1_score(y_test, preds, average='macro')
f1_score

0.5659331873405122

In [17]:
with open("model/tfidf_vectorizer.joblib", "wb+") as filename:
        joblib.dump(vectorizer, filename)

In [24]:
with open("model/nn_model.joblib", "wb+") as filename:
        joblib.dump(model, filename)

### ML Pipeline

In [18]:
def input_vectorizer(message):
    """
    Function to predict the category of inputted message
    """
    
    cleaned_message = pd.Series(message).apply(lambda x: data_cleanup(x))
    vec = vectorizer.transform(pd.Series(cleaned_message))
    vec = torch.tensor(scipy.sparse.csr_matrix.todense(vec)).float()
    preds = model.forward(vec)
    category = topk_encoding(preds).detach().cpu().numpy()
    
    return int(category[0])

In [19]:
a = input_vectorizer("fly to the sky to go back after a year on september.")
a


1

In [20]:
b = input_vectorizer("the item is not good, waste of money")
b


0