In [1]:
import re
import string
import torch
from torch import nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from torch import optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import joblib
from joblib import load
from sklearn.metrics import classification_report

# from nltk.stem import WordNetLemmatizer
# import nltk

In [2]:
device = "cpu"

In [5]:
df = pd.read_csv('twitter_class.csv')

In [6]:
df = df.sample(frac=1).reset_index() #shuffling data in pandas

### Data Cleanup

In [7]:
def to_lower(message):
    result = message.lower()
    return result

def remove_num(message):
    result = re.sub(r'\d+','',message)
    return result

def contractions(message):
     result = re.sub(r"won't", "will not",message)
     result = re.sub(r"would't", "would not",message)
     result = re.sub(r"could't", "could not",message)
     result = re.sub(r"\'d", " would",message)
     result = re.sub(r"can\'t", "can not",message)
     result = re.sub(r"n\'t", " not", message)
     result = re.sub(r"\'re", " are", message)
     result = re.sub(r"\'s", " is", message)
     result = re.sub(r"\'ll", " will", message)
     result = re.sub(r"\'t", " not", message)
     result = re.sub(r"\'ve", " have", message)
     result = re.sub(r"\'m", " am", message)
     return result
    
def remove_punctuation(message):
    result = message.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result

def remove_whitespace(message):
    result = message.strip()
    result = re.sub(' +',' ',message)
    return result

def replace_newline(message):
    result = message.replace('\n','')
    return result

def data_cleanup(message):
    cleaning_utils = [to_lower, remove_num, contractions, remove_punctuation, remove_whitespace, replace_newline]
    for util in cleaning_utils:
        message = util(message)
    return message


In [8]:
df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,category
0,29085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,12004,0.001063,0.000697,0.002682,0.029541,0.002964,0.000161,0.001822,0.000491,2.9e-05,...,0.000375,0.000773,0.000777,0.001049,0.000132,0.002686,0.000204,0.000206,0.000176,0
2,40755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,1681,0.000381,0.000198,0.000326,0.014041,0.000882,9.5e-05,0.002483,0.000146,4e-06,...,4.9e-05,0.000235,0.000113,0.001436,0.000552,0.001759,0.000207,0.000148,3.7e-05,0
4,51023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [56]:
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:,1:-1], df['category'], test_size=.2, stratify=df['category'], random_state=42)

In [13]:
a = pd.DataFrame(y_test)
a.groupby(['category']).size()

category
0    3000
1    4451
2    3935
dtype: int64

### Building  Neural Network Model

--no need to feed the x_train and x_test to tfidf vectorizer since it is already converted in separate notebook. Refer to NN_model with AutoEncoder.ipynb


--proceed to convert data to tensors

##### Train Datasets:

0   ->  12000

1   ->  17804

2   ->  15737


##### Test Datasets

0   ->  3000

1   ->  4451

2   ->  3935

In [14]:
x_train = torch.tensor(x_train.values).float()
x_test = torch.tensor(x_test.values).float()

In [15]:
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

In [16]:
def topk_encoding(nd_array):
    """
    Function to flatten the predicted category
    """
    
    predictions = nd_array
    
    ps = torch.exp(predictions)
    top_p, top_class  = ps.topk(1, dim=1)
    

    return top_class

In [17]:
class NeuralNetwork(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        
        self.hidden_layer_1 = nn.Linear(x_train.shape[1], 64) # input to first hidden layer
        self.output_layer = nn.Linear(64, self.out_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x):
        x = self.hidden_layer_1(x)
        x = self.activation(x)
        x = self.dropout(x)
        
        y = self.output_layer(x)
        y = self.activation(y)
        y = self.softmax(y)
        
        return y

In [24]:
model = NeuralNetwork(x_train.shape[1], df['category'].nunique())
# model = NeuralNetwork(x_train.shape[1], 5)


# Define the loss
criterion = nn.NLLLoss()


# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.002)

#setting up scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 10)

### Model Fine-Tuning

In [26]:
%%time
train_losses = []
test_losses = []
test_accuracies = []

epochs = 400
for e in range(epochs):
    optimizer.zero_grad()

    output = model.forward(x_train) #Forward pass, get the logits
    loss = criterion(output, y_train) # Calculate the loss with the logits and the labels
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()

    # Turn off gradients for validation, saves memory and computations
    with torch.no_grad():
        model.eval()
        log_ps = model.forward(x_test)
        test_loss = criterion(log_ps, y_test)
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class  = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
        test_accuracies.append(test_accuracy)

    model.train()

    print(f"Epoch: {e+1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")
    scheduler.step(test_loss/len(y_test))

Epoch: 1/100..  Training Loss: 0.350..  Test Loss: 0.474..  Test Accuracy: 0.789
Epoch: 2/100..  Training Loss: 0.349..  Test Loss: 0.473..  Test Accuracy: 0.789
Epoch: 3/100..  Training Loss: 0.347..  Test Loss: 0.473..  Test Accuracy: 0.789
Epoch: 4/100..  Training Loss: 0.347..  Test Loss: 0.473..  Test Accuracy: 0.789
Epoch: 5/100..  Training Loss: 0.346..  Test Loss: 0.473..  Test Accuracy: 0.789
Epoch: 6/100..  Training Loss: 0.346..  Test Loss: 0.472..  Test Accuracy: 0.788
Epoch: 7/100..  Training Loss: 0.345..  Test Loss: 0.472..  Test Accuracy: 0.789
Epoch: 8/100..  Training Loss: 0.343..  Test Loss: 0.472..  Test Accuracy: 0.789
Epoch: 9/100..  Training Loss: 0.344..  Test Loss: 0.472..  Test Accuracy: 0.789
Epoch: 10/100..  Training Loss: 0.343..  Test Loss: 0.471..  Test Accuracy: 0.789
Epoch: 11/100..  Training Loss: 0.342..  Test Loss: 0.471..  Test Accuracy: 0.789
Epoch: 12/100..  Training Loss: 0.341..  Test Loss: 0.471..  Test Accuracy: 0.789
Epoch: 13/100..  Training

In [27]:
with open("nn_wauto.joblib", "wb+") as filename:
        joblib.dump(model, filename)

In [28]:
preds = model.forward(x_test)
preds = topk_encoding(preds)
preds[1]

tensor([0])

In [29]:
acc_score = accuracy_score(y_test, preds)
acc_score

0.7885122079747058

In [30]:
f1_score = f1_score(y_test, preds, average='macro')
f1_score

0.8018723520537367

In [31]:
report_str = classification_report(y_test, preds)
print("-- Classification Report --")
print(report_str)

-- Classification Report --
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3000
           1       0.73      0.75      0.74      4451
           2       0.73      0.67      0.70      3935

    accuracy                           0.79     11386
   macro avg       0.80      0.81      0.80     11386
weighted avg       0.78      0.79      0.79     11386



### Simple ML Pipeline

In [34]:
vectorizer = load("tfidf_vectorizer.joblib")

In [35]:
def input_vectorizer(message):
    """
    Function to predict the category of inputted message
    """
    
    cleaned_message = pd.Series(message).apply(lambda x: data_cleanup(x))
    vec = vectorizer.transform(pd.Series(cleaned_message))
    vec = torch.tensor(scipy.sparse.csr_matrix.todense(vec)).float()
    preds = model.forward(vec)
    category = topk_encoding(preds).detach().cpu().numpy()
    
    return int(category[0])

In [55]:
c = input_vectorizer("this product is trash")
c

0

In [43]:
a = input_vectorizer("prenatal move to wednesday at 8 pm starting tonight..")
a


1

In [54]:
b = input_vectorizer("nice video")
b


2