In [None]:
# !pip install onnxruntime

In [None]:
from google.colab import drive
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy # install command: pip install spacy
import string
import nltk # install command: pip install nltk
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
import onnxruntime # install command: pip install onnxruntime
import torch.onnx as onnx

In [None]:
# !python3 -m spacy download en_core_web_sm

In [None]:
# nltk.download('wordnet')

In [None]:
# nltk.download('omw-1.4')

In [None]:
# Hyper-parameters shared by the cells below.
config = dict(
    max_features=1000,     # vocabulary size for the vectorizer and input width of the model
    num_epochs=200,        # number of passes over the training data
    learning_rate=0.1,     # step size handed to the SGD optimizer
    batch_size=64,         # examples per DataLoader batch
    train_percentage=90,   # percent of rows used for training; the rest is split into test/dev
)

# Dataset

Note that the dataset is taken from https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset






Exploration

In [None]:
# Load the tweets CSV (hard-coded absolute path; presumably the Colab
# workspace, given the google.colab import) and preview the first rows.
df = pd.read_csv('/content/Tweets.csv')
df.head()

In [None]:
# List the distinct values found in the sentiment column.
df['sentiment'].unique()

In [None]:
# Drop the rows labelled 'neutral'; the label encoding further down maps
# 'negative' to 0 and every other remaining value to 1.
df = df.loc[df['sentiment']!='neutral']

In [None]:
# Keep only the two columns used downstream: the tweet text and its sentiment.
df =df[['text','sentiment']]
df.head()

In [None]:
# Number of non-missing texts left after filtering.
df['text'].count()

# Preprocessing



# Lemmatization

In [None]:
# Shared WordNet lemmatizer instance; preprocess() below reads this name.
lemmatizer = WordNetLemmatizer()

In [None]:
# Quick demonstration of the lemmatizer on a single word.
lemmatizer.lemmatize("settings")

# Stopword Removal

In [None]:
# Load spaCy's small English pipeline and take its default stop-word
# collection; preprocess() below reads `stopwords`.
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

In [None]:
# Inspect which container type holds the stop words.
type(stopwords)

In [None]:
# How many stop words the collection contains.
len(stopwords)

In [None]:
# Peek at ten of the stop words.
list(stopwords)[:10]

# Preprocessing Pipeline

In [None]:
def preprocess(txt):
    """Normalise a raw tweet into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, strip ASCII punctuation, strip digits, split
    on whitespace, lemmatise each token with the notebook-level
    ``lemmatizer``, drop tokens found in the notebook-level ``stopwords``
    collection, and re-join the survivors with single spaces.

    Args:
        txt: The raw text as a ``str``.

    Returns:
        The cleaned text as one string; empty when no token survives.
    """
    # Lowercase first so stop-word matching and the vectorizer vocabulary
    # are case-insensitive.
    txt = txt.lower()
    # Punctuation and digits are removed before tokenising so the lemmatizer
    # sees bare words and purely numeric tokens disappear on split().
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    txt = re.sub(r'[0-9]+', '', txt)
    tokens = [lemmatizer.lemmatize(token) for token in txt.split()]
    tokens = [token for token in tokens if token not in stopwords]
    # Join with a space: joining with '' would fuse the tweet into a single
    # token and leave the vectorizer nothing meaningful to count.
    return ' '.join(tokens)

In [None]:
# Renumber the rows 0..n-1 after filtering so label-based lookups such as
# df['text'][50] work. drop=True discards the old index instead of inserting
# it as an extra 'index' column, which also keeps this cell safe to re-run.
df = df.reset_index(drop=True)

In [None]:
# Show one tweet before and after preprocessing.
sample_idx = 50
original_txt = df['text'][sample_idx]
processed_txt = preprocess(original_txt)
print(f'The original text was:\n{original_txt}\n The preprocessed txt is :\n{processed_txt} ')

In [None]:
# Apply the preprocessing pipeline to every tweet. str(x) guards against
# non-string entries (a missing value would become the text 'nan').
df['preprocessed_text'] = df['text'].apply(lambda x: preprocess(str(x)))
df.head()

In [None]:
# Plain Python list of the cleaned texts, in row order, for the vectorizer.
texts = df['preprocessed_text'].tolist()

In [None]:
# Alternative raw-count features, kept for reference:
# vectorizer = CountVectorizer(max_features=config['max_features'])
# features = vectorizer.fit_transform(texts)
# TF-IDF features, with the vocabulary capped at config['max_features'] terms.
# NOTE(review): the vectorizer is fitted on all texts before the
# train/test/dev split, so held-out rows influence the vocabulary and the
# IDF weights — consider fitting on the training portion only.
vectorizer =TfidfVectorizer(max_features=config['max_features'])
features = vectorizer.fit_transform(texts)

In [None]:
# Inspect the terms that make up the learned vocabulary.
vectorizer.get_feature_names_out()

In [None]:
# Convert the vectorizer output to a dense NumPy array (one row per tweet).
np_features= features.toarray()

In [None]:
# (number of tweets, number of features)
np_features.shape

The labels must be numeric as well: we encode `negative` as 0 and every other remaining sentiment (here, `positive`) as 1.

In [None]:
# Numeric target: 0 for 'negative', 1 for every other sentiment value.
is_negative = df['sentiment'] == 'negative'
df['num_sentiment'] = np.where(is_negative, 0, 1)
df.head()


In [None]:
# Plain Python list of the numeric labels, aligned row-for-row with np_features.
labels = df['num_sentiment'].tolist()

In [None]:
# Peek at the first five labels.
labels[:5]

# Train / Test / Dev Split

In [None]:
# First carve off the training portion, then halve what remains into the
# test and dev sets. Both splits use the same fixed seed.
holdout_fraction = 1 - config['train_percentage'] / 100
f_train, f_rem, l_train, l_rem = train_test_split(
    np_features,
    labels,
    test_size=holdout_fraction,
    random_state=50,
)
f_test, f_dev, l_test, l_dev = train_test_split(
    f_rem,
    l_rem,
    test_size=0.5,
    random_state=50,
)

In [None]:
# Report the shape of each feature split.
print(f'train features: {f_train.shape}, dev features: {f_dev.shape}, test features :{f_test.shape}')

In [None]:
# Report how many labels each split holds; these should match the row counts above.
print(f'train labels: {len(l_train)}, dev labels: {len(l_dev)}, test labels: {len(l_test)}')

# Converting Everything to Tensors

The NumPy arrays we defined above are wrapped in a `Dataset` object, which converts each requested item (features and label) to a tensor.

In [None]:
class MyVectorDataset(Dataset):
    """Pairs a feature array with one label per row for use with a DataLoader.

    Items are converted to tensors on access, so both integer and slice
    indices are accepted (whatever the underlying arrays support).
    """

    def __init__(self, features, labels):
        # Features are stored as given; labels become a column vector so
        # that indexing one row yields a length-1 array.
        label_column = np.array(labels).reshape(-1, 1)
        self.features = features
        self.labels = label_column

    def __len__(self):
        # One item per feature row.
        row_count = self.features.shape[0]
        return row_count

    def __getitem__(self, idx):
        # Build fresh tensors for the requested row(s).
        feature_tensor = torch.Tensor(self.features[idx])
        label_tensor = torch.Tensor(self.labels[idx])
        return feature_tensor, label_tensor
    
          
        

In [None]:
# Wrap each split's features and labels in a Dataset.
train_dataset=MyVectorDataset(f_train, l_train)
test_dataset=MyVectorDataset(f_test,l_test)
dev_dataset=MyVectorDataset(f_dev, l_dev)

In [None]:
# Training batches are reshuffled every epoch. The evaluation loaders keep a
# fixed order: shuffling does not change accuracy or loss, and a stable
# order makes evaluation runs reproducible.
train_dataloader= DataLoader(train_dataset,batch_size=config['batch_size'],shuffle=True)
test_dataloader= DataLoader(test_dataset,batch_size=config['batch_size'],shuffle=False)
dev_dataloader= DataLoader(dev_dataset,batch_size=config['batch_size'],shuffle=False)


# Neural Net Architecture

In [None]:
# NOTE(review): both branches of this conditional evaluate to 'cpu', so the
# model is always placed on the CPU even when CUDA is available. Changing the
# first branch to 'cuda' would also require moving every batch to that device
# before calling the model, which the cells in this notebook do not do.
device = 'cpu' if torch.cuda.is_available() else 'cpu'
device

In [None]:
class my_neural_net(torch.nn.Module):
    """Single linear layer followed by a sigmoid (logistic-regression style).

    Args:
        in_features: Width of the input vectors. When omitted, the value of
            ``config['max_features']`` at construction time is used, so
            ``my_neural_net()`` keeps working as before.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            in_features = config['max_features']
        # The attribute name and the Sequential layout determine the
        # state_dict keys, so they are kept stable for saved weights.
        self.first_layer = torch.nn.Sequential(
            nn.Linear(in_features, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return one sigmoid output per input row."""
        return self.first_layer(x)

In [None]:
# Instantiate the (untrained) model.
simple_nn = my_neural_net()

In [None]:
# Place the model's parameters on the selected device.
simple_nn = simple_nn.to(device)

In [None]:
# Sanity check: run the untrained model on the first two training examples.
simple_nn(train_dataset[:2][0])

In [None]:
# The output should hold one value per input row.
simple_nn(train_dataset[:2][0]).shape

# Training


# Binary Cross-Entropy

In [None]:
# Binary cross-entropy on probabilities; the model already ends in a sigmoid.
loss_fn = nn.BCELoss()

# Optimizer
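Note that pure stochastic gradient descent performs a parameter update for each training example $x_i$ and its label $y_i$. With the `DataLoader` used here, `torch.optim.SGD` performs one update per mini-batch of `config['batch_size']` examples.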



In [None]:
# Plain SGD over the model's parameters with the configured learning rate.
optimizer = torch.optim.SGD(simple_nn.parameters(),lr=config['learning_rate'])

In [None]:
def output_to_label(out):
    """Map a single model output to whichever of the labels 0 and 1 is nearer.

    A value equally far from both labels is assigned 0.
    """
    gap_from_zero, gap_from_one = abs(out), abs(out - 1)
    return 0 if gap_from_zero <= gap_from_one else 1

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, epoch_num):
    """Run one training epoch over ``dataloader``.

    Args:
        dataloader: Yields ``(features, labels)`` batches.
        model: The ``nn.Module`` being trained; it is put into training mode.
        loss_fn: Callable returning a scalar loss tensor for ``(pred, labels)``.
        optimizer: Optimizer bound to the model's parameters.
        epoch_num: Epoch number, used only in the progress message.

    Returns:
        None. Progress is written to stdout every 100 batches, on a single
        line that is overwritten in place.
    """
    num_points = len(dataloader.dataset)
    # Make sure mode-dependent layers behave as in training, whatever mode an
    # earlier evaluation pass left the model in.
    model.train()
    for batch, (features, labels) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(features)
        loss = loss_fn(pred, labels)

        # Backpropagation
        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()        # compute gradients of the loss w.r.t. the parameters
        optimizer.step()       # w = w - learning_rate * grad

        if batch % 100 == 0:
            # Approximate number of examples seen so far in this epoch.
            current = batch * len(features)
            print(f"\r Epoch{epoch_num}_loss:{loss.item():7f}[{current:>5d}/{num_points:>5d}]",end="")

def test_loop(dataloader, model, loss_fn, epoch_num, name):
    num_points = len(dataloader.dataset)
    sum_test_loss, correct = 0, 0
    with torch.no_grad():
        for batch, (features, labels) in enumerate(dataloader):
            pred = model(features)
            sum_test_loss += loss_fn(pred, labels).item() # add the current loss to the sum of the losses
            # convert the outputs of the model on the current batch to a numpy array
            pred_lst = list(pred.numpy().squeeze())
            pred_lst = [output_to_label(item) for item in pred_lst]
            # convert the original labels corresponding to the current batch to a numpy array
            output_lst = list(labels.numpy().squeeze())
            match_lst = [1 if p==o else 0 for (p, o) in zip(pred_lst, output_lst)]
            # many points are labeled correctly in this batch and add the number to the overall count of the correct labeled points
            correct += sum(match_lst)
    sum_test_loss /= num_points
    correct /= num_points   
    print(f"/r Epoch {epoch_num} - {name} Error: Accuracy:{(100*correct):>0.1f}%, Avg loss: {sum_test_loss:>8f}",end="")


In [None]:
# Train for the configured number of epochs (numbered from 1), evaluating on
# the development set after every epoch.
for epoch_num in range(1, config['num_epochs']+1):
    train_loop(train_dataloader, simple_nn, loss_fn, optimizer, epoch_num)
    test_loop(dev_dataloader, simple_nn, loss_fn, epoch_num, 'Development/Validation')

# saving the model 

In [None]:
# Persist only the learned parameters (the state_dict), not the whole module.
# The file name must match the one used when loading below.
torch.save(simple_nn.state_dict(),"neural_net.path")

# Load the model

In [None]:
# Rebuild the architecture, then load the saved parameters into it.
model = my_neural_net()
model.load_state_dict(torch.load("neural_net.path"))
# Switch to evaluation mode for inference.
model.eval()

In [None]:
# Outputs of the reloaded model for the first two test examples.
model(test_dataset[:2][0])

In [None]:
# True labels of the same two test examples, for comparison.
l_test[:2]

# The ONNX Format

This format is useful when you want to run your trained model from other languages and runtimes, such as Java, JavaScript, or C#.

# Save the Model

In [None]:
# All-zero example input of shape (1, max_features), handed to the exporter
# as a sample input for the model.
dummy_input = torch.zeros((1,config['max_features']))

In [None]:
# Export the reloaded model to an ONNX file in the working directory.
onnx.export(model,dummy_input,'neural_net.onnx')

# Inference

In [None]:
# Open the exported model with ONNX Runtime. The execution provider is named
# explicitly because onnxruntime 1.9+ raises an error when `providers` is
# omitted on builds that ship more than the CPU provider (e.g. GPU builds).
session = onnxruntime.InferenceSession('neural_net.onnx', providers=['CPUExecutionProvider'])

In [None]:
# Look up the names of the graph's first input and first output; run()
# addresses tensors by these names.
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name

In [None]:
# Display the input tensor's name.
input_name

In [None]:
# Display the output tensor's name.
output_name

In [None]:
# Run the ONNX model on the first test example, reshaped into a one-row batch.
result = session.run([output_name], {input_name: test_dataset[0][0].numpy().reshape(1,-1)})

In [None]:
# Display the outputs returned by the session (one entry per requested output).
result