<a href="https://colab.research.google.com/github/atharvnaidu/SentenceClassification/blob/main/SentenceIdentification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Plan: https://docs.google.com/document/d/1rsEPLStHeIVPvGmEcyN-JpXZD1tlB29VzIe7Syoi5Jk/edit

In [None]:
!pip install --upgrade torchtext

import torch, torchtext
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchtext.functional import to_tensor
import os
import pandas as pd
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator





In [None]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [None]:
class CustomSentenceDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ##retrieve sentence and label
        sentence = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]
        return sentence, label # Return the sentence and label as a tuple

In [None]:
def transformSentence(sentence, maxLength):
  tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
  tokens = tokenizer(sentence)
  vocab = build_vocab_from_iterator([tokens])
  ids = [float(vocab[token]) for token in tokens]
  padded_ids = torch.nn.functional.pad(torch.tensor(ids), (0, maxLength - len(ids)))
  return padded_ids

In [None]:
##transform data
df = pd.read_csv('/content/questions_vs_statements_v1.0.csv', on_bad_lines='skip') # Skip lines with errors
df = df.iloc[:300]

max_length = df['doc'].apply(lambda x: len(x)).max()
df['Sentence'] = df['doc'].apply(lambda x: transformSentence(x, max_length))
df = df.drop(['doc', 'label'], axis=1)
df = df[['Sentence', 'target']]

max_length = df['Sentence'].apply(lambda x: len(x)).max()

print(df)
print(max_length)


                                              Sentence  target
0    [tensor(1.), tensor(2.), tensor(6.), tensor(7....       0
1    [tensor(10.), tensor(0.), tensor(9.), tensor(6...       0
2    [tensor(0.), tensor(3.), tensor(1.), tensor(12...       1
3    [tensor(8.), tensor(5.), tensor(1.), tensor(7....       1
4    [tensor(14.), tensor(9.), tensor(7.), tensor(5...       1
..                                                 ...     ...
295  [tensor(21.), tensor(24.), tensor(3.), tensor(...       0
296  [tensor(1.), tensor(2.), tensor(0.), tensor(9....       1
297  [tensor(4.), tensor(20.), tensor(11.), tensor(...       0
298  [tensor(7.), tensor(8.), tensor(0.), tensor(6....       1
299  [tensor(9.), tensor(2.), tensor(4.), tensor(0....       1

[300 rows x 2 columns]
1570


In [None]:
training_data = CustomSentenceDataset(df)
test_data = CustomSentenceDataset(df)
train_dataloader = DataLoader(training_data, batch_size=20, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=20, shuffle=True)

In [None]:
train_features, train_labels = next(iter(train_dataloader))
print(train_features)
print(train_labels)
print(train_features.shape)
print(train_labels.shape)

test_features, test_labels = next(iter(test_dataloader))
print(test_features)
print(test_labels)
print(test_features.shape)
print(test_labels.shape)


tensor([[ 3.,  6.,  0.,  ...,  0.,  0.,  0.],
        [25.,  3.,  0.,  ...,  0.,  0.,  0.],
        [ 1.,  3.,  4.,  ...,  0.,  0.,  0.],
        ...,
        [ 8.,  4.,  0.,  ...,  0.,  0.,  0.],
        [ 3.,  0.,  5.,  ...,  0.,  0.,  0.],
        [ 8.,  4.,  5.,  ...,  0.,  0.,  0.]])
tensor([0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0])
torch.Size([20, 1570])
torch.Size([20])
tensor([[11.,  1., 13.,  ...,  0.,  0.,  0.],
        [ 4.,  3., 18.,  ...,  0.,  0.,  0.],
        [11.,  8.,  4.,  ...,  0.,  0.,  0.],
        ...,
        [ 2., 17., 25.,  ...,  0.,  0.,  0.],
        [ 8.,  3., 11.,  ...,  0.,  0.,  0.],
        [ 6.,  3.,  7.,  ...,  0.,  0.,  0.]])
tensor([1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0])
torch.Size([20, 1570])
torch.Size([20])


# Make Neural Network



In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # Flatten starting from dimension 1 to handle batches correctly
        self.flatten = nn.Flatten(start_dim=1)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(max_length, max_length//2),
            nn.ReLU(),
            nn.Linear(max_length//2, max_length//2),
            nn.ReLU(),
            nn.Linear(max_length//2, 2)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# Train Neural Network

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        print(X)
        print(y)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
model = NeuralNetwork()
batch_size = 20
loss_fn = nn.CrossEntropyLoss()
learning_rate = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

epochs = 300
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [ 0.,  3., 11.,  ...,  0.,  0.,  0.]])
tensor([1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0])
tensor([[13., 11.,  0.,  ...,  0.,  0.,  0.],
        [ 4., 10., 12.,  ...,  0.,  0.,  0.],
        [ 2.,  0., 47.,  ...,  0.,  0.,  0.],
        ...,
        [24.,  2., 21.,  ...,  0.,  0.,  0.],
        [ 1.,  2.,  6.,  ...,  0.,  0.,  0.],
        [21., 16.,  8.,  ...,  0.,  0.,  0.]])
tensor([1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0])
tensor([[16., 15., 19.,  ...,  0.,  0.,  0.],
        [14., 13.,  2.,  ...,  0.,  0.,  0.],
        [22.,  3.,  0.,  ...,  0.,  0.,  0.],
        ...,
        [16., 17.,  1.,  ...,  0.,  0.,  0.],
        [12.,  1., 17.,  ...,  0.,  0.,  0.],
        [ 8.,  4.,  5.,  ...,  0.,  0.,  0.]])
tensor([0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
tensor([[ 9.,  0.,  5.,  ...,  0.,  0.,  0.],
        [ 0.,  4.,  1.,  ...,  0.,  0.,  0.],
        [

In [None]:
import torchvision.models as models

In [None]:
model = models.vgg16(weights='IMAGENET1K_V1')
torch.save(model.state_dict(), 'model_weights.pth')

In [None]:
model = models.vgg16() # we do not specify ``weights``, i.e. create untrained model
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [None]:
torch.save(model, 'model.pth')

In [None]:
model = torch.load('model.pth')

In [None]:
model = NeuralNetwork().to(device)
X = train_features[9].unsqueeze(0) # Add a dimension to X
print(X)
print(X.shape)
logits = model(X)
print(logits)
pred_probab = nn.Softmax(dim=0)(logits)
y_pred = pred_probab.argmax(0)
print(f"Predicted class: {y_pred}")

tensor([[1., 0., 5.,  ..., 0., 0., 0.]])
torch.Size([1, 1570])
tensor([[ 0.0051, -0.0230]], grad_fn=<AddmmBackward0>)
Predicted class: tensor([0, 0])
