In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import gensim.downloader as api # type: ignore
 
import os 
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

fasttext = api.load('glove-twitter-25')
fasttext.most_similar(positive=['king', 'woman'], negative=['man'])

[('meets', 0.8841923475265503),
 ('prince', 0.832163393497467),
 ('queen', 0.8257461190223694),
 ('’s', 0.817409873008728),
 ('crow', 0.813499391078949),
 ('hunter', 0.8131037950515747),
 ('father', 0.8115834593772888),
 ('soldier', 0.81113600730896),
 ('mercy', 0.8082393407821655),
 ('hero', 0.8082263469696045)]

In [2]:
import re
from collections import Counter

class Vocab:
    def __init__(self, texts: list[str], min_freq: int = 10):
        self.min_freq = min_freq
        text = ' '.join(texts)
        text = self._remove_links(text)
        text = self._remove_special_chars(text)
        text = self._remove_multiple_spaces(text)
        self.vocab = self._filter_words(text.strip().lower().split())
        self.vocab.append('<unk>')

        self._word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self._idx2word = {idx: word for idx, word in enumerate(self.vocab)}
    
    def _remove_links(self, text):
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    def _remove_special_chars(self, text):
        return re.sub(r'[^a-zA-Z\s]', '', text)
    
    def _remove_multiple_spaces(self, text):
        return re.sub(r'\s+', ' ', text)
    
    def _filter_words(self, words) -> list[str]:
        counter = Counter(words)
        return list({word for word in words if counter[word] > self.min_freq})
    
    def get_vocab(self): return self.vocab

    def idx2word(self, idx):
        if idx not in self.idx2word: return '<unk>'
        return self._idx2word[idx]

    def word2idx(self , word):
        word = word.lower()
        if word not in self._word2idx: return self._word2idx['<unk>']
        return self._word2idx[word]

    def encode(self, text):
        return [self.word2idx(word) for word in text.split()]
    
    def make_vectors(self, fasttext):
        return np.stack([fasttext[word] if fasttext.has_index_for(word) \
                else np.zeros(fasttext.vector_size) \
                for word in self.vocab])

In [3]:
from torch.utils.data import DataLoader, Dataset
import kaggle

class IMDB(Dataset):
    def __init__(self, path):
        self.df = pd.read_csv(path)
        texts = self.df['review'].values
        labels = self.df['sentiment'].values
        self.vocab = Vocab(texts)
        self.labels2int = {'positive': 1, 'negative': 0}

    def __getitem__(self, idx):

        text = self.df['review'].loc[idx]
        label = self.labels2int[self.df['sentiment'].loc[idx]]
        text = torch.LongTensor(self.vocab.encode(text))
        label = torch.FloatTensor([label])

        return text, label
    def __len__(self):
        return len(self.df)
    
if 'IMDB Dataset.csv' not in os.listdir():
    kaggle.api.dataset_download_files('lakshmi25npathi/imdb-dataset-of-50k-movie-reviews', path='.', unzip=True)
    
dataset = IMDB('IMDB Dataset.csv')
#dataset[0]

In [4]:
from torch.utils.data import random_split
from torch.nn.utils.rnn import pad_sequence

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_set, test_set = random_split(dataset, [train_size, test_size])

pad_index = len(dataset.vocab.vocab)
def collate_fn(batch):
    texts = pad_sequence([b[0] for b in batch], padding_value=pad_index, batch_first=True)
    labels = torch.stack([b[1] for b in batch])
    return texts, labels

In [5]:
train_loader = DataLoader(train_set, batch_size=16, collate_fn=collate_fn, pin_memory=True, num_workers = 7)
test_loader = DataLoader(test_set, batch_size=16, collate_fn=collate_fn, pin_memory=True, num_workers = 7)

In [6]:
class TextConvNN(nn.Module):
    def __init__(self, vocab_size, dims_size, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, dims_size, padding_idx=pad_idx)

        kernels = [2, 3, 4, 5, 6]
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, 16, kernel_size=(k, dims_size)) for k in kernels]
        )
        self.fc = nn.Linear(len(kernels) * 16, 1)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [F.max_pool1d(_, _.shape[2]).squeeze(2) for _ in x]

        x = self.dropout(torch.cat(x, dim = 1))

        return self.fc(x)

In [7]:
model = TextConvNN(vocab_size=len(dataset.vocab.vocab) + 1, dims_size=fasttext.vector_size, pad_idx=pad_index)
vectors = dataset.vocab.make_vectors(fasttext)
print(vectors.shape, model.embedding.weight.shape)
model.embedding.weight.data[:len(vectors)] = torch.from_numpy(vectors)
print(model.embedding.weight.shape)
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

(28686, 25) torch.Size([28687, 25])
torch.Size([28687, 25])


In [14]:
import pytorch_lightning as pl # type: ignore
import plotly.graph_objects as go # type: ignore
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

class TextConvNN_pl(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self = self.to('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"CUDA available: {torch.cuda.is_available()}, Device: {self.device}")

    def forward(self, x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        x, y = x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True)
        y_hat = self(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_accuracy', binary_accuracy(y_hat, y), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        x, y = x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True)
        y_hat = self(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('val_accuracy', binary_accuracy(y_hat, y), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


In [15]:
import plotly.graph_objects as go
from IPython.display import display
from pytorch_lightning.loggers import Logger
from pytorch_lightning.utilities.rank_zero import rank_zero_only

class PlotlyLogger(Logger):
    def __init__(self):
        super().__init__()
        self.fig_accuracy = go.FigureWidget()
        self.fig_loss = go.FigureWidget()

        self.fig_accuracy.add_trace(go.Scatter(y=[], mode='lines', name='Train'))
        self.fig_accuracy.add_trace(go.Scatter(y=[], mode='lines', name='Test'))
        self.fig_loss.add_trace(go.Scatter(y=[], mode='lines', name='Train'))
        self.fig_loss.add_trace(go.Scatter(y=[], mode='lines', name='Test'))

        self.fig_accuracy.update_layout(xaxis_title='Epoch', yaxis_title='Accuracy', margin=dict(l=20, r=20, t=20, b=20))
        self.fig_loss.update_layout(xaxis_title='Epoch', yaxis_title='Loss', margin=dict(l=20, r=20, t=20, b=20))
        self.y_train_acc, self.y_test_acc = [], []
        self.y_train_loss, self.y_test_loss = [], []
        self.count = 0


        display(self.fig_accuracy)
        display(self.fig_loss)
    @rank_zero_only
    def log_metrics(self, metrics, step):
        self.count += 1
        if self.count % 2 != 0:
            self.y_test_acc.append(metrics.get('val_accuracy', 0))
            self.y_test_loss.append(metrics.get('val_loss', 0))
        else:
            self.y_train_acc.append(metrics.get('train_accuracy', 0))
            self.y_train_loss.append(metrics.get('train_loss', 0))

        self.fig_accuracy.data[0].y = self.y_train_acc
        self.fig_accuracy.data[1].y = self.y_test_acc
        self.fig_loss.data[0].y = self.y_train_loss
        self.fig_loss.data[1].y = self.y_test_loss


    def log_hyperparams(self, params):
        pass
    @property
    def experiment(self):
        return None
    @property
    def name(self):
        return 'PlotlyLogger'
    @property
    def version(self):
        return '0.1'

In [16]:
model = TextConvNN_pl(model)
logger = PlotlyLogger()
trainer = pl.Trainer(max_epochs=10,
                    accelerator='gpu', 
                    devices=1, 
                    logger = logger, 
                    )
trainer.fit(model, train_loader, test_loader)

CUDA available: True, Device: cuda:0


FigureWidget({
    'data': [{'mode': 'lines', 'name': 'Train', 'type': 'scatter', 'uid': '2c022333-4444-4744-bce2-11371eee0201', 'y': []},
             {'mode': 'lines', 'name': 'Test', 'type': 'scatter', 'uid': '0d6dd3b8-a31f-4bb2-becd-38dbe77d37e1', 'y': []}],
    'layout': {'margin': {'b': 20, 'l': 20, 'r': 20, 't': 20},
               'template': '...',
               'xaxis': {'title': {'text': 'Epoch'}},
               'yaxis': {'title': {'text': 'Accuracy'}}}
})

FigureWidget({
    'data': [{'mode': 'lines', 'name': 'Train', 'type': 'scatter', 'uid': '785a811d-91a9-4b02-bacf-ee22c5530907', 'y': []},
             {'mode': 'lines', 'name': 'Test', 'type': 'scatter', 'uid': 'cb74b707-439d-47c6-9c72-c8ee7d462d04', 'y': []}],
    'layout': {'margin': {'b': 20, 'l': 20, 'r': 20, 't': 20},
               'template': '...',
               'xaxis': {'title': {'text': 'Epoch'}},
               'yaxis': {'title': {'text': 'Loss'}}}
})

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type          | Params | Mode 
------------------------------------------------
0 | model | TextConvNN_pl | 720 K  | train
------------------------------------------------
720 K     Trainable params
0         Non-trainable params
720 K     Total params
2.883     Total estimated model params size (MB)
2         Modules in train mode
10        Modules in eval mode


Sanity Checking: |                    | 0/? [00:00<?, ?it/s]

Training: |                           | 0/? [00:00<?, ?it/s]

Validation: |                         | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined