This notebook mirrors content in LightningAI's Course on NLP 

In [73]:
import os 
import numpy as np 
import pandas as pd 
import torch
from sklearn.feature_extraction.text import CountVectorizer
from lightning.pytorch.loggers import CSVLogger

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset, IMDBDataset
from local_utilities import LightningModel

In [74]:
df = load_dataset_into_to_dataframe()

100%|██████████| 50000/50000 [00:41<00:00, 1204.94it/s]

Class distribution:





In [100]:
df.columns 

Index(['text', 'label'], dtype='object')

In [101]:
df

Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
0,Actor turned director Bill Paxton follows up h...,1
0,As a recreational golfer with some knowledge o...,1
0,"I saw this film in a sneak preview, and it is ...",1
0,Bill Paxton has taken the true story of the 19...,1
...,...,...
0,"Towards the end of the movie, I felt it was to...",0
0,This is the kind of movie that my enemies cont...,0
0,I saw 'Descent' last night at the Stockholm Fi...,0
0,Some films that you pick up for a pound turn o...,0


In [102]:
partition_dataset(df)

In [103]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_val = pd.read_csv('val.csv')

In [104]:
np.bincount(df_train['label'])

array([17452, 17548])

In [105]:
np.bincount(df_test['label'])

array([5006, 4994])

In [106]:
np.bincount(df_val['label'])

array([2542, 2458])

In [107]:
#max features limited to avoid overfitting 
#lower case to disallow duplication of words with different capitalization 
#Stop words clear words like pronouns articles etc. 
count_vec = CountVectorizer(lowercase=True, max_features=10000, stop_words="english")
count_vec.fit(df_train['text'])

The above builds a dictionary with the 10,000 most frequent words in the dataset and maps them to their respective counts 

In [108]:
count_vec.vocabulary_

{'started': 8515,
 'watching': 9725,
 'series': 7957,
 'cable': 1320,
 'idea': 4488,
 'hate': 4191,
 'character': 1544,
 'hold': 4339,
 'beautifully': 892,
 'developed': 2574,
 'understand': 9375,
 'react': 7196,
 'frustration': 3737,
 'fear': 3439,
 'greed': 4020,
 'temptation': 8974,
 'way': 9736,
 'viewer': 9574,
 'experiencing': 3280,
 'christopher': 1656,
 'learning': 5199,
 'br': 1151,
 'abuse': 188,
 'physically': 6608,
 'emotionally': 3046,
 'just': 4963,
 'read': 7199,
 'newspaper': 6088,
 'women': 9880,
 'tolerate': 9134,
 'behavior': 915,
 'dream': 2831,
 'house': 4418,
 'endless': 3074,
 'supply': 8779,
 'expensive': 3276,
 'things': 9036,
 'sure': 8791,
 'loving': 5426,
 'faithful': 3371,
 'husband': 4465,
 'maybe': 5640,
 'watch': 9719,
 'doesn': 2754,
 'matter': 5630,
 'times': 9104,
 'episode': 3140,
 'missed': 5813,
 'episodes': 3141,
 'sequence': 7950,
 'season': 7869,
 'late': 5151,
 'night': 6101,
 'commercials': 1874,
 'language': 5133,
 'reruns': 7427,
 'movie': 5

The following code transforms these datasets into feature vectors

In [109]:
x_train = count_vec.transform(df_train['text'])
x_test = count_vec.transform(df_test['text'])
x_val = count_vec.transform(df_val['text'])

In [110]:
print(x_train)

  (0, 188)	1
  (0, 892)	1
  (0, 915)	1
  (0, 1144)	1
  (0, 1151)	6
  (0, 1320)	1
  (0, 1544)	1
  (0, 1656)	2
  (0, 1816)	1
  (0, 1818)	1
  (0, 1874)	1
  (0, 2574)	1
  (0, 2754)	1
  (0, 2831)	1
  (0, 2910)	1
  (0, 3046)	1
  (0, 3074)	1
  (0, 3076)	1
  (0, 3112)	1
  (0, 3140)	1
  (0, 3141)	1
  (0, 3276)	1
  (0, 3280)	1
  (0, 3371)	1
  (0, 3386)	1
  :	:
  (34999, 8078)	1
  (34999, 8109)	1
  (34999, 8185)	1
  (34999, 8225)	1
  (34999, 8350)	1
  (34999, 8355)	1
  (34999, 8439)	1
  (34999, 8469)	1
  (34999, 8551)	1
  (34999, 8651)	1
  (34999, 8791)	1
  (34999, 9094)	2
  (34999, 9101)	2
  (34999, 9104)	1
  (34999, 9276)	1
  (34999, 9490)	1
  (34999, 9701)	1
  (34999, 9719)	1
  (34999, 9747)	1
  (34999, 9749)	2
  (34999, 9899)	1
  (34999, 9904)	2
  (34999, 9942)	1
  (34999, 9944)	1
  (34999, 9965)	1


In [111]:
x_train.shape

(35000, 10000)

We have 3500 training examples and 10,000 features per example so the above dimensionality makes sense 

In [112]:
type(x_train)

scipy.sparse._csr.csr_matrix

In [113]:
print(x_train.todense()[0])

[[0 0 0 ... 0 0 0]]


In [114]:
from torch.utils.data import Dataset, DataLoader

#build our own torch dataset given our data 
class text_dataset(Dataset): 
    def __init__(self, X, y): 
        #convert numpy arrays to torch tensors 
        self.features = torch.tensor(X, dtype=torch.float32)
        self.labels = torch.tensor(y, dtype=torch.int64)
        
    def __getitem__(self, index):
        x = self.features[index]
        y = self.labels[index]
        return x, y
    
    def __len__(self): 
        return self.labels.shape[0]

In [115]:
train_set = text_dataset(x_train.todense(), df_train['label'].values)
val_set = text_dataset(x_val.todense(), df_val['label'].values)

In [116]:
train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle = True)

In [117]:
val_loader =  DataLoader(dataset=val_set, batch_size=32, shuffle = True)

In [118]:
for idx, (features, labels) in enumerate(train_loader): 
    print(features.shape)
    print(labels.shape)
    break

torch.Size([32, 10000])
torch.Size([32])


Next, we build our classifier model 

In [119]:
class logistic_regression(torch.nn.Module): 
    
    def __init__(self, num_features, num_classes) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(num_features, num_classes)
    
    def forward(self, x): 
        outputs = torch.sigmoid(self.linear(x))
        return outputs     

In [120]:
torch_model = logistic_regression(num_features=10000, num_classes=2)

In [121]:
model = LightningModel(model=torch_model, learning_rate=0.05)

In [123]:
import lightning as L
trainer = L.Trainer(
    max_epochs=30, 
    accelerator="gpu", 
    deterministic=True, 
    logger=CSVLogger(save_dir="logs/", name="my-model"),
)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


In [124]:
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)


  | Name      | Type                | Params
--------------------------------------------------
0 | model     | logistic_regression | 100 K 
1 | train_acc | MulticlassAccuracy  | 0     
2 | val_acc   | MulticlassAccuracy  | 0     
3 | test_acc  | MulticlassAccuracy  | 0     
--------------------------------------------------
100 K     Trainable params
0         Non-trainable params
100 K     Total params
0.400     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


RuntimeError: Detected more unique values in `preds` than `num_classes`. Expected only 2 but found 10 in `preds`.