## Imports and installs

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Detect whether the notebook runs on Google Colab and, if so, mount Google Drive.
# IN_COLAB is read by the install cell below to decide whether to pip-install packages.
try:
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported, so we are not on Colab.
    # Only ImportError is caught: a bare `except:` would also hide
    # KeyboardInterrupt and genuine mount failures.
    IN_COLAB = False
else:
    # A failing mount now surfaces here instead of silently setting IN_COLAB = False.
    drive.mount('/content/drive')
    IN_COLAB = True

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
# Install the notebook's third-party packages and spaCy models, but only when
# IN_COLAB was set to True by the drive-mount cell above.
# NOTE(review): no versions are pinned, so a rerun may install different releases.
if IN_COLAB:
  !pip install nltk
  !pip install transformers
  !pip install translators
  !pip install datasets
  !pip install langdetect
  !python -m spacy download en_core_web_sm
  !python -m spacy download en_core_web_trf
  !pip install bpemb

In [None]:
import pandas as pd
from datasets import load_dataset
from datasets import load_from_disk
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import string
import spacy
from tqdm import tqdm
import translators as ts
from langdetect import detect
import random
import abc
import math
import collections
from collections import defaultdict
import regex as re
import pickle
# nltk imports
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
# pytorch
import torch
import torchtext
# Setting torch device: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# BPE
from bpemb import BPEmb

Using region  server backend.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
%cd '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37'

/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37


## Local imports

In [None]:
from utils import *
from model_rnn import NextWordPredictor
from model_rnn import *

## Loading and saving datasets

In [None]:
# Saving
#train.save_to_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/train')
#val.save_to_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/validation')


In [None]:
# Loading
train = load_from_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/train')
val = load_from_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/validation')


## Initial preprocessing of datasets:
- Splitting into train and val
- Splitting into languages

In [None]:
train_df = pd.DataFrame(train)
val_df = pd.DataFrame(val)

In [None]:
# One frame per language for the training split (get_df_lang is a project helper;
# presumably it filters rows by language — confirm in utils).
bengali_train, arabic_train, indonesian_train = (
    get_df_lang(train_df, language) for language in ('bengali', 'arabic', 'indonesian')
)

# Same three languages for the validation split
bengali_val, arabic_val, indonesian_val = (
    get_df_lang(val_df, language) for language in ('bengali', 'arabic', 'indonesian')
)

# Using 'questions' as features

## Retrieving answer text

In [None]:
def _select_with_answers(df):
    """Return a copy of the three working columns of ``df`` plus an ``answer_text`` column.

    The column subset is copied explicitly before the new column is assigned.
    Assigning into the result of ``df[[...]]`` directly is what raised pandas'
    SettingWithCopyWarning for every split.

    ``answer_text`` is ``custom_function`` applied to each ``annotations`` value
    (custom_function is an imported helper; presumably it extracts the answer
    string — confirm in its definition).
    """
    subset = df[['annotations', 'question_text','document_plaintext']].copy()
    subset['answer_text'] = subset['annotations'].apply(custom_function)
    return subset


indonesian_train_columns = _select_with_answers(indonesian_train) # answers train
indonesian_val_columns = _select_with_answers(indonesian_val) # answers val

arabic_train_columns = _select_with_answers(arabic_train) # answers train
arabic_val_columns = _select_with_answers(arabic_val) # answers val

bengali_train_columns = _select_with_answers(bengali_train) # answers train
bengali_val_columns = _select_with_answers(bengali_val) # answers val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indonesian_train_columns['answer_text'] = indonesian_train_columns['annotations'].apply(custom_function) # answers train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indonesian_val_columns['answer_text'] = indonesian_val_columns['annotations'].apply(custom_function) # answers val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

# RNN

In [None]:
# retrieved from course slides
def enforce_reproducibility(seed=42):
    """Seed the random number generators used in this notebook.

    Args:
        seed: Integer seed passed to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices). Defaults to 42.

    Also sets the cuDNN ``deterministic`` flag to True and ``benchmark`` to False.
    """
    # System based generators
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators, CPU first and then every CUDA device
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)
    # cuDNN flags
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


enforce_reproducibility()

device

device(type='cuda')

## Setting up the training and val corpuses (questions)

In [None]:
# Question texts per language and split, as plain Python lists
ben_corpus_train = bengali_train_columns['question_text'].tolist()
ben_corpus_val = bengali_val_columns['question_text'].tolist()

arabic_corpus_train = arabic_train_columns['question_text'].tolist()
arabic_corpus_val = arabic_val_columns['question_text'].tolist()

indonesian_corpus_train = indonesian_train_columns['question_text'].tolist()
indonesian_corpus_val = indonesian_val_columns['question_text'].tolist()

In [None]:
len(ben_corpus_train)

4779

In [None]:
len(ben_corpus_val)

224

## Building and saving/loading vocabulary

### Building and saving vocabulary

In [None]:
file_path = "/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/vocabs/bengali_questions_vocab.txt"

# This line of code builds the vocabulary with both the train and the validation corpuses
#total_vocabulary = build_vocab(ben_corpus_val + ben_corpus_train)

# This line of code saves the string representation to a text file
#with open(file_path, "w") as file:
  #file.write(repr(total_vocabulary))


### Loading vocabulary

In [None]:
# Read the saved vocabulary back from `file_path` (set in the cell above).
# This cell has to run: the next cell prints len(total_vocabulary), which is
# otherwise undefined on a fresh kernel.
import ast

# Explicit UTF-8 so that reading the Bengali tokens does not depend on the platform default
with open(file_path, "r", encoding="utf-8") as file:
    list_str = file.read()

# The file holds the `repr` of the vocabulary list. `ast.literal_eval` parses
# Python literals only, so unlike `eval` it cannot execute code from the file.
total_vocabulary = ast.literal_eval(list_str)

In [None]:
print(len(total_vocabulary))

3749


## Build embedding matrix

In [None]:
# load the pretrained embeddings
bpemb_ar = BPEmb(lang='ar', dim=100, vs=25000) # arabic model
bpemb_ben = BPEmb(lang='bn', dim=100, vs=25000) # bengali model
bpemb_ind = BPEmb(lang='id', dim=100, vs=25000) # indonesian model

downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.model


100%|██████████| 742254/742254 [00:01<00:00, 741462.92B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9491724/9491724 [00:02<00:00, 4727041.42B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.model


100%|██████████| 863227/863227 [00:01<00:00, 614118.83B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9517491/9517491 [00:02<00:00, 4754752.32B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.model


100%|██████████| 650018/650018 [00:00<00:00, 814099.10B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9465922/9465922 [00:02<00:00, 4316680.11B/s] 


In [None]:
#get the embedding matrix for our vocabulary
embedding_matrix, oov = create_embedding_matrix(total_vocabulary, bpemb_ben)

69.13843691651107 % of tokens are out of vocabulary


## Parse the data and vectorize

In [None]:
#train_features = [text_to_indices(x, total_vocabulary, lang='bengali') for x in ben_corpus_train]
#val_features = [text_to_indices(x, total_vocabulary, lang='bengali') for x in ben_corpus_val]

In [None]:
#longest_text = max(train_features+val_features, key=len)
#max_length = len(longest_text)
#padding_index = 0

# padding the feature vectors by applying the add_padding function to each text in the train and validation corpus
#train_features = [add_padding(x, max_length, padding_index) for x in train_features]
#val_features = [add_padding(x, max_length, padding_index) for x in val_features]

### Saving features

In [None]:
# These lines of code save the embedded features we just created

#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_questions_train_features.pkl', 'wb') as f:
    #pickle.dump(train_features, f)

#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_questions_val_features.pkl', 'wb') as f:
    #pickle.dump(val_features, f)

### Loading features

In [None]:
# These lines of code load the previously saved features

# Training features for the Bengali questions.
# NOTE(review): pickle.load can execute code stored in the file — only load pickles you created yourself.
with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_questions_train_features.pkl', 'rb') as train_handle:
    train_features = pickle.load(train_handle)

# Validation features for the Bengali questions
with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_questions_val_features.pkl', 'rb') as val_handle:
    val_features = pickle.load(val_handle)

## Get inputs and targets by splitting sentences (window = 4)

In [None]:
# Collect the windows and their targets for every training sentence.
# split_sentence / split_sentence_target are project helpers called with window size 4;
# presumably they return the context windows and the token following each one — confirm.
inputs, targets = [], []
for sentence in train_features:
  inputs.extend(split_sentence(4, sentence, 'bengali'))
  targets.extend(split_sentence_target(4, sentence, 'bengali'))

# Same procedure for the validation sentences
inputs_test, targets_test = [], []
for sentence in val_features:
  inputs_test.extend(split_sentence(4, sentence, 'bengali'))
  targets_test.extend(split_sentence_target(4, sentence, 'bengali'))

In [None]:
class PredictorTrain(torch.utils.data.Dataset):
    """Dataset that pairs each feature row with its label, both stored as float32 tensors."""

    def __init__(self, features, labels):
        """Convert the inputs to tensors.

        Args:
            features: Integer feature rows (nested list or long tensor).
            labels: One label per feature row (list, array or tensor).
        """
        feature_tensor = torch.LongTensor(features)
        label_tensor = torch.from_numpy(np.array(labels))
        self.X = feature_tensor.float()
        self.y = label_tensor.float()

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index``; the label gets a leading axis of length 1."""
        # Indexing with None prepends an axis, exactly like unsqueeze(0)
        return self.X[index], self.y[index][None]

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return self.y.shape[0]

# Number of target classes: one per vocabulary entry
num_classes = len(total_vocabulary)

# Windows and targets as tensors (train, then test)
X_train, y_train = torch.tensor(inputs), torch.tensor(targets)
X_test, y_test = torch.tensor(inputs_test), torch.tensor(targets_test)

# Shuffle once so the model does not see the samples in corpus order.
# The same permutation indexes both tensors, which keeps every window paired with its target.
num_samples = X_train.shape[0]
shuffled_indices = torch.randperm(num_samples)
X_train_shuffled, y_train_shuffled = X_train[shuffled_indices], y_train[shuffled_indices]

# Wrap the tensors in datasets: shuffled training data, validation data in original order
data_train = PredictorTrain(X_train_shuffled, y_train_shuffled)
data_val = PredictorTrain(X_test, y_test)

# Batches of 64 samples for both splits
train_loader = torch.utils.data.DataLoader(data_train, batch_size=64)
val_loader = torch.utils.data.DataLoader(data_val, batch_size=64)

## Run the model

In [None]:
from model_rnn import training_loop

In [None]:
model = NextWordPredictor(rnn_size=100, vocab_size=len(total_vocabulary),embedding_matrix=embedding_matrix)

In [None]:
print(model)

NextWordPredictor(
  (rnn): RNN(4, 100, batch_first=True)
  (fc_logits): Linear(in_features=100, out_features=3749, bias=True)
)


In [None]:
# Train on the question windows; training_loop (from model_rnn) returns the trained model.
model = training_loop(model,10, train_loader) #training the model

# Forward pass over the whole training set.
# - The module is called directly rather than through `.forward`, the documented way to run an nn.Module.
# - `torch.no_grad()` avoids building an autograd graph for every training sample,
#   which is not needed for inspection and wastes memory.
with torch.no_grad():
    output_probs = model(X_train_shuffled)

# Evaluate the trained model on the validation loader (evaluate is a project helper)
outputs = evaluate(model,val_loader)


Epoch 1: loss 7.588558952821383
Epoch 2: loss 7.57173035744103
Epoch 3: loss 7.57155711070464
Epoch 4: loss 7.572176210113132
Epoch 5: loss 7.571852297728523
Epoch 6: loss 7.571265943817532
Epoch 7: loss 7.571385404173549
Epoch 8: loss 7.571088767725462
Epoch 9: loss 7.571277988861265
Epoch 10: loss 7.571768843424338


## Get predictions

In [None]:
def get_prediction(total_vocabulary,output):
  """Return the vocabulary entry at the position of the highest value in ``output``.

  Args:
    total_vocabulary: Indexable sequence of tokens.
    output: Tensor of scores; its positions index into ``total_vocabulary``.

  Returns:
    The token at the position of the maximum value. When several positions
    share the maximum, the first one is used. Returns None when ``output`` is
    empty or when no position compares equal to the maximum (e.g. NaN).
  """
  # torch.max raises on an empty tensor, so handle that case explicitly
  if output.numel() == 0:
    return None
  highest_value = torch.max(output)
  positions = torch.where(output == highest_value)[0]
  if positions.numel() > 0:
    # Take the first match: calling .item() on all matches raises when the maximum is tied
    return total_vocabulary[positions[0].item()]
  return None

In [None]:
for output in outputs:
  print(get_prediction(total_vocabulary,output))

?
?
[unk]
?
?



















[unk]
[unk]
?
?
?
?
?
?


## Saving the model

In [None]:
# this line of code saves the model
#torch.save(model.state_dict(), '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/weights/rnn_bengali_questions_weights.pth')

## Looking at class distribution to understand predictions

In [None]:
# Tally how often every target index occurs in the training targets
counted_numbers = Counter(targets)
# Keep the five most frequent indices, highest count first
sorted_numbers = counted_numbers.most_common(5)
# Print the counts together with the token each index stands for
for number, count in sorted_numbers:
    print(f"{number}: {count} times, which corresponds to token: ", total_vocabulary[number])

0: 79587 times, which corresponds to token:  
1095: 4777 times, which corresponds to token:  ?
1096: 2894 times, which corresponds to token:  [unk]
1646: 944 times, which corresponds to token:  কী
1536: 906 times, which corresponds to token:  কত


# Using 'document_plaintext' as features

## Local imports

In [None]:
from utils import *

## Setting up the training and val corpuses (document_plaintext)

In [None]:
# From here on the corpus variables hold the document text rather than the questions
arabic_corpus_train = arabic_train_columns['document_plaintext'].to_list()
arabic_corpus_val = arabic_val_columns['document_plaintext'].to_list()

indonesian_corpus_train = indonesian_train_columns['document_plaintext'].to_list()
indonesian_corpus_val = indonesian_val_columns['document_plaintext'].to_list()

bengali_doc_train = bengali_train_columns['document_plaintext'].to_list()
# The validation documents must come from the validation frame. Reading them from
# bengali_train_columns made the validation set a sample of the training data.
# NOTE(review): the cached document feature pickles loaded further down were
# presumably built before this fix — regenerate them.
bengali_doc_val = bengali_val_columns['document_plaintext'].to_list()

## Subsetting the corpuses for computational ease

In [None]:
# Keep a random subset of the documents: 200 for training and 40 for validation.
# random.sample draws from Python's global `random` state, which
# enforce_reproducibility() seeds earlier in the notebook.
# NOTE(review): both lists are overwritten with their own sample, so re-running
# this cell samples from the already reduced lists.
bengali_doc_train = random.sample(bengali_doc_train, 200)
bengali_doc_val = random.sample(bengali_doc_val, 40)

In [None]:
len(bengali_doc_train)

200

In [None]:
len(bengali_doc_val)

40

### Building and saving vocabulary

In [None]:
file_path = "/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/vocabs/bengali_docs_vocab.txt"

# This line of code builds the vocabulary with both the train and the validation corpuses
#total_vocabulary = build_vocab(bengali_doc_train + bengali_doc_val)

# This line of code saves the string representation to a text file
#with open(file_path, "w") as file:
  #file.write(repr(total_vocabulary))

### Loading vocabulary

In [None]:
# Read the saved document vocabulary back from `file_path` (set in the cell above)
import ast

# Explicit UTF-8 so that reading the Bengali tokens does not depend on the platform default
with open(file_path, "r", encoding="utf-8") as file:
    list_str = file.read()

# The file holds the `repr` of the vocabulary list. `ast.literal_eval` parses
# Python literals only, so unlike `eval` it cannot execute code from the file.
total_vocabulary = ast.literal_eval(list_str)

print(len(total_vocabulary))

6438


In [None]:
#get the embedding matrix for the  vocabulary
embedding_matrix, oov = create_embedding_matrix(total_vocabulary, bpemb_ben)

72.50698974836905 % of tokens are out of vocabulary


## Parse the data and vectorize

In [None]:
#train_features_doc = [text_to_indices(x, total_vocabulary, lang='bengali') for x in bengali_doc_train]
#val_features_doc = [text_to_indices(x, total_vocabulary, lang='bengali') for x in bengali_doc_val]

In [None]:
#longest_text = max(train_features_doc+val_features_doc, key=len)
#max_length = len(longest_text)
#padding_index = 0

# padding the feature vectors by applying the add_padding function to each text in the train and validation corpus
#train_features_doc = [add_padding(x, max_length, padding_index) for x in train_features_doc]
#val_features_doc = [add_padding(x, max_length, padding_index) for x in val_features_doc]

### Saving the features

In [None]:
# Save
#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_docs_train_features.pkl', 'wb') as f:
    #pickle.dump(train_features_doc, f)

#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_docs_val_features.pkl', 'wb') as f:
    #pickle.dump(val_features_doc, f)

### Loading the features

In [None]:
# Load
# Training features for the Bengali documents.
# NOTE(review): pickle.load can execute code stored in the file — only load pickles you created yourself.
with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_docs_train_features.pkl', 'rb') as train_handle:
    train_features_doc = pickle.load(train_handle)

# Validation features for the Bengali documents
with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_bengali_docs_val_features.pkl', 'rb') as val_handle:
    val_features_doc = pickle.load(val_handle)

## Get inputs and targets by splitting sentences (window = 4)

In [None]:
# Collect the windows and their targets for every training document.
# split_sentence / split_sentence_target are project helpers called with window size 4;
# presumably they return the context windows and the token following each one — confirm.
inputs_doc, targets_doc = [], []
for sentence in train_features_doc:
  inputs_doc.extend(split_sentence(4, sentence, 'bengali'))
  targets_doc.extend(split_sentence_target(4, sentence, 'bengali'))

# Same procedure for the validation documents
inputs_test_doc, targets_test_doc = [], []
for sentence in val_features_doc:
  inputs_test_doc.extend(split_sentence(4, sentence, 'bengali'))
  targets_test_doc.extend(split_sentence_target(4, sentence, 'bengali'))

In [None]:
# NOTE(review): this repeats the PredictorTrain class defined in the questions
# section of this notebook; the later definition shadows the earlier one.
class PredictorTrain(torch.utils.data.Dataset):
    """Dataset that pairs each feature row with its label, both stored as float32 tensors."""

    def __init__(self, features, labels):
        """Convert the inputs to tensors.

        Args:
            features: Integer feature rows (nested list or long tensor).
            labels: One label per feature row (list, array or tensor).
        """
        feature_tensor = torch.LongTensor(features)
        label_tensor = torch.from_numpy(np.array(labels))
        self.X = feature_tensor.float()
        self.y = label_tensor.float()

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index``; the label gets a leading axis of length 1."""
        # Indexing with None prepends an axis, exactly like unsqueeze(0)
        return self.X[index], self.y[index][None]

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return self.y.shape[0]

# Number of target classes: one per vocabulary entry
num_classes = len(total_vocabulary)

# Windows and targets as tensors (train, then test)
X_train, y_train = torch.tensor(inputs_doc), torch.tensor(targets_doc)
X_test, y_test = torch.tensor(inputs_test_doc), torch.tensor(targets_test_doc)

# Shuffle once so the model does not see the samples in corpus order.
# The same permutation indexes both tensors, which keeps every window paired with its target.
num_samples = X_train.shape[0]
shuffled_indices = torch.randperm(num_samples)
X_train_shuffled, y_train_shuffled = X_train[shuffled_indices], y_train[shuffled_indices]

# Wrap the tensors in datasets: shuffled training data, validation data in original order
data_train = PredictorTrain(X_train_shuffled, y_train_shuffled)
data_val = PredictorTrain(X_test, y_test)

# Batches of 64 samples for both splits
train_loader = torch.utils.data.DataLoader(data_train, batch_size=64)
val_loader = torch.utils.data.DataLoader(data_val, batch_size=64)

In [None]:
from model_rnn import NextWordPredictor
from model_rnn import *

In [None]:
from model_rnn import training_loop

In [None]:
model = NextWordPredictor(rnn_size=100, vocab_size=len(total_vocabulary), embedding_matrix=embedding_matrix)

In [None]:
print(model)

NextWordPredictor(
  (rnn): RNN(4, 100, batch_first=True)
  (fc_logits): Linear(in_features=100, out_features=6438, bias=True)
)


In [None]:
# Train on the document windows; training_loop (from model_rnn) returns the trained model.
model_doc = training_loop(model,10, train_loader) #training the model

# Forward pass over the whole (unshuffled) training set.
# - The module is called directly rather than through `.forward`, the documented way to run an nn.Module.
# - `torch.no_grad()` avoids building an autograd graph for every training sample,
#   which is not needed for inspection and wastes memory.
with torch.no_grad():
    output_probs_doc = model_doc(X_train)

# Evaluate the trained model on the validation loader (evaluate is a project helper)
outputs_doc = evaluate( model_doc,val_loader)

Epoch 1: loss 8.022043839298984
Epoch 2: loss 7.984240531921387
Epoch 3: loss 7.9814798602934225
Epoch 4: loss 7.981470536820384
Epoch 5: loss 7.981419946561509
Epoch 6: loss 7.981410040599708
Epoch 7: loss 7.981397965408172
Epoch 8: loss 7.981380777072401
Epoch 9: loss 7.981342724230828
Epoch 10: loss 7.98133451965323


In [None]:
outputs_doc.shape

torch.Size([24, 6438])

In [None]:
def get_prediction(total_vocabulary,output):
  """Return the vocabulary entry at the position of the highest value in ``output``.

  Args:
    total_vocabulary: Indexable sequence of tokens.
    output: Tensor of scores; its positions index into ``total_vocabulary``.

  Returns:
    The token at the position of the maximum value. When several positions
    share the maximum, the first one is used. Returns None when ``output`` is
    empty or when no position compares equal to the maximum (e.g. NaN).
  """
  # torch.max raises on an empty tensor, so handle that case explicitly
  if output.numel() == 0:
    return None
  highest_value = torch.max(output)
  positions = torch.where(output == highest_value)[0]
  if positions.numel() > 0:
    # Take the first match: calling .item() on all matches raises when the maximum is tied
    return total_vocabulary[positions[0].item()]
  return None

for output in outputs_doc:
  print(get_prediction(total_vocabulary,output))

।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।
।


## Saving the model

In [None]:
# Saving the model
#torch.save(model.state_dict(), '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/weights/rnn_bengali_documents_weights.pth')

## Looking at class distribution to understand the predictions

In [None]:
# How are the target classes distributed?

from collections import Counter

# Tally how often every target index occurs in the document training targets
counted_numbers = Counter(targets_doc)

# Keep the five most frequent indices, highest count first
sorted_numbers = counted_numbers.most_common(5)

# Print the counts together with the token each index stands for
for number, count in sorted_numbers:
    print(f"{number}: {count} times, which corresponds to token: ", total_vocabulary[number])

0: 84610 times, which corresponds to token:  
2030: 1051 times, which corresponds to token:  ।
1713: 461 times, which corresponds to token:  ,
1519: 218 times, which corresponds to token:  ##ের
2752: 217 times, which corresponds to token:  ও
