## Imports and installs

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except:
    IN_COLAB = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
%%capture
if IN_COLAB:
  !pip install nltk
  !pip install transformers
  !pip install translators
  !pip install datasets
  !pip install langdetect
  !python -m spacy download en_core_web_sm
  !python -m spacy download en_core_web_trf
  !pip install bpemb

In [6]:
import pandas as pd
from datasets import load_dataset
from datasets import load_from_disk
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import string
import spacy
from tqdm import tqdm
import translators as ts
from langdetect import detect
import random
import abc
import math
import collections
from collections import defaultdict
import regex as re
import pickle
# nltk imports
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
# pytorch
import torch
import torchtext
# Setting torch device
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
# BPE
from bpemb import BPEmb

Using region South Carolina server backend.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
%cd '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37'

/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37


## Local imports

In [8]:
from utils import *
from model_rnn import NextWordPredictor
from model_rnn import *

## Loading and saving datasets

In [9]:
# Saving
#train.save_to_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/train')
#val.save_to_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/validation')


In [10]:
# Loading
train = load_from_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/train')
val = load_from_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/validation')


## Initial preprocessing of datasets:
- Spliting into train and val
- Splitting into languages

In [11]:
train_df = pd.DataFrame(train)
val_df = pd.DataFrame(val)

In [12]:
bengali_train = get_df_lang(train_df, 'bengali')
arabic_train = get_df_lang(train_df, 'arabic')
indonesian_train = get_df_lang(train_df, 'indonesian')

bengali_val = get_df_lang(val_df, 'bengali')
arabic_val = get_df_lang(val_df, 'arabic')
indonesian_val = get_df_lang(val_df, 'indonesian')

# Using 'questions' as features

## Retrieving answer text

In [13]:
indonesian_train_columns = indonesian_train[['annotations', 'question_text','document_plaintext']]
indonesian_val_columns = indonesian_val[['annotations', 'question_text','document_plaintext']]
indonesian_train_columns['answer_text'] = indonesian_train_columns['annotations'].apply(custom_function) # answers train
indonesian_val_columns['answer_text'] = indonesian_val_columns['annotations'].apply(custom_function) # answers val

arabic_train_columns = arabic_train[['annotations', 'question_text','document_plaintext']]
arabic_val_columns = arabic_val[['annotations', 'question_text','document_plaintext']]
arabic_train_columns['answer_text'] = arabic_train_columns['annotations'].apply(custom_function) # answers train
arabic_val_columns['answer_text'] = arabic_val_columns['annotations'].apply(custom_function) # answers val

bengali_train_columns = bengali_train[['annotations', 'question_text','document_plaintext']]
bengali_val_columns = bengali_val[['annotations', 'question_text','document_plaintext']]
bengali_train_columns['answer_text'] = bengali_train_columns['annotations'].apply(custom_function) # answers train
bengali_val_columns['answer_text'] = bengali_val_columns['annotations'].apply(custom_function) # answers val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indonesian_train_columns['answer_text'] = indonesian_train_columns['annotations'].apply(custom_function) # answers train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indonesian_val_columns['answer_text'] = indonesian_val_columns['annotations'].apply(custom_function) # answers val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

# RNN

In [14]:
# retrieved from course slides
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)
enforce_reproducibility()

device

device(type='cuda')

## Setting up the training and val corpuses (questions)

In [15]:
ben_corpus_val = bengali_val_columns['question_text'].to_list()
ben_corpus_train = bengali_train_columns['question_text'].to_list()

arabic_corpus_train = arabic_train_columns['question_text'].to_list()
arabic_corpus_val = arabic_val_columns['question_text'].to_list()

indonesian_corpus_train = indonesian_train_columns['question_text'].to_list()
indonesian_corpus_val = indonesian_val_columns['question_text'].to_list()

## Subsetting for computational ease

In [16]:
arabic_corpus_train = random.sample(arabic_corpus_train, 3000)
arabic_corpus_val = random.sample(arabic_corpus_val, 600)

In [17]:
len(arabic_corpus_train)

3000

In [18]:
len(arabic_corpus_val)

600

## Building and saving/loading vocabulary

### Building and saving vocabulary

In [19]:
file_path = "/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/vocabs/arabic_questions_vocab.txt"

# This line of code builds the vocabulary with both the train and the validation corpuses
#total_vocabulary = build_vocab(arabic_corpus_val + arabic_corpus_train)

# This line of code saves the string representation to a text file
#with open(file_path, "w") as file:
  #file.write(repr(total_vocabulary))


### Loading vocabulary

In [20]:
# This line of code reads the saved vocabulary
with open(file_path, "r") as file:
    list_str = file.read()

# This line of code uses `eval` to parse the string into a list
total_vocabulary = eval(list_str)

In [21]:
print(len(total_vocabulary))

5649


## Build embedding matrix

In [22]:
# load the pretrained embeddings
bpemb_ar = BPEmb(lang='ar', dim=100, vs=25000) # arabic model
bpemb_ben = BPEmb(lang='bn', dim=100, vs=25000) # bengali model
bpemb_ind = BPEmb(lang='id', dim=100, vs=25000) # indonesian model

downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.model


100%|██████████| 742254/742254 [00:00<00:00, 1491257.64B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9491724/9491724 [00:01<00:00, 8669701.94B/s] 


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.model


100%|██████████| 863227/863227 [00:00<00:00, 1718306.36B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9517491/9517491 [00:01<00:00, 9494634.45B/s] 


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.model


100%|██████████| 650018/650018 [00:00<00:00, 1312096.15B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9465922/9465922 [00:01<00:00, 7951564.06B/s] 


In [23]:
#get the embedding matrix for our vocabulary
embedding_matrix, oov = create_embedding_matrix(total_vocabulary, bpemb_ar)

81.51885289431759 % of tokens are out of vocabulary


## Parse the data and vectorize

In [24]:
#train_features = [text_to_indices(x, total_vocabulary, lang='arabic') for x in arabic_corpus_train]
#val_features = [text_to_indices(x, total_vocabulary, lang='arabic') for x in arabic_corpus_val]

In [25]:
#longest_text = max(train_features+val_features, key=len)
#max_length = len(longest_text)
#padding_index = 0

# padding the feature vectors by applying the add_padding function to each text in the train and validation corpus
#train_features = [add_padding(x, max_length, padding_index) for x in train_features]
#val_features = [add_padding(x, max_length, padding_index) for x in val_features]

### Saving features

In [26]:
# These lines of code save the embedded features we just created

#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_questions_train_features.pkl', 'wb') as f:
    #pickle.dump(train_features, f)

#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_questions_val_features.pkl', 'wb') as f:
    #pickle.dump(val_features, f)

### Loading features

In [27]:
# These lines of code load the previously saved features

with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_questions_train_features.pkl', 'rb') as f:
    train_features = pickle.load(f)

with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_questions_val_features.pkl', 'rb') as f:
    val_features = pickle.load(f)

## Get inputs and targets by splitting sentences (window = 4)

In [28]:
inputs = []
targets = []
for sentence in train_features:
  for feature in split_sentence(4,sentence,'arabic'):
    inputs.append(feature)

  for target in split_sentence_target(4,sentence,'arabic'):
    targets.append(target)

inputs_test = []
targets_test = []
for sentence in val_features:
  for feature in split_sentence(4,sentence,'arabic'):

    inputs_test.append( feature)
  for target in  split_sentence_target(4,sentence,'arabic'):
    targets_test.append(target)

In [29]:
class PredictorTrain(torch.utils.data.Dataset):
    def __init__(self, features, labels):
        self.X = torch.LongTensor(features).type(torch.float32)

        self.y = torch.from_numpy(np.array(labels)).type(torch.float32)

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index].unsqueeze(0)
        return X, y

    def __len__(self):
        return len(self.y)

num_classes = len(total_vocabulary)

X_train = torch.tensor(inputs)  # input sequences (train)
y_train = torch.tensor(targets)
X_test = torch.tensor(inputs_test)  # input sequences (test)
y_test = torch.tensor(targets_test)


# Shuffle to avoid overfitting based in the sequence of inputs
num_samples = X_train.size(0)
shuffled_indices = torch.randperm(num_samples)

# Use the shuffled indices to reorder both tensors
X_train_shuffled = X_train[shuffled_indices]
y_train_shuffled = y_train[shuffled_indices]


data_train = PredictorTrain(X_train_shuffled, y_train_shuffled) # this function takes train features and labels
data_val = PredictorTrain(X_test, y_test) # this function takes test features and labels

train_loader = torch.utils.data.DataLoader(data_train, batch_size=64)
val_loader = torch.utils.data.DataLoader(data_val, batch_size = 64)

## Run the model

In [30]:
from model_rnn import training_loop

In [31]:
model = NextWordPredictor(rnn_size=100, vocab_size=len(total_vocabulary),embedding_matrix=embedding_matrix)

In [32]:
print(model)

NextWordPredictor(
  (rnn): RNN(4, 100, batch_first=True)
  (fc_logits): Linear(in_features=100, out_features=5649, bias=True)
)


In [33]:
model = training_loop(model,10, train_loader) #training the model
output_probs = model.forward(X_train_shuffled) # generate outputs and evaluate the trained model
outputs = evaluate(model,val_loader)


Epoch 1: loss 8.253302215605743
Epoch 2: loss 7.894435619198998
Epoch 3: loss 7.799576443757197
Epoch 4: loss 7.799425163010294
Epoch 5: loss 7.799505877402401
Epoch 6: loss 7.799596960230391
Epoch 7: loss 7.79951467486315
Epoch 8: loss 7.800005154092182
Epoch 9: loss 7.8002391138742135
Epoch 10: loss 7.80022353026294


## Get predictions

In [34]:
def get_prediction(total_vocabulary,output):
  highest_value = torch.max(output)
  position = torch.where(output==highest_value)
  if len(position[0]) >0:

    return total_vocabulary[position[0].item()]
  return None

In [35]:
for output in outputs:
  print(get_prediction(total_vocabulary,output))



















## Saving the model

In [36]:
# this line of code saves the model
torch.save(model.state_dict(), '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/weights/rnn_arabic_questions_weights.pth')

## Looking at class distribution to understand predictions

In [37]:
counted_numbers = Counter(targets)
# Sort the items in descending order based on their counts
sorted_numbers = sorted(counted_numbers.items(), key=lambda x: x[1], reverse=True)
sorted_numbers = sorted_numbers[0:5]
# Print the counts
for number, count in sorted_numbers:
    print(f"{number}: {count} times, which corresponds to token: ", total_vocabulary[number])

0: 55389 times, which corresponds to token:  
5023: 774 times, which corresponds to token:  ما
5071: 708 times, which corresponds to token:  متى
5256: 639 times, which corresponds to token:  من
5445: 608 times, which corresponds to token:  هو


# Using 'document_plaintext' as features

## Local imports

In [38]:
from utils import *

## Setting up the training and val corpuses (document_plaintext)

In [39]:
arabic_doc_train = arabic_train_columns['document_plaintext'].to_list()
arabic_doc_val = arabic_val_columns['document_plaintext'].to_list()

indonesian_doc_train = indonesian_train_columns['document_plaintext'].to_list()
indonesian_doc_val = indonesian_val_columns['document_plaintext'].to_list()

bengali_doc_train = bengali_train_columns['document_plaintext'].to_list()
bengali_doc_val = bengali_train_columns['document_plaintext'].to_list()

## Subsetting the corpuses for computational ease

In [40]:
arabic_doc_train = random.sample(arabic_doc_train, 200)
arabic_doc_val = random.sample(arabic_doc_val, 40)

In [41]:
len(arabic_doc_train)

200

In [42]:
len(arabic_doc_val)

40

### Building and saving vocabulary

In [43]:
file_path = "/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/vocabs/arabic_docs_vocab.txt"

# This line of code builds the vocabulary with both the train and the validation corpuses
#total_vocabulary = build_vocab(arabic_doc_val + arabic_doc_train)

# This line of code saves the string representation to a text file
#with open(file_path, "w") as file:
  #file.write(repr(total_vocabulary))

### Loading vocabulary

In [44]:
# This line of code reads the saved vocabulary
with open(file_path, "r") as file:
    list_str = file.read()

# This line of code uses `eval` to parse the string into a list
total_vocabulary = eval(list_str)

print(len(total_vocabulary))

8482


In [45]:
#get the embedding matrix for the  vocabulary
embedding_matrix, oov = create_embedding_matrix(total_vocabulary, bpemb_ar)

83.85993869370432 % of tokens are out of vocabulary


## Parse the data and vectorize

In [46]:
#train_features_doc = [text_to_indices(x, total_vocabulary, lang='arabic') for x in arabic_doc_train]
#val_features_doc = [text_to_indices(x, total_vocabulary, lang='arabic') for x in arabic_doc_val]

In [47]:
#longest_text = max(train_features_doc+val_features_doc, key=len)
#max_length = len(longest_text)
#padding_index = 0

# padding the feature vectors by applying the add_padding function to each text in the train and validation corpus
#train_features_doc = [add_padding(x, max_length, padding_index) for x in train_features_doc]
#val_features_doc = [add_padding(x, max_length, padding_index) for x in val_features_doc]

### Saving the features

In [48]:
# Save
#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_docs_train_features.pkl', 'wb') as f:
    #pickle.dump(train_features_doc, f)

#with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_docs_val_features.pkl', 'wb') as f:
    #pickle.dump(val_features_doc, f)

### Loading the features

In [49]:
# Load
with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_docs_train_features.pkl', 'rb') as f:
    train_features_doc = pickle.load(f)

with open('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/Week 37/features/emb_arabic_docs_val_features.pkl', 'rb') as f:
    val_features_doc = pickle.load(f)

## Get inputs and targets by splitting sentences (window = 4)

In [50]:
inputs_doc = []
targets_doc = []
for sentence in train_features_doc:
  for feature in split_sentence(4,sentence,'arabic'):
    inputs_doc.append(feature)

  for target in  split_sentence_target(4,sentence,'arabic'):
    targets_doc.append(target)

inputs_test_doc = []
targets_test_doc = []
for sentence in val_features_doc:
  for feature in split_sentence(4,sentence,'arabic'):

    inputs_test_doc.append(feature)
  for target in  split_sentence_target(4,sentence,'arabic'):
    targets_test_doc.append(target)

In [51]:
class PredictorTrain(torch.utils.data.Dataset):
    def __init__(self, features, labels):
        self.X = torch.LongTensor(features).type(torch.float32)

        self.y = torch.from_numpy(np.array(labels)).type(torch.float32)

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index].unsqueeze(0)
        return X, y

    def __len__(self):
        return len(self.y)

num_classes = len(total_vocabulary)

X_train = torch.tensor(inputs_doc)  # input sequences (train)
y_train = torch.tensor(targets_doc)
X_test = torch.tensor(inputs_test_doc)  # input sequences (test)
y_test = torch.tensor(targets_test_doc)


# Shuffle to avoid overfitting based in the sequence of inputs
num_samples = X_train.size(0)
shuffled_indices = torch.randperm(num_samples)
X_train_shuffled = X_train[shuffled_indices]
y_train_shuffled = y_train[shuffled_indices]


data_train = PredictorTrain(X_train_shuffled, y_train_shuffled) # this function takes train features and labels
data_val = PredictorTrain(X_test, y_test) # this function takes test features and labels

train_loader = torch.utils.data.DataLoader(data_train, batch_size=64)
val_loader = torch.utils.data.DataLoader(data_val, batch_size = 64)

In [52]:
from model_rnn import NextWordPredictor
from model_rnn import *

In [53]:
from model_rnn import training_loop

In [54]:
model = NextWordPredictor(rnn_size=100, vocab_size=len(total_vocabulary), embedding_matrix=embedding_matrix)

In [55]:
print(model)

NextWordPredictor(
  (rnn): RNN(4, 100, batch_first=True)
  (fc_logits): Linear(in_features=100, out_features=8482, bias=True)
)


In [56]:
model_doc = training_loop(model,10, train_loader) #training the model
output_probs_doc = model_doc.forward(X_train) # generate outputs and evaluate the trained model
outputs_doc = evaluate( model_doc,val_loader)

Epoch 1: loss 8.115064733132506
Epoch 2: loss 8.106919239161574
Epoch 3: loss 8.106873661610976
Epoch 4: loss 8.106887794336068
Epoch 5: loss 8.106881582316365
Epoch 6: loss 8.106902733850886
Epoch 7: loss 8.10687248960716
Epoch 8: loss 8.106882178766131
Epoch 9: loss 8.106865393667958
Epoch 10: loss 8.106945276145142


In [57]:
outputs_doc.shape

torch.Size([16, 8482])

In [58]:
def get_prediction(total_vocabulary,output):
  highest_value = torch.max(output)
  position = torch.where(output==highest_value)
  if len(position[0]) >0:

    return total_vocabulary[position[0].item()]
  return None

for output in outputs_doc:
  print(get_prediction(total_vocabulary,output))



















## Saving the model

In [59]:
# Saving the model
torch.save(model.state_dict(), '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 37/weights/rnn_arabic_documents_weights.pth')

## Looking at class distribution to understand the predictions

In [60]:
# Understanding how classes are distributed

from collections import Counter
counted_numbers = Counter(targets_doc)

# Sort the items in descending order based on their counts
sorted_numbers = sorted(counted_numbers.items(), key=lambda x: x[1], reverse=True)
sorted_numbers = sorted_numbers[0:5]

# Print the counts
for number, count in sorted_numbers:
    print(f"{number}: {count} times, which corresponds to token: ", total_vocabulary[number])

0: 372408 times, which corresponds to token:  
1908: 793 times, which corresponds to token:  ،
6216: 683 times, which corresponds to token:  في
1602: 533 times, which corresponds to token:  .
7140: 414 times, which corresponds to token:  من
