## Imports and installations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except:
    IN_COLAB = False

Mounted at /content/drive


In [3]:
%%capture
if IN_COLAB:
  !pip install nltk
  !pip install datasets
  !pip install bpemb

In [4]:
import pandas as pd
import datasets
from datasets import load_dataset, load_from_disk
from tqdm import tqdm
import numpy as np

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Setting torch device
import torch
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

# BPE
from bpemb import BPEmb

## Local imports

In [5]:
%cd '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 38'

/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 38


In [18]:
from utils import *
from utils import preprocess_text_column
from models import *
from models import LrwithTfidf

## Loading and splitting datasets into train and validation by language

In [7]:
train = load_from_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/train')
val = load_from_disk('/content/drive/My Drive/MASTERS KU/AUTUMN 2023/NLP/tydiqa/validation')

In [8]:
train_df = pd.DataFrame(train)
val_df = pd.DataFrame(val)

In [9]:
bengali_train = get_df_lang(train_df, 'bengali')
arabic_train = get_df_lang(train_df, 'arabic')
indonesian_train = get_df_lang(train_df, 'indonesian')

bengali_val = get_df_lang(val_df, 'bengali')
arabic_val = get_df_lang(val_df, 'arabic')
indonesian_val = get_df_lang(val_df, 'indonesian')

## Building a dataframe with questions, answers, the document text and a binary variable indicating if the question has been answered or not

In [10]:
arabic_qa_train = build_qa_df(arabic_train)
arabic_qa_val = build_qa_df(arabic_val)

bengali_qa_train = build_qa_df(bengali_train)
bengali_qa_val = build_qa_df(bengali_val)

indonesian_qa_train = build_qa_df(indonesian_train)
indonesian_qa_val = build_qa_df(indonesian_val)

## Preprocessing text

In [11]:
# If this cell is ran, all logistic regressions will be done with preprocessed text

arabic_qa_train = preprocess_text_column(arabic_qa_train, column_name='question', lang='arabic')
arabic_qa_train = preprocess_text_column(arabic_qa_train, column_name='document_text', lang='arabic')

bengali_qa_train = preprocess_text_column(bengali_qa_train, column_name='question', lang='bengali')
bengali_qa_train = preprocess_text_column(bengali_qa_train, column_name='document_text', lang='bengali')

indonesian_qa_train = preprocess_text_column(indonesian_qa_train, column_name='question', lang='indonesian')
indonesian_qa_train = preprocess_text_column(indonesian_qa_train, column_name='document_text', lang='indonesian')

arabic_qa_val = preprocess_text_column(arabic_qa_val, column_name='question', lang='arabic')
arabic_qa_val = preprocess_text_column(arabic_qa_val, column_name='document_text', lang='arabic')

bengali_qa_val = preprocess_text_column(bengali_qa_val, column_name='question', lang='bengali')
bengali_qa_val = preprocess_text_column(bengali_qa_val, column_name='document_text', lang='bengali')

indonesian_qa_val = preprocess_text_column(indonesian_qa_val, column_name='question', lang='indonesian')
indonesian_qa_val = preprocess_text_column(indonesian_qa_val, column_name='document_text', lang='indonesian')

In [12]:
datasets = {
    'arabic': {'train': arabic_qa_train, 'test': arabic_qa_val},
    'bengali': {'train': bengali_qa_train, 'test': bengali_qa_val},
    'indonesian': {'train': indonesian_qa_train, 'test': indonesian_qa_val},
}

In [13]:
# load models with 25k word-pieces. Matrix (dim x vocab_size)
bpemb_ar = BPEmb(lang='ar', dim=100, vs=25000) # arabic model
bpemb_ben = BPEmb(lang='bn', dim=100, vs=25000) # bengali model
bpemb_ind = BPEmb(lang='id', dim=100, vs=25000) # indonesian model

downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.model


100%|██████████| 742254/742254 [00:00<00:00, 1656818.90B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9491724/9491724 [00:00<00:00, 10690821.48B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.model


100%|██████████| 863227/863227 [00:00<00:00, 1942976.92B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9517491/9517491 [00:00<00:00, 10572737.02B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.model


100%|██████████| 650018/650018 [00:00<00:00, 1463304.37B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9465922/9465922 [00:00<00:00, 10419968.14B/s]


In [14]:
arabic_train.loc[102606, 'document_plaintext']

'.\n.\n.\n.\n.'

In [15]:
arabic_qa_train.loc[102606, 'document_text'] = 'no text' # removing empty string as it casues problems when performing PCA

## Models

### Logistic Regression with BPEmb Embeddings

In [16]:
lr_bpe_model = LrWithBPE(bpemb_ar) # setting up the model

all_preds_bpe = {}
# Train and evaluate the models for each language
for lang, data in datasets.items():
    X_train, y_train = lr_bpe_model.get_bpemb_features(data['train']) # X_train contains nan
    X_test, y_test = lr_bpe_model.get_bpemb_features(data['test'])

    pca_variance = 0.8
    preds_bpe = lr_bpe_model.train(pca_variance, X_train, y_train, X_test, y_test)
    preds_bpe_df = lr_bpe_model.create_dataframe_with_preds(data['test'], preds_bpe)

    # Store the dataframe in the dictionary with the language name as the key
    all_preds_bpe[lang] = preds_bpe_df

    accuracy = lr_bpe_model.calculate_accuracy(y_test, preds_bpe)
    report = lr_bpe_model.create_report(y_test, preds_bpe)
    print(f"Accuracy and classification report ({lang}): {accuracy}")
    print(report)

Accuracy and classification report (arabic): 0.6251314405888538
              precision    recall  f1-score   support

           0       0.64      0.59      0.61       951
           1       0.62      0.66      0.64       951

    accuracy                           0.63      1902
   macro avg       0.63      0.63      0.62      1902
weighted avg       0.63      0.63      0.62      1902

Accuracy and classification report (bengali): 0.6339285714285714
              precision    recall  f1-score   support

           0       0.67      0.52      0.59       112
           1       0.61      0.75      0.67       112

    accuracy                           0.63       224
   macro avg       0.64      0.63      0.63       224
weighted avg       0.64      0.63      0.63       224

Accuracy and classification report (indonesian): 0.5919395465994962
              precision    recall  f1-score   support

           0       0.61      0.50      0.55       594
           1       0.58      0.68      0

### Logistic Regression with TfIdf

In [20]:
# Initialize the model.
# Error with PCA: PCA does not support sparse input. See TruncatedSVD for a possible alternative.
# TODO: Need to find the right TruncatedSVD number of components
lr_tfidf_model = LrwithTfidf()
all_preds_tfidf = {}
# Train and evaluate the models for each language
for lang, data in datasets.items():
    X_train, y_train = lr_tfidf_model.get_tfidf_features(data['train'])
    X_test, y_test = lr_tfidf_model.get_tfidf_features(data['test'])

    n_components = 10
    preds_tfidf = lr_tfidf_model.train_classifier(n_components, X_train, y_train, X_test, y_test)
    preds_tfidf_df = lr_tfidf_model.create_dataframe_with_preds(data['test'], preds_tfidf)

    # Store the dataframe in the dictionary with the language name as the key
    all_preds_tfidf[lang] = preds_tfidf_df

    accuracy = lr_tfidf_model.calculate_accuracy(y_test, preds_tfidf)
    report = lr_tfidf_model.create_report(y_test, preds_tfidf)
    print(f"Accuracy ({lang}): {accuracy}")
    print(report)

Accuracy (arabic): 0.5420609884332281
              precision    recall  f1-score   support

           0       0.53      0.84      0.65       951
           1       0.60      0.25      0.35       951

    accuracy                           0.54      1902
   macro avg       0.56      0.54      0.50      1902
weighted avg       0.56      0.54      0.50      1902

Accuracy (bengali): 0.5
              precision    recall  f1-score   support

           0       0.50      0.99      0.66       112
           1       0.50      0.01      0.02       112

    accuracy                           0.50       224
   macro avg       0.50      0.50      0.34       224
weighted avg       0.50      0.50      0.34       224

Accuracy (indonesian): 0.4987405541561713
              precision    recall  f1-score   support

           0       0.50      0.87      0.63       594
           1       0.50      0.13      0.21       597

    accuracy                           0.50      1191
   macro avg       0.50 