# Important resources

https://github.com/kk7nc/Text_Classification

https://medium.com/analytics-vidhya/bengali-text-visualization-using-word2vec-211e2ed9fa30

# Nomenclature

In [None]:
'''
List of important variables:

  train_df
  val_df
  english_train, arabic_train, bengali_train, indonesian_train
  english_val, arabic_val, bengali_val, indonesian_val
  english_test, arabic_test, bengali_test, indonesian_test
  english_qa_train, arabic_qa_train, bengali_qa_train, indonesian_qa_train
  english_qa_val, arabic_qa_val, bengali_qa_val, indonesian_qa_val

'''


'\nList of important variables:\n\n  train_df\n  val_df\n  english_train, arabic_train, bengali_train, indonesian_train\n  english_val, arabic_val, bengali_val, indonesian_val\n  english_test, arabic_test, bengali_test, indonesian_test\n  english_qa_train, arabic_qa_train, bengali_qa_train, indonesian_qa_train\n  english_qa_val, arabic_qa_val, bengali_qa_val, indonesian_qa_val\n\n'

In [None]:
# Magic used in the notebooks :

# automatically re-load imported modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Mounting Google Drive

In [None]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except:
    IN_COLAB = False

Mounted at /content/drive


# Imports and installations

In [None]:
%%capture
if IN_COLAB:
  !pip install nltk
  !pip install transformers
  !pip install translators
  !pip install datasets
  !pip install langdetect
  !python -m spacy download en_core_web_sm
  !python -m spacy download en_core_web_trf
  !pip install bpemb

In [None]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import string
import spacy
from tqdm import tqdm
import translators as ts
from langdetect import detect
import random
import abc
import math
import collections

# nltk imports
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english')) # english stopwords


# bengali tokenizer
from transformers import AutoTokenizer, AutoModel
import matplotlib.font_manager as fm # bengali characters for matplotlib

# general roberta tokenizer
xlm_tokeniser = AutoTokenizer.from_pretrained("xlm-roberta-base")

# loading the vectors for classification task (week 38)
import gensim.downloader
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

# classification task - lr (week 37)
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA # https://builtin.com/machine-learning/pca-in-python
from sklearn.metrics import accuracy_score

# pytorch
import torch

# BPE
from bpemb import BPEmb

Using region District of Columbia server backend.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



## Loading general dataset, train and val splits

In [None]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train = load_dataset("copenlu/answerable_tydiqa", split='train')
val = load_dataset("copenlu/answerable_tydiqa", split='validation')

Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116067 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13325 [00:00<?, ? examples/s]

## Turning train and val datasets into dataframes

In [None]:
train_df = pd.DataFrame(train)
val_df = pd.DataFrame(val)

In [None]:
train_df.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [None]:
train_df.columns

Index(['question_text', 'document_title', 'language', 'annotations',
       'document_plaintext', 'document_url'],
      dtype='object')

## Filtering train and val dataframes by language

In [None]:
def get_df_lang(df, lang):
  return df[df['language'] == lang]

english_train = get_df_lang(train_df, 'english')
bengali_train = get_df_lang(train_df, 'bengali')
arabic_train = get_df_lang(train_df, 'arabic')
indonesian_train = get_df_lang(train_df, 'indonesian')

english_val = get_df_lang(val_df, 'english')
bengali_val = get_df_lang(val_df, 'bengali')
arabic_val = get_df_lang(val_df, 'arabic')
indonesian_val = get_df_lang(val_df, 'indonesian')


In [None]:
# this can be deleted, as the english corpus is just for checking steps

## Statistics for train and val of the three languages

In [None]:
bengali_train.describe()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
count,4779,4779,4779,4779,4779,4779
unique,2376,1126,1,2093,3891,1126
top,ব্রহ্মপুত্র নদের মোট দৈর্ঘ্য কত ?,কুরআন,bengali,"{'answer_start': [-1], 'answer_text': ['']}",ইসলামী ইতিহাস অনুসারে দীর্ঘ তেইশ বছর ধরে খণ্ড ...,https://bn.wikipedia.org/wiki/%E0%A6%95%E0%A7%...
freq,4,52,4779,2389,14,52


In [None]:
# unanswerable - ['']
english_train['annotations']

26        {'answer_start': [159], 'answer_text': ['1920s']}
43        {'answer_start': [610], 'answer_text': ['Sully...
112       {'answer_start': [129], 'answer_text': ['disco...
123       {'answer_start': [88], 'answer_text': ['Sejong...
125       {'answer_start': [0], 'answer_text': ['Grassho...
                                ...                        
116000          {'answer_start': [-1], 'answer_text': ['']}
116012          {'answer_start': [-1], 'answer_text': ['']}
116027          {'answer_start': [-1], 'answer_text': ['']}
116055          {'answer_start': [-1], 'answer_text': ['']}
116066          {'answer_start': [-1], 'answer_text': ['']}
Name: annotations, Length: 7389, dtype: object

## Part b)

For each of the languages Arabic, Bengali and Indonesian, report the 5 most common words in the documents from the training set (question_text). Then report the 5 most common words in the questions from the training set (document_plaintext). What do you observe?

## Retrieving only the document text (document_plaintext) and the question text (question_text)

In [None]:
bengali_texts = bengali_train["document_plaintext"].values
arabic_texts = arabic_train["document_plaintext"].values
indonesian_texts = indonesian_train["document_plaintext"].values

In [None]:
bengali_questions = bengali_train["question_text"].values
arabic_questions = arabic_train["question_text"].values
indonesian_questions = indonesian_train["question_text"].values

In [None]:
# this can be deleted, as english is not required, but it is good for checking if the steps are right
english_texts = english_train['document_plaintext'].values
english_questions = english_train['question_text'].values
english_questions

array(['When was quantum field theory developed?',
       'Who was the first Nobel prize winner for Literature?',
       'When is the dialectical method used?', ...,
       'Who was costume designer for the first Star Wars movie?',
       'Who developed the first thermonuclear weapon?',
       'What is the population of Mahwah, NJ?'], dtype=object)

# Retrieving questions, answers and answerable or not 

## Building questions and answers dataframe, with features (0 or 1)

In [None]:
def build_qa_df(df):
  # 1. retrieve answers
  answers = np.array([item['answer_text'][0] for item in df['annotations'].values])
  for i in range(len(answers)):
      if answers[i] == '':
          answers[i] = 'unanswered' #  marking unanswered answers (['']) as unanswered
  # 2. retrieve questions
  questions = df['question_text']
  # 3. retrieve doc plaintext
  document_text = df['document_plaintext']
  # 4. build questions and answers dataframe
  dataframe = pd.DataFrame({'question': questions, 'answer': answers, 'document_text': document_text})
  dataframe['y'] = [0 if answer == 'unanswered' else 1 for answer in dataframe['answer']]
  return dataframe

english_qa_train = build_qa_df(english_train)
english_qa_val = build_qa_df(english_val)

arabic_qa_train = build_qa_df(arabic_train)
arabic_qa_val = build_qa_df(arabic_val)

bengali_qa_train = build_qa_df(bengali_train)
bengali_qa_val = build_qa_df(bengali_val)

indonesian_qa_train = build_qa_df(indonesian_train)
indonesian_qa_val = build_qa_df(indonesian_val)

In [None]:
# Count the number of unanswered answers
unanswered_questions = english_qa_train['answer'].value_counts().get('unanswered') # 3693
answered_questions = len(english_qa_train) - unanswered_questions # 3696

# Week 37: Building Models


Let k be the number of members in your group (k ∈ {1,2,3}). Implement k different language models for each of the three languages, separately for the questions and the documents (total k×3×2 language models), using the training data. Evaluate each of them on the validation data, report their performance and discuss the results.

In [None]:
# slide retrieved from the class material

def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)
enforce_reproducibility()

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
device

device(type='cpu')

### Model 2: RNN

Motivation of the model choice: __

https://towardsdatascience.com/pytorch-basics-how-to-train-your-neural-net-intro-to-rnn-cb6ebc594677

Maybe BERT (https://mccormickml.com/2019/07/22/BERT-fine-tuning/#3-tokenization--input-formatting)


# Week 38

Let k be the number of members in your group. Implement and train k different supervised classifiers for each of the three languages separately, using the training data for that language. The classifiers must only use the document and question as input. Evaluate the classifiers on the respective validation sets, report and analyse the performance for each language and compare the scores across languages.
The classifiers can use linguistic/lexical features, e.g., bag-of-words, n-gram counts, overlaps of words between question and document, etc.; word embed- dings, or word/sentence representations from neural language models. You can, for example, find pretrained Transformer language models for different languages, trained with different language objectives, and fine-tuned for different downstream tasks, fromHuggingFace.9 You can also train or fine-tune your own neural language models on the dataset. Motivate your choice of features and classifier.

Model 1.1: Logistic Regression with BPE features (Carolina)

Model 1.2: Logistic Regression with TF-IDF features (Carolina)

Model 2: __

Model 3: __

#### Defining a datasets dictionary for later loading the results easier

In [None]:
datasets = {
    'arabic': {'train': arabic_qa_train, 'test': arabic_qa_val},
    'bengali': {'train': bengali_qa_train, 'test': bengali_qa_val},
    'indonesian': {'train': indonesian_qa_train, 'test': indonesian_qa_val},
}

## Model 1.1: Logistic Regression with BPE Features

Motivation of the choice:  ____

BPE features on both questions and document text with posterior PCA

Note: Do I need to preprocess before classifying?

### Loading BPE models

In [None]:
bpemb_en = BPEmb(lang='en', dim=100, vs=25000) # Load english model with 25k word-pieces. Matrix of 100 x 1000 (dim x vocab_size)
bpemb_ar = BPEmb(lang='ar', dim=100, vs=25000) # arabic model
bpemb_ben = BPEmb(lang='bn', dim=100, vs=25000) # bengali model
bpemb_ind = BPEmb(lang='id', dim=100, vs=25000) # indonesian model

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs25000.model


100%|██████████| 661443/661443 [00:00<00:00, 1480664.54B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9477142/9477142 [00:00<00:00, 10256310.82B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.model


100%|██████████| 742254/742254 [00:00<00:00, 1388318.47B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9491724/9491724 [00:00<00:00, 10532210.37B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.model


100%|██████████| 863227/863227 [00:00<00:00, 1924803.62B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9517491/9517491 [00:00<00:00, 10418879.87B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.model


100%|██████████| 650018/650018 [00:00<00:00, 1458366.08B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9465922/9465922 [00:00<00:00, 10406536.21B/s]


In [None]:
class LrWithBPE:
    def __init__(self, bpemb_model):
        self.bpemb_model = bpemb_model

    def get_bpemb_features(self, dataset):
        X_question = [self.bpemb_model.embed(x).mean(0) for x in dataset.values[:, 0]]  # Assuming 1st column has questions
        X_doc_text = [self.bpemb_model.embed(x).mean(0) for x in dataset.values[:, 2]]  # Assuming 3rd column has document text
        X = np.column_stack([X_question, X_doc_text])
        y = list(dataset.values[:, 3])  # Assuming 4th column is 'y'
        return X, y

    def train(self, pca_variance, X_train, y_train, X_test, y_test):
        pca = PCA(pca_variance)
        pca.fit(X_train)

        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)

        classifier = LogisticRegression(penalty='l2', max_iter=1000)
        classifier.fit(X_train_pca, y_train)
        y_pred = classifier.predict(X_test_pca)

        return y_pred

    def calculate_accuracy(self, y_test, y_pred):
      accuracy = accuracy_score(y_test, y_pred)
      return accuracy

    def create_dataframe_with_preds(self, dataset, y_pred):
        dataset = pd.DataFrame(dataset, columns=["question", "answer", "document_text", "y"])
        dataset['y_pred'] = y_pred
        return dataset

In [33]:
lr_bpe_model = LrWithBPE(bpemb_en) # setting up the model

all_preds_bpe = {}
# Train and evaluate the models for each language
for lang, data in datasets.items():
    X_train, y_train = lr_bpe_model.get_bpemb_features(data['train'])
    X_test, y_test = lr_bpe_model.get_bpemb_features(data['test'])

    pca_variance = 0.8
    preds_bpe = lr_bpe_model.train(pca_variance, X_train, y_train, X_test, y_test)
    preds_bpe_df = lr_bpe_model.create_dataframe_with_preds(data['test'], preds_bpe)

    # Store the dataframe in the dictionary with the language name as the key
    all_preds_bpe[lang] = preds_bpe_df

    accuracy = lr_bpe_model.calculate_accuracy(y_test, preds_bpe)
    print(f"Accuracy ({lang}): {accuracy}")

Accuracy (arabic): 0.5715036803364879
Accuracy (bengali): 0.5714285714285714
Accuracy (indonesian): 0.5726280436607892


In [34]:
all_preds_bpe['bengali']

Unnamed: 0,question,answer,document_text,y,y_pred
16,পশ্চিম ভারতের মহারাষ্ট্র রাজ্যের মুম্বাই শহরে ...,স্যার জর্জ সিডেনহাম ক্লার্ক,"দিল্লী দর্বার তৈরীর পূর্বে, গেটওয়ে অব ইন্ডিয়...",1,0
103,ভারতীয় বাঙালি কথাসাহিত্যিক মহাশ্বেতা দেবীর প্...,ঝাঁসির রানি,মহাশ্বেতা দেবী ১০০টিরও বেশি উপন্যাস এবং ২০টিরও...,1,1
155,কলকাতার মেট্রো প্রথম কবে তৈরী হয় ?,১৯৮৪ সালে,প্রকল্পের কাজ শুরু হলেও ১৯৭৭-৭৮ সালে অর্থের জো...,1,0
180,বিখ্যাত জ্যোতির্বিজ্ঞানী নিকোলাউস কোপের্নিকুসে...,১৪৭৩ সালের ১৮ ফেব্রুয়ারী,নিকোলাস কোপারনিকাস (পলিশ ভাষায় মিকলজ কোপারনিক...,1,1
298,কত সালে সর্বপ্রথম সৌদি আরবে সর্বোচ্চ সরকারী ধর...,"২৯শে আগস্ট, ১৯৭১ সালে",সৌদি আরবের বাদশাহ ফয়সাল ইবনে আব্দুল আজিজ ২৯শে...,1,1
...,...,...,...,...,...
12963,পশ্চিম ভারতের মহারাষ্ট্র রাজ্যের মুম্বাই শহরে ...,unanswered,বিষয়শ্রেণী:মুম্বাইয়ের স্থাপত্য\nবিষয়শ্রেণী:...,0,0
13058,মালয়েশিয়ান রাজনীতিবিদ ও বিরোধীদলীয় নেতা আনো...,unanswered,আনোয়ার ইব্রাহীম ১৯৪৭ সালে মালয়েশিয়ার উত্তরা...,0,1
13136,অভিজিৎ রায়ের ব্লগের নাম কী ?,unanswered,"অভিজিৎ রায় ইন্টারনেট, ম্যাগাজিন এবং দৈনিক পত্...",0,1
13145,আবুল হাসনাত মোহাম্মদ কামারুজ্জামান কত সালে মার...,unanswered,২০০৮ সালের ২৮শে আগস্ট বাংলাদেশের সুপ্রীম কোর্ট...,0,1


## Model 1.2: Logistic Regression with TF-IDF 

#### Lr with TF-IDF Class

In [35]:
class LrwithTfidf():
  def __init__(self):
    pass

  def get_tfidf_features(self, dataset):
    vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
    X = vectorizer.fit_transform(dataset.values[:, 0]+ ' ' + dataset.values[:,2]) # taking the questions and doc text as features
    y = dataset.values[:, 3].astype('int')
    return X, y

  def train_classifier(self, X_train, y_train, X_test, y_test):
    lr = LogisticRegression(penalty='l2', max_iter=1000, multi_class='multinomial')
    lr.fit(X_train, y_train) # fitting the model to the train features ('question' and 'document_text') and the train labels ('y')
    y_pred = lr.predict(X_test)
    return y_pred

  def calculate_accuracy(self, y_test, y_pred):
      accuracy = accuracy_score(y_test, y_pred)
      return accuracy

  def create_dataframe_with_preds(self, dataset, y_pred):
        dataset = pd.DataFrame(dataset, columns=["question", "answer", "document_text", "y"])
        dataset['y_pred'] = y_pred
        return dataset

In [36]:
# Initialize the model
lr_tfidf_model = LrwithTfidf()

all_preds_tfidf = {}
# Train and evaluate the models for each language
for lang, data in datasets.items():
    X_train, y_train = lr_tfidf_model.get_tfidf_features(data['train'])
    X_test, y_test = lr_tfidf_model.get_tfidf_features(data['test'])

    preds_tfidf = lr_tfidf_model.train_classifier(X_train, y_train, X_test, y_test)
    preds_tfidf_df = lr_tfidf_model.create_dataframe_with_preds(data['test'], preds_tfidf)

    # Store the dataframe in the dictionary with the language name as the key
    all_preds_tfidf[lang] = preds_tfidf_df

    accuracy = lr_tfidf_model.calculate_accuracy(y_test, preds_tfidf)
    print(f"Accuracy ({lang}): {accuracy}")

Accuracy (arabic): 0.6230283911671924
Accuracy (bengali): 0.5625
Accuracy (indonesian): 0.5415617128463476
