In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# from transformers import *
from transformers import default_data_collator, Trainer
from transformers import AutoTokenizer, TrainingArguments,AutoModelForQuestionAnswering
import tensorflow as tf
# import collection
from datasets import Dataset
import os
import math


  from .autonotebook import tqdm as notebook_tqdm


In [70]:

import spacy
from spacy.lang.hi import Hindi, STOP_WORDS
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nlp = Hindi()

In [4]:
train = pd.read_csv('/kaggle/input/chaii-filter-qa/train.csv')
train.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language
0,416091aeb,विषाणु अकोशिकीय अतिसूक्ष्म जीव हैं जो केवल जीव...,सन १८८६ में किसने बताया कि तम्बाकू में मोजेक र...,एडोल्फ मेयर,935,hindi
1,9d274ae3c,फ्लोरीन एक रासायनिक तत्व है। यह आवर्त सारणी (p...,फ्लोरीन की परमाणु संख्या क्या है?,9,166,hindi
2,da7397c5e,सीऐटल (अंग्रेजी: Seattle) अमेरिका के वाशिंगटन ...,सीटल शहर कहाँ स्थित है?,अमेरिका के वाशिंगटन राज्य,26,hindi
3,661880e43,सूर्य अथवा सूरज सौरमंडल के केन्द्र में स्थित ए...,पृथ्वी को सूर्य की परिक्रमा करने में कितने दिन...,28 दिनों,2762,hindi
4,3e3a2bed4,"दिल्ली नगर निगम एक शहर व नगर निगम है, जो दिल्ल...","दिल्ली नगर निगम, दिल्ली के कितने जिलों में कार...",कुल नौ जिलों,51,hindi


In [5]:
test = pd.read_csv('/kaggle/input/chaii-filter-qa/test.csv')
test = test.dropna()
test.head()

Unnamed: 0,id,context,question,language
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi


In [106]:
def keyword_extraction(question, n=2):
    doc = nlp(question)  # Specify Hindi language
    tokens = [str(token) for token in doc if str(token) not in STOP_WORDS]

    named_ents = [entity.text for entity in doc.ents if entity.text not in STOP_WORDS]
    for ent in named_ents:
        if ent not in tokens:
            tokens.append(str(ent))
    
    for i in range(len(tokens) - n + 1):
        ngram = " ".join(tokens[i:i + n])
        if ngram not in tokens:
            tokens.append(ngram)

    return tokens

def tokenize_and_lemmatize_question(question):
    doc = nlp(question)
    tokens = [str(token) for token in doc if str(token) not in STOP_WORDS]
    return tokens

def tokenize_and_lemmatize_documents(documents):
    preprocessed_sentences = []
    sentences = []
    
    for document in documents:
        doc = nlp(document)
        for sentence in doc.sents:
            lemmatized_tokens = [str(token) for token in sentence if str(token) not in STOP_WORDS]
            lemmatized_sentence = " ".join(lemmatized_tokens)
            preprocessed_sentences.append(lemmatized_sentence)
            sentences.append(str(sentence))
    
    return preprocessed_sentences, sentences

def expand_query_with_synonyms(tokens):
    expanded_tokens = set(tokens)

    for token in tokens:
        synonyms = wordnet.synsets(token, lang='hin')  # Specify Hindi language for WordNet
        synonyms_and_lemmas = [lemma.name() for syn in synonyms for lemma in syn.lemmas('hin')]
        expanded_tokens.update(synonyms_and_lemmas)

    return list(expanded_tokens)

def calculate_cosine_similarity(token, title):
    try:
        corpus = [title, token]
        tfidf_vectorizer = TfidfVectorizer(min_df=1)
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
            
        # Check if the vocabulary is empty
        if tfidf_matrix.shape[1] == 0:
            return 0.0

        cosine_similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
        
        if title == token:
            return 1.0
        return cosine_similarity_score
    
    except ValueError as e:
        return 0.0

def preprocess_query(question, dataset):
    print('Preprocessing query...')
    keywords = keyword_extraction(question)
    
    data_dict = {'id': [], 'question': [], 'similarity_score': []}
    for index, row in dataset.iterrows():
        for token in keywords:
            if token in row['context']:
                similarity = calculate_cosine_similarity(token, row['context'])
                if similarity > 0:
                    data_dict['id'].append(row['id'])
                    data_dict['question'].append(row['question'])
                    data_dict['similarity_score'].append(similarity)

    df = pd.DataFrame(data_dict)
    top_df = df.sort_values(by='similarity_score', ascending=False)

    print(len(data_dict['id']), 'hits were found ...')

    return top_df.head(5)

def bm25_score(query_tokens, documents, k1=1.5, b=0.75):
    avg_doc_length = np.mean([len(doc.split()) for doc in documents])
    scores = []
    document_tokens, sentences = tokenize_and_lemmatize_documents(documents)

    for doc_tokens in document_tokens:
        doc_length = len(doc_tokens.split())
        doc_score = 0
        for word in query_tokens:
            word_count = doc_tokens.split().count(word)
            idf = math.log((len(document_tokens) - document_tokens.count(word) + 0.5) / (document_tokens.count(word) + 0.5) + 1.0)
            doc_score += (idf * (word_count * (k1 + 1))) / (word_count + k1 * (1 - b + b * (doc_length / avg_doc_length)))

        scores.append(doc_score)

    if not scores:
        print('no context found.')
        exit(1)

    ranked_docs = [(sentences[i], score) for i, score in enumerate(scores)]
    ranked_docs.sort(key=lambda x: x[1], reverse=True)

    ranked = pd.DataFrame({
        'id': range(1, len(ranked_docs) + 1),
        'sentences': [item[0] for item in ranked_docs],
        'BM25_Score': [item[1] for item in ranked_docs]
    })

    return ranked[:5]

In [126]:
question = test.iloc[0]['question']
preprocess_query(question, train)

Unnamed: 0,id,document,similarity score
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",1.0
1,9d274ae3c,फ्लोरीन एक रासायनिक तत्व है। यह आवर्त सारणी (p...,0.56
2,da7397c5e,सीऐटल (अंग्रेजी: Seattle) अमेरिका के वाशिंगटन ...,0.44
3,661880e43,सूर्य अथवा सूरज सौरमंडल के केन्द्र में स्थित ए...,0.21
4,3e3a2bed4,"दिल्ली नगर निगम एक शहर व नगर निगम है, जो दिल्ल...",0.15


In [127]:
query_tokens = preprocess_query(question)
scores = bm25_score(query_tokens, train['context'])
scores


Unnamed: 0,id,sentences,BM25_Score
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",0.95
1,9d274ae3c,फ्लोरीन एक रासायनिक तत्व है। यह आवर्त सारणी (p...,0.56
2,da7397c5e,सीऐटल (अंग्रेजी: Seattle) अमेरिका के वाशिंगटन ...,0.44
3,661880e43,सूर्य अथवा सूरज सौरमंडल के केन्द्र में स्थित ए...,0.21
4,3e3a2bed4,"दिल्ली नगर निगम एक शहर व नगर निगम है, जो दिल्ल...",0.15


In [4]:
tokenizer = AutoTokenizer.from_pretrained("deepset/xlm-roberta-large-squad2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [5]:
batch_size = 4
max_length = 384 
doc_stride = 128
pad_on_right = tokenizer.padding_side == "right"

In [6]:
def prepare_train_features(examples):
    examples["question"] = [q.lstrip() for q in examples["question"]]
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [7]:
def convert_answers(r):
    start = r[0]
    text = r[1]
    return {
        'answer_start': [start],
        'text': [text]
    }

train = train.sample(frac=1, random_state=42)
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

df_train = train[:-64].reset_index(drop=True)
df_valid = train[-64:].reset_index(drop=True)

train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)

In [8]:
train_dataset[0]

{'id': 'b5ef4590a',
 'context': 'बुर्ज ख़लीफ़ा दुबई में आठ अरब डॉलर की लागत से छह साल में निर्मित ८२८ मीटर ऊँची १६८ मंज़िला दुनिया की सबसे ऊँची इमारत है (जनवरी, सन् २०१० में)। इसका लोकार्पण ४ जनवरी, २०१० को भव्य उद्घाटन समारोह के साथ किया गया। इसमें तैराकी का स्थान, खरीदारी की व्यवस्था, दफ़्तर, सिनेमा घर सहित सारी सुविधाएँ मौजूद हैं। इसकी ७६ वीं मंजिल पर एक मस्जिद भी बनायी गयी है। इसे ९६ किलोमीटर दूर से भी साफ़-साफ़ देखा जा सकता है। इसमें लगायी गयी लिफ़्ट दुनिया की सबसे तेज़ चलने वाली लिफ़्ट है। “ऐट द टॉप” नामक एक दरवाज़े के बाहर अवलोकन डेक, 124 वीं मंजिल पर, 5 जनवरी 2010 पर खुला। यह 452 मीटर (1,483 फुट) पर, दुनिया में तीसरे सर्वोच्च अवलोकन डेक और दुनिया में दूसरा सबसे बड़ा दरवाज़े के बाहर अवलोकन डेक है।\nनिर्माण विशेषता सन्दर्भ\nबाहरी\xa0कड़ियाँ\nNo URL found. Please specify a URL here or add one to Wikidata.\nश्रेणी: गगनचुम्बी इमारतें\nश्रेणी: सर्वोच्च गगनचुम्बी',
 'question': 'बुर्ज खलीफा कहाँ स्थित है?',
 'answer_text': 'दुबई',
 'answer_start': 14,
 'language': 'hindi',
 'answers':

In [9]:
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
%env WANDB_DISABLED=True
args = TrainingArguments(
    f"chaii-qa",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


env: WANDB_DISABLED=True


In [11]:
data_collator = default_data_collator

model = AutoModelForQuestionAnswering.from_pretrained('deepset/xlm-roberta-large-squad2')

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
trainer.train()
trainer.save_model("chaii-bert-trained")

Epoch,Training Loss,Validation Loss
0,No log,0.207534


In [13]:
def prepare_validation_features(examples):
    examples["question"] = [q.lstrip() for q in examples["question"]]
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [14]:
validation_features = valid_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=valid_dataset.column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
len(validation_features)

843

In [16]:
valid_dataset

Dataset({
    features: ['id', 'context', 'question', 'answer_text', 'answer_start', 'language', 'answers'],
    num_rows: 64
})

In [17]:
valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])
valid_feats_small

  0%|          | 0/843 [00:00<?, ?ex/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 843
})

Prediction

In [18]:
raw_predictions = trainer.predict(valid_feats_small)

In [19]:
raw_predictions[0]

(array([[ 2.9103127, -8.093126 , -9.636034 , ..., -7.019672 , -7.768854 ,
         -9.207588 ],
        [ 6.607559 , -8.536687 , -9.589536 , ..., -9.304174 , -8.241798 ,
         -9.710736 ],
        [ 6.562418 , -8.753011 , -9.534787 , ..., -8.432135 , -8.885662 ,
         -9.622976 ],
        ...,
        [ 6.9062653, -8.552932 , -9.4033375, ..., -9.4056   , -8.552918 ,
         -9.720844 ],
        [ 6.9285574, -8.664242 , -9.423424 , ..., -8.626818 , -9.493884 ,
         -9.448549 ],
        [ 6.757641 , -8.547923 , -9.486352 , ..., -9.918131 , -9.918131 ,
         -9.918131 ]], dtype=float32),
 array([[  1.7459182, -10.649314 , -10.019013 , ...,  -9.928308 ,
          -9.473663 ,  -9.013246 ],
        [  6.032118 , -10.928494 , -10.468949 , ...,  -8.739706 ,
          -9.394252 , -10.06396  ],
        [  5.9281254, -10.928891 , -10.409733 , ...,  -8.3001585,
          -8.813304 ,  -9.809639 ],
        ...,
        [  6.1030817, -10.411725 , -10.731021 , ...,  -8.779191 ,
         

In [20]:
max_answer_length = 30

In [21]:
import collections

examples = valid_dataset
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

In [22]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        predictions[example["id"]] = best_answer["text"]

    return predictions

In [23]:
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)

Post-processing 64 example predictions split into 843 features.


  0%|          | 0/64 [00:00<?, ?it/s]

In [24]:
prediction = pd.DataFrame([{"questions":x1['question'], "pred_answer":x2} for x1, x2 in zip(valid_dataset, [i for i in final_predictions.values()])])

In [25]:
prediction

Unnamed: 0,questions,pred_answer
0,जंतु जगत में मधुमक्खी किसके संघ का कीट है?,आर्थोपोडा
1,भारतीय नौसेना का गठन कब हुआ?,1613 ई.
2,तबलीग़ी जमात की स्थापना किस वर्ष में हुई थी?,1927
3,भारत का प्रथम महिला विश्विद्यालय कौन सा है?,एस एन डी टी
4,तानसेन किसके साथ वृन्दावन संगीत की शिक्षा ग्रह...,स्वामी हरिदास जी
...,...,...
59,मुंबई की आधिकारिक भाषा क्या है?,मराठी
60,CPU का पूर्ण प्रपत्र क्या है?,सेंट्रल प्रोसेसिंग यूनिट
61,कैमरा का अविष्कार किसने किया था?,इब्न-अल-हज़ैन
62,वैज्ञानिक रेने देकार्त की राष्ट्रीयता क्या थी?,फ़्रांसिसी
