In [1]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [3]:
# to convert given dataset into proper qna format
import json

def extract_json_objects(text):
    """
    Extract JSON objects from a string by balancing curly braces.
    """
    json_objects = []
    start = 0
    depth = 0
    in_string = False

    for i, char in enumerate(text):
        if char == '"' and (i == 0 or text[i-1] != '\\'):
            in_string = not in_string
        if not in_string:
            if char == '{':
                if depth == 0:
                    start = i
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    try:
                        json_obj = json.loads(text[start:i+1])
                        json_objects.append(json_obj)
                    except json.JSONDecodeError:
                        continue
    return json_objects

def clean_json_file(input_filename, output_filename):
    """
    Clean a JSON file and save the valid JSON objects to a new file.
    """
    with open(input_filename, 'r', encoding='utf-8') as file:
        content = file.read()

    json_objects = extract_json_objects(content)

    with open(output_filename, 'w', encoding='utf-8') as file:
        for obj in json_objects:
            json.dump(obj, file)
            file.write('\n')

def load_cleaned_json(filename):
    """
    Load cleaned JSON objects from a file.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return data

input_filename = '/content/news.article.json'
cleaned_filename = 'cleaned_dataset.jsonl'

# Clean the JSON file
clean_json_file(input_filename, cleaned_filename)

# Load the cleaned JSON data
data = load_cleaned_json(cleaned_filename)

# Convert the cleaned data to the required QA format
qa_dataset = []

for article in data:
    context = article.get("articleBody", "")

    if not isinstance(context, str):
        context = str(context)

    title = article.get("title", "")
    date_modified = article.get("dateModified", "")
    source = article.get("source", "")

    if not isinstance(title, str):
        title = str(title)
    if not isinstance(date_modified, str):
        date_modified = str(date_modified)
    if not isinstance(source, str):
        source = str(source)

    questions_and_answers = [
        {
            "question": "What is the title of the article?",
            "answers": [
                {
                    "text": title,
                    "answer_start": context.find(title)
                }
            ]
        },
        {
            "question": "When was the article modified?",
            "answers": [
                {
                    "text": date_modified,
                    "answer_start": context.find(date_modified) if date_modified in context else -1
                }
            ]
        },
        {
            "question": "What is the source of the article?",
            "answers": [
                {
                    "text": source,
                    "answer_start": context.find(source) if source in context else -1
                }
            ]
        }
    ]

    qa_dataset.append({
        "context": context,
        "qas": questions_and_answers
    })

# Save the new dataset
with open('qa_dataset.json', 'w', encoding='utf-8') as file:
    json.dump(qa_dataset, file, indent=2)

print("Conversion to QA format completed successfully.")


Conversion to QA format completed successfully.


In [4]:
import json
import uuid

def transform_qa_dataset(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)

    transformed_data = []
    for entry in data:
        context = entry['context']
        qas = entry['qas']
        new_qas = []

        for qa in qas:
            question = qa['question']
            answers = qa['answers']
            new_answers = []

            for answer in answers:
                text = answer['text']
                answer_start = answer['answer_start']
                if answer_start != -1:
                    new_answers.append({
                        "text": text,
                        "answer_start": answer_start
                    })

            # Generate a unique ID for each question
            question_id = str(uuid.uuid4())
            new_qas.append({
                "id": question_id,
                "is_impossible": len(new_answers) == 0,
                "question": question,
                "answers": new_answers
            })

        transformed_data.append({
            "context": context,
            "qas": new_qas
        })

    with open(output_file, 'w') as f:
        json.dump(transformed_data, f, indent=2)

# Example usage
input_file = 'qa_dataset.json'
output_file = 'transformed_qa_dataset.json'
transform_qa_dataset(input_file, output_file)


In [5]:
# generate test and train files
import json
import random

# Load the QA dataset
with open('transformed_qa_dataset.json', 'r', encoding='utf-8') as file:
    qa_dataset = json.load(file)

# Shuffle the dataset to ensure random distribution
random.shuffle(qa_dataset)

# Define the split ratio (e.g., 80% training, 20% testing)
split_ratio = 0.8
split_index = int(len(qa_dataset) * split_ratio)

# Split the dataset into training and testing sets
train_data = qa_dataset[:split_index]
test_data = qa_dataset[split_index:]

# Save the training set to train.json
with open('train.json', 'w', encoding='utf-8') as train_file:
    json.dump(train_data, train_file, indent=2)

# Save the testing set to test.json
with open('test.json', 'w', encoding='utf-8') as test_file:
    json.dump(test_data, test_file, indent=2)

print("Training and testing datasets have been created successfully.")


Training and testing datasets have been created successfully.


In [6]:
import json
with open(r"/content/train.json", "r") as read_file:
    train = json.load(read_file)

train

[{'context': 'OMAHA — Days after he and his family drew threats from Republican populists for resisting Ohio Rep. Jim Jordan’s push to be House Speaker, U.S. Rep. Don Bacon supported a consensus GOP pick who, like Jordan, voted to reject the 2020 election results.\n\nCongressional Republicans unanimously backed Louisiana Rep. Mike Johnson on Wednesday to lead the House following three weeks of party infighting over whether Speaker Kevin McCarthy should have lost his leadership post and who should replace him. Nebraska’s three Republican House members — Reps. Bacon, Mike Flood and Adrian Smith — voted with their GOP colleagues to elect Johnson. Flood and Smith had also backed Jordan. Bacon drew national attention for opposing him.\n\nStatements from Nebraska’s House members Rep. Mike Flood, 1st District: “Congratulations to Rep. Mike Johnson on his election to be the next Speaker of the House. He is a principled, constitutional conservative who will do a fantastic job leading House Repu

In [7]:
import json
with open(r"/content/test.json", "r") as read_file:
    test = json.load(read_file)

train

[{'context': 'OMAHA — Days after he and his family drew threats from Republican populists for resisting Ohio Rep. Jim Jordan’s push to be House Speaker, U.S. Rep. Don Bacon supported a consensus GOP pick who, like Jordan, voted to reject the 2020 election results.\n\nCongressional Republicans unanimously backed Louisiana Rep. Mike Johnson on Wednesday to lead the House following three weeks of party infighting over whether Speaker Kevin McCarthy should have lost his leadership post and who should replace him. Nebraska’s three Republican House members — Reps. Bacon, Mike Flood and Adrian Smith — voted with their GOP colleagues to elect Johnson. Flood and Smith had also backed Jordan. Bacon drew national attention for opposing him.\n\nStatements from Nebraska’s House members Rep. Mike Flood, 1st District: “Congratulations to Rep. Mike Johnson on his election to be the next Speaker of the House. He is a principled, constitutional conservative who will do a fantastic job leading House Repu

In [8]:
!pip3 install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/316.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simple

In [9]:

import logging


from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [10]:
model_type="bert"
model_name= "bert-base-cased"
if model_type == "bert":
    model_name = "bert-base-cased"

elif model_type == "roberta":
    model_name = "roberta-base"

elif model_type == "distilbert":
    model_name = "distilbert-base-cased"

elif model_type == "distilroberta":
    model_type = "roberta"
    model_name = "distilroberta-base"

elif model_type == "electra-base":
    model_type = "electra"
    model_name = "google/electra-base-discriminator"

elif model_type == "electra-small":
    model_type = "electra"
    model_name = "google/electra-small-discriminator"

elif model_type == "xlnet":
    model_name = "xlnet-base-cased"

In [11]:
# Configure the model
model_args = QuestionAnsweringArgs()
model_args.train_batch_size = 16
model_args.evaluate_during_training = True
model_args.n_best_size=3
model_args.num_train_epochs=5


In [12]:
### Advanced Methodology
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    "output_dir": f"outputs/{model_type}",
    "best_model_dir": f"outputs/{model_type}/best_model",
    "evaluate_during_training": True,
    "max_seq_length": 128,
    "num_train_epochs": 5,
    "evaluate_during_training_steps": 1000,
    "wandb_project": "Question Answer Application",
    "wandb_kwargs": {"name": model_name},
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size":3,
    # "use_early_stopping": True,
    # "early_stopping_metric": "mcc",
    # "n_gpu": 2,
    # "manual_seed": 4,
    # "use_multiprocessing": False,
    "train_batch_size": 128,
    "eval_batch_size": 64,
    # "config": {
    #     "output_hidden_states": True
    # }
}

In [None]:
# Initialize the question answering model
model = QuestionAnsweringModel(
    model_type,
    model_name,
    args=train_args,
)

# Train the model with evaluation during training
train_data = "train.json"
eval_data = "test.json"  # Specify evaluation data
model.train_model(eval_data, eval_data=eval_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

  self.pid = os.fork()
convert squad examples to features:   0%|          | 0/636 [00:00<?, ?it/s]

In [None]:
result, texts = model.eval_model(test)