In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import json

# Directory of the checkpoint from which both the tokenizer and the model are loaded.
model_dir = "/kaggle/input/claim-decomp/Model/checkpoint-1200"

# Load the tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the model and place it on the GPU when one is available (CPU otherwise).
# generate_output() moves its inputs to model.device, so picking the device
# here is enough for generation to run on the accelerator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device)

In [None]:
# Bare expression as the last line of the cell: the notebook displays the
# loaded tokenizer object so its configuration can be inspected.
tokenizer

In [None]:
# Read the training and validation records, then collect the text stored under
# each record's "claim" key, preserving file order.
with open("/kaggle/input/testclaimsnlp/train_claims_quantemp.json") as train_file:
    train_data = json.load(train_file)

with open("/kaggle/input/quantemp-bm25-reranked/val_claims_quantemp_bm25.json") as val_file:
    val_data = json.load(val_file)

training_claims = [record["claim"] for record in train_data]
validation_claims = [record["claim"] for record in val_data]

In [None]:
def generate_output(test_samples, model):
    """Generate decoded text from ``model`` for one prompt or a batch of prompts.

    Args:
        test_samples: A prompt string, or a list of prompt strings.
        model: Model object exposing ``device`` and ``generate``.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode``
        with special tokens skipped.

    Notes:
        Uses the module-level ``tokenizer``. Generation runs with
        ``do_sample=True``, so repeated calls with the same prompt are not
        expected to return the same text unless the RNG is seeded.
    """
    inputs = tokenizer(
        test_samples,
        max_length=128,
        # Make the 128-token cap explicit instead of relying on the
        # tokenizer's implicit fallback when only max_length is supplied.
        truncation=True,
        # Pad to the longest prompt so a list of prompts of different lengths
        # can be stacked into one tensor; a single prompt is left unpadded.
        padding=True,
        return_tensors="pt")

    # Inputs must live on the same device as the model weights.
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        min_length=64,
        max_length=128,
        do_sample=True,
        top_p=0.95,
        top_k=50,
    )
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str

In [None]:
# The datasets are processed one window at a time to prevent timeouts in
# Kaggle; this run covers positions 1000-1499 of both the training and the
# validation claims.
training_subquestions = []
validation_subquestions = []

for claims, collected in (
    (training_claims, training_subquestions),
    (validation_claims, validation_subquestions),
):
    for claim_text in claims[1000:1500]:
        prompt = "decompose the compositional question:" + claim_text
        collected.append(generate_output(prompt, model))

In [None]:
import re

# Index of the first record covered by this run. Both subquestion lists were
# generated from the [1000:1500] slice of their claim lists, so result i
# belongs to record SLICE_START + i in BOTH datasets. (Previously the
# validation results were written to val_data[i], i.e. onto records 0-499.)
SLICE_START = 1000

# Captures the text following "Question N: " up to the next " Answer N:" marker.
_QUESTION_PATTERN = re.compile(r'Question \d+: (.*?)(?= Answer \d+:)')


def _extract_subquestions(generated):
    """Turn one generation result into a ``{"Question i": text}`` dictionary.

    Args:
        generated: The list returned by ``generate_output``; only its first
            element (the decoded string) is parsed.

    Returns:
        dict: Keys ``"Question 0"``, ``"Question 1"``, ... in order of
        appearance, mapped to the stripped question text. Empty when the
        pattern does not match.
    """
    text = generated[0]
    questions = _QUESTION_PATTERN.findall(text)
    return {f"Question {i}": question.strip() for i, question in enumerate(questions)}


for index, generated in enumerate(training_subquestions):
    train_data[SLICE_START + index]['subquestions'] = _extract_subquestions(generated)

for index, generated in enumerate(validation_subquestions):
    val_data[SLICE_START + index]['subquestions'] = _extract_subquestions(generated)

In [None]:
# Serialise both annotated datasets as JSON files in the working directory.
# NOTE(review): 'test_data.json' receives train_data — confirm the file name is intended.
for out_path, payload in (
    ('test_data.json', train_data),
    ('validation_data_subquestions.json', val_data),
):
    with open(out_path, 'w') as sink:
        json.dump(payload, sink)