In [17]:
import json
import os

def extract_features(input_folder, output_file, encoding='utf-8'):
    """Flatten nested question-answer JSON files into one flat list of records.

    Every file in ``input_folder`` whose name ends in ``.json`` is expected to
    hold ``{"data": [{"title": ..., "paragraphs": [{"context": ..., "qas":
    [{"id": ..., "question": ..., "answers": [{"text": ...,
    "answer_start": ...}]}]}]}]}``. One output record is produced per entry in
    ``qas``, carrying the enclosing title and context.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
        output_file: Path of the JSON file to write; it is overwritten.
        encoding: Text encoding used for both reading and writing.

    Raises:
        KeyError: If a required key other than ``context`` is missing.
    """
    records = []
    # os.listdir() returns names in arbitrary order; sorting makes the output
    # reproducible, which matters because later de-duplication keeps whichever
    # record comes first.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(input_folder, filename), 'r', encoding=encoding) as f:
            file_data = json.load(f)
        for item in file_data['data']:
            title = item['title']
            for paragraph in item['paragraphs']:
                # Paragraphs with a missing or empty context are skipped
                # instead of aborting the whole run with a KeyError.
                context = paragraph.get('context')
                if not context:
                    continue
                for qa in paragraph['qas']:
                    records.append({
                        'id': qa['id'],
                        'title': title,
                        'context': context,
                        'question': qa['question'],
                        # Copy only the two fields of interest from each answer.
                        'answers': [
                            {
                                'text': answer['text'],
                                'answer_start': answer['answer_start'],
                            }
                            for answer in qa['answers']
                        ],
                    })
    with open(output_file, 'w', encoding=encoding) as f:
        json.dump(records, f, indent=4)

# Flatten every .json file in the source folder into a single list of QA records.
extract_features('MhaInformatics2trainingdatasetMinistryWise', 'MhaInformatics_merge.json', encoding='utf-8')


In [18]:
import json

def remove_duplicate_questions(input_file, output_file, encoding='utf-8'):
    """Write a copy of a JSON record list with repeated questions removed.

    Records are compared by the exact value of their ``'question'`` field.
    The first record carrying a given question is kept and all later ones are
    dropped; the relative order of the kept records is unchanged.

    Args:
        input_file: Path of a JSON file containing a list of records.
        output_file: Path of the JSON file to write; it is overwritten.
        encoding: Text encoding used for both reading and writing.
    """
    with open(input_file, 'r', encoding=encoding) as src:
        records = json.load(src)

    # A dict keeps insertion order and setdefault() never replaces an existing
    # entry, so each question maps to the first record that used it.
    first_by_question = {}
    for record in records:
        first_by_question.setdefault(record['question'], record)
    deduplicated = list(first_by_question.values())

    with open(output_file, 'w', encoding=encoding) as dst:
        json.dump(deduplicated, dst, indent=4)

# Keep only the first record for each distinct question in the merged file.
remove_duplicate_questions('MhaInformatics_merge.json', 'MhaInformatics_merge_unique.json', encoding='utf-8')


In [None]:
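# Scratch note (not executed): guard against paragraphs that lack a 'context'
# key by using dict.get(), which returns None instead of raising KeyError.
# context = paragraph.get('context')
# if context:
#     ...  # do something with context
# else:
#     ...  # handle the case where context is not found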


In [None]:
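# Scratch note (not executed): alternative guard using an explicit membership
# test. Unlike a truthiness check on .get(), this also accepts an empty-string
# context.
# if 'context' in paragraph:
#     context = paragraph['context']
#     ...  # do something with context
# else:
#     ...  # handle the case where context is not found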


In [43]:
import json
import os

def extract_features(input_folder, output_file, encoding='utf-8'):
    """Collect question-answer records from every ``.json`` file in a folder.

    Each input file must contain a top-level ``'data'`` list whose items have
    a ``'title'`` and a ``'paragraphs'`` list. Each paragraph supplies an
    optional ``'context'`` and a ``'qas'`` list of questions with ``'id'``,
    ``'question'`` and ``'answers'``. Paragraphs whose context is missing or
    empty are skipped. The flattened records are written to ``output_file``
    as an indented JSON list.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
        output_file: Path of the JSON file to write; it is overwritten.
        encoding: Text encoding used for both reading and writing.

    Raises:
        KeyError: If a required key other than ``context`` is missing.
    """
    data = []
    # Sort the listing: os.listdir() gives no ordering guarantee, and the
    # record order decides which duplicate survives a first-occurrence dedup.
    for filename in sorted(os.listdir(input_folder)):
        if filename.endswith('.json'):
            with open(os.path.join(input_folder, filename), 'r', encoding=encoding) as f:
                file_data = json.load(f)
            for item in file_data['data']:
                title = item['title']
                for paragraph in item['paragraphs']:
                    context = paragraph.get('context')
                    if not context:
                        # No usable context: nothing to attach questions to.
                        continue
                    for qa in paragraph['qas']:
                        # Keep only the text and start offset of each answer.
                        answers = [
                            {
                                'text': answer['text'],
                                'answer_start': answer['answer_start'],
                            }
                            for answer in qa['answers']
                        ]
                        data.append({
                            # Read the id inline so the builtin id() is not shadowed.
                            'id': qa['id'],
                            'title': title,
                            'context': context,
                            'question': qa['question'],
                            'answers': answers,
                        })
    with open(output_file, 'w', encoding=encoding) as f:
        json.dump(data, f, indent=4)

# Flatten every .json file in the 'Xa' folder; paragraphs without a context are skipped.
extract_features('Xa', 'Xa_merge.json', encoding='utf-8')


In [44]:
import json

def remove_duplicate_questions(input_file, output_file, encoding='utf-8'):
    """Filter a JSON list of records down to one record per question.

    Two records count as duplicates when their ``'question'`` values are
    equal. Only the earliest record for each question is written out, in the
    order it appeared in the input.

    Args:
        input_file: Path of a JSON file containing a list of records.
        output_file: Path of the JSON file to write; it is overwritten.
        encoding: Text encoding used for both reading and writing.
    """
    with open(input_file, 'r', encoding=encoding) as reader:
        entries = json.load(reader)

    seen = set()
    kept = []
    for entry in entries:
        text = entry['question']
        if text in seen:
            # A record with this question was already kept; drop the repeat.
            continue
        seen.add(text)
        kept.append(entry)

    with open(output_file, 'w', encoding=encoding) as writer:
        json.dump(kept, writer, indent=4)

# Keep only the first record for each distinct question in the 'Xa' merge output.
remove_duplicate_questions('Xa_merge.json', 'Xa_merge_unique.json', encoding='utf-8')


In [45]:
import json
import os

def merge_json_files(input_folder, output_file, encoding='utf-8'):
    """Concatenate the top-level JSON arrays of every ``.json`` file in a folder.

    Files are read in sorted name order so the merged output is reproducible
    (``os.listdir`` itself returns names in arbitrary order).

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
        output_file: Path of the JSON file to write; it is overwritten.
        encoding: Text encoding used for both reading and writing. Defaults
            to UTF-8 rather than the platform locale encoding.

    Raises:
        TypeError: If a file's top-level JSON value is not an array.
    """
    merged_data = []
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.json'):
            continue
        path = os.path.join(input_folder, filename)
        with open(path, 'r', encoding=encoding) as f:
            file_data = json.load(f)
        # list.extend() accepts any iterable: a JSON object would silently
        # contribute its keys and a JSON string its characters. Fail loudly
        # instead of corrupting the merged dataset.
        if not isinstance(file_data, list):
            raise TypeError(
                '{}: expected a top-level JSON array, got {}'.format(
                    path, type(file_data).__name__
                )
            )
        merged_data.extend(file_data)
    with open(output_file, 'w', encoding=encoding) as f:
        json.dump(merged_data, f, indent=4)

# Example usage: concatenate the contents of every .json file in 'XFinal' into one file.
merge_json_files('XFinal', 'XFinal_merge.json')


In [16]:
import json

def remove_empty_answers(input_file, output_file, encoding='utf-8'):
    """Write a copy of a JSON record list without the answerless records.

    A record is kept only when its ``'answers'`` value is truthy. Records
    whose ``'answers'`` is an empty list, ``null``, or absent are dropped.

    Args:
        input_file: Path of a JSON file containing a list of dict records.
        output_file: Path of the JSON file to write; it is overwritten.
        encoding: Text encoding used for both reading and writing. Defaults
            to UTF-8 rather than the platform locale encoding.
    """
    with open(input_file, 'r', encoding=encoding) as f:
        data = json.load(f)
    # .get() treats a record with no 'answers' key as having no answers
    # instead of aborting the whole filter with a KeyError.
    filtered_data = [item for item in data if item.get('answers')]
    with open(output_file, 'w', encoding=encoding) as f:
        json.dump(filtered_data, f, indent=4)

# Example usage: keep only the merged records that have at least one answer.
remove_empty_answers('XFinal_merge.json', 'XFinal_clean.json')
