In [1]:
import json
import pandas as pd
import pickle

In [2]:
# Lift pandas' display limits so frames render in full: all rows, all columns,
# and untruncated cell text (None disables each limit).
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)


In [3]:
# #Reset options to default if needed
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')
# pd.reset_option('display.max_colwidth')

In [4]:


# Load the CUAD JSON data
file_path = 'input_data/CUAD_v1.json'  # Update with your actual file path
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    cuad_data = json.load(f)

# Column order of the flattened table (one row per answer, or one row per
# question when the question has no answers).
row_columns = [
    'document_id', 'document_title',
    'paragraph_id', 'paragraph_text',
    'question_id', 'question_text', 'is_impossible',
    'answer_id', 'answer_text', 'answer_start',
]

# List to hold the combined data
combined_data = []

# Walk documents -> paragraphs -> questions -> answers, building hierarchical
# 1-based ids of the form doc_1_para_1_q_1_a_1 along the way.
for doc_idx, entry in enumerate(cuad_data['data'], start=1):
    document_id = f"doc_{doc_idx}"
    doc_title = entry.get('title', 'N/A')  # 'N/A' when the entry has no title

    for para_idx, paragraph in enumerate(entry['paragraphs'], start=1):
        paragraph_id = f"{document_id}_para_{para_idx}"
        paragraph_text = paragraph['context']

        for qa_idx, qa in enumerate(paragraph['qas'], start=1):
            question_id = f"{paragraph_id}_q_{qa_idx}"

            # Fields shared by every row emitted for this question
            question_fields = {
                'document_id': document_id,
                'document_title': doc_title,
                'paragraph_id': paragraph_id,
                'paragraph_text': paragraph_text,
                'question_id': question_id,
                'question_text': qa['question'],
                'is_impossible': qa['is_impossible'],
            }

            answers = qa['answers']
            if answers:
                # One row per answer
                for ans_idx, answer in enumerate(answers, start=1):
                    combined_data.append({
                        **question_fields,
                        'answer_id': f"{question_id}_a_{ans_idx}",
                        'answer_text': answer['text'],
                        'answer_start': answer['answer_start'],
                    })
            else:
                # If no answers are present, still include the question
                combined_data.append({
                    **question_fields,
                    'answer_id': None,
                    'answer_text': None,
                    'answer_start': None,
                })

# Convert the combined data into a Pandas DataFrame. Passing the columns
# explicitly keeps the expected schema even if no rows were produced, so later
# column lookups do not fail on an empty frame.
combined_df = pd.DataFrame(combined_data, columns=row_columns)

# # Save to a CSV file
# combined_df.to_csv('cuad_combined.csv', index=False)

# Optionally, print the head of the DataFrame to verify
# print(combined_df.head())


### filtering data by document name

In [None]:
import os

# Folder path
folder_path = 'topic_embeddings'

# For every CSV file in the folder, drop the last two dot-separated parts of
# its name (the 'csv' suffix and the part just before it); dots earlier in the
# name are kept.
csv_files = []
for entry_name in os.listdir(folder_path):
    if not entry_name.endswith('.csv'):
        continue
    name_parts = entry_name.split('.')
    csv_files.append('.'.join(name_parts[:-2]))

# Print the resulting list of names
print(csv_files)


### generating document list available to query

In [None]:
import os

# Folder whose CSV files determine the documents available to query
folder_path = 'topic_embeddings'

# Strip only the trailing '.csv' from each CSV file name (dots earlier in the
# name are kept). os.listdir returns entries in arbitrary order, so sort the
# names to make the saved list reproducible across runs and machines.
document_list = sorted(
    '.'.join(file.split('.')[:-1])
    for file in os.listdir(folder_path)
    if file.endswith('.csv')
)

# Print the list of document names
print(document_list)

# Convert the list to a single-column DataFrame
document_df = pd.DataFrame(document_list, columns=['file_name'])

# Save the DataFrame to a CSV file
output_file = 'document_list_available_to_query.csv'
document_df.to_csv(output_file, index=False)

In [7]:
def filter_by_document_titles(df, document_titles):
    """Return the rows of ``df`` whose 'document_title' is in ``document_titles``.

    Args:
        df: DataFrame with a 'document_title' column.
        document_titles: Collection of titles to keep.

    Returns:
        The matching rows of ``df``, with their original index labels.
    """
    title_matches = df['document_title'].isin(document_titles)
    return df[title_matches]

In [8]:
# Titles to keep: by default, the names collected in `csv_files`.
# The commented-out lists below are hand-picked alternatives.
document_titles_to_filter = csv_files
#document_titles_to_filter =['AIRSPANNETWORKSINC_04_11_2000-EX-10.5-Distributor Agreement','BEYONDCOMCORP_08_03_2000-EX-10.2-CO-HOSTING AGREEMENT','Columbia Laboratories, (Bermuda) Ltd. - AMEND NO. 2 TO MANUFACTURING AND SUPPLY AGREEMENT.PDF','ENERGYXXILTD_05_08_2015-EX-10.13-Transportation AGREEMENT.PDF','MERCATAINC_03_09_2000-EX-10.21-SPONSORSHIP AGREEMENT.PDF','TALLGRASSENERGY,LP_02_20_2020-EX-99.26-JOINT FILING AGREEMENT.PDF','UNITEDNATIONALBANCORP_03_03_1999-EX-99-Outsourcing Agreement with the BISYS Group, Inc..PDF','VERTICALNETINC_04_01_2002-EX-10.19-MAINTENANCE AND SUPPORT AGREEMENT.PDF']
#document_titles_to_filter=['ALLISONTRANSMISSIONHOLDINGSINC_12_15_2014-EX-99.1-COOPERATION AGREEMENT','DeltathreeInc_19991102_S-1A_EX-10.19_6227850_EX-10.19_Co-Branding Agreement_ Service Agreement']

# Narrow the combined table down to the selected documents
combined_df = filter_by_document_titles(
    df=combined_df,
    document_titles=document_titles_to_filter,
)

In [9]:
# combined_df.head()

### generating random sample questions

In [None]:
# Keep only answerable questions (rows whose `is_impossible` flag is False)
combined_df = combined_df[combined_df['is_impossible'] == False]

# Columns exported for both the full set and the random sample
question_columns = ['document_title', 'question_text', 'answer_text']
full_sample = combined_df[question_columns]

# Randomly select up to 10 rows; random_state fixes the seed so the draw is
# repeatable. The size is capped at the number of available rows because
# `sample` raises ValueError when asked for more rows than exist.
sample_size = min(10, len(full_sample))
random_sample = full_sample.sample(sample_size, random_state=5)

# Keep both selections under the names later cells use
random_df = pd.DataFrame(random_sample)
full_df = pd.DataFrame(full_sample)

# Save both DataFrames to CSV files
random_df.to_csv('random_questions.csv', index=False)
full_df.to_csv('full_questions.csv', index=False)

print("Random sample of questions saved to 'random_questions.csv'")


In [None]:
# Preview the first rows of the random question sample
random_df.head()

In [None]:
# Save the random question sample DataFrame as a pickle file
pickle_file_path = 'random_question_dataframe.pkl'

# Serialize the random sample DataFrame to disk in binary mode
with open(pickle_file_path, 'wb') as pickle_out:
    pickle.dump(random_df, pickle_out)

saved_message = f"Processed text saved to {pickle_file_path}"
print(saved_message)

In [None]:
# Save the full question DataFrame as a pickle file
pickle_file_path = 'full_question_dataframe.pkl'
with open(pickle_file_path, 'wb') as file:
    # Dump full_df here: the file is named for the full question set, and
    # random_df is already written to 'random_question_dataframe.pkl'.
    pickle.dump(full_df, file)

print(f"Processed text saved to {pickle_file_path}")