# Generate invalid_documents column for each query

In [15]:
import pandas as pd
import random

# Load the CSV file
file_path = 'emath_qns_eval_data.csv'
df = pd.read_csv(file_path)

# Ensure 'valid_documents' and 'invalid_documents' columns are strings for processing
df['valid_documents'] = df['valid_documents'].astype(str)
if 'invalid_documents' not in df.columns:
    df['invalid_documents'] = ''

# Extract all unique docIDs from the valid_documents column
all_doc_ids = set()
for docs in df['valid_documents']:
    doc_ids = [doc.strip() for doc in docs.split(';') if doc.strip()]
    all_doc_ids.update(doc_ids)

print(all_doc_ids)


{"'674dcd9fb00b977d048c9306'", "'674dce98b00b977d048c93dc'", "'674dcdccb00b977d048c932a'", "'674dcdb4b00b977d048c931a'", "'674dce23b00b977d048c93b5'", "'674dce19b00b977d048c93a1'", "'674dcd9fb00b977d048c9305'", "'674dcddfb00b977d048c934f'", "'674dcd9db00b977d048c9300'", "'674dcddfb00b977d048c9350'", "'674dcde3b00b977d048c9357'", "'674dcdb4b00b977d048c931b'", "'674dce0fb00b977d048c9395'", "'674dce8db00b977d048c93c7'", "'674dcdd3b00b977d048c9337'", "'674dcdccb00b977d048c932b'", "'674dcd9cb00b977d048c92ff'", "'674dcde4b00b977d048c9358'", "'674dcd9eb00b977d048c9304'", "'674dce93b00b977d048c93d2'", "'674dce1ab00b977d048c93a4'", "'674dce93b00b977d048c93d3'", "'674dcdd3b00b977d048c9338'", "'674dcde1b00b977d048c9352'", "'674dce8eb00b977d048c93ca'", "'674dce25b00b977d048c93b9'", "'674dcdb5b00b977d048c931d'", "'674dce10b00b977d048c9397'", "'674dce0cb00b977d048c938f'", "'674dce18b00b977d048c939f'", "'674dcddeb00b977d048c934d'", "'674dcde3b00b977d048c9356'"}


In [16]:
# Clean and populate the 'invalid_documents' column with random docIDs
for index, row in df.iterrows():
    # Strip whitespace and create a set of valid document IDs
    valid_docs = set(doc.strip() for doc in row['valid_documents'].split(';') if doc.strip())
    print("Valid docs: ", valid_docs)
    
    # Create a pool of invalid document IDs, excluding the valid ones
    invalid_docs_pool = list(all_doc_ids - valid_docs)  # Exclude valid documents from the pool
    print("Invalid docs pool: ", invalid_docs_pool)
    
    # Randomly select a number of invalid documents
    if invalid_docs_pool:  # Ensure there are invalid docs to choose from
        num_invalid_docs = random.randint(1, len(valid_docs)-1)  # Random number of invalid docs, up to the number of valid docs
        invalid_docs = random.sample(invalid_docs_pool, num_invalid_docs)
        df.at[index, 'invalid_documents'] = '; '.join(invalid_docs)
    else:
        # If no invalid docs are available, leave the column empty
        df.at[index, 'invalid_documents'] = ''


# Save the updated dataframe to a new CSV file
output_path = 'emath_qns_eval_data_updated.csv'
df.to_csv(output_path, index=False)

output_path

Valid docs:  {"'674dcdb4b00b977d048c931b'", "'674dce0fb00b977d048c9395'", "'674dce10b00b977d048c9397'", "'674dcde1b00b977d048c9352'", "'674dce93b00b977d048c93d2'", "'674dce93b00b977d048c93d3'"}
Invalid docs pool:  ["'674dcd9fb00b977d048c9306'", "'674dce98b00b977d048c93dc'", "'674dcdccb00b977d048c932a'", "'674dcdb4b00b977d048c931a'", "'674dce23b00b977d048c93b5'", "'674dce19b00b977d048c93a1'", "'674dcd9fb00b977d048c9305'", "'674dcddfb00b977d048c934f'", "'674dcd9db00b977d048c9300'", "'674dcddfb00b977d048c9350'", "'674dcde3b00b977d048c9357'", "'674dce8db00b977d048c93c7'", "'674dcdd3b00b977d048c9337'", "'674dcdccb00b977d048c932b'", "'674dcd9cb00b977d048c92ff'", "'674dcde4b00b977d048c9358'", "'674dcd9eb00b977d048c9304'", "'674dce1ab00b977d048c93a4'", "'674dcdd3b00b977d048c9338'", "'674dce8eb00b977d048c93ca'", "'674dce25b00b977d048c93b9'", "'674dcdb5b00b977d048c931d'", "'674dce0cb00b977d048c938f'", "'674dce18b00b977d048c939f'", "'674dcddeb00b977d048c934d'", "'674dcde3b00b977d048c9356'"]
Valid

'emath_qns_eval_data_updated.csv'

# Verify that docIDs in invalid_documents are not present in valid_documents

In [20]:
df = pd.read_csv('emath_qns_eval_data_updated.csv')

# Validate that "invalid_documents" do not contain any docIDs from "valid_documents"
validation_errors = []

for index, row in df.iterrows():
    valid_docs = set(doc.strip() for doc in row['valid_documents'].split(';') if doc.strip())
    invalid_docs = set(doc.strip() for doc in row['invalid_documents'].split(';') if doc.strip())
    
    # Check if there is any overlap between valid and invalid documents
    if valid_docs & invalid_docs:  # Intersection should be empty
        validation_errors.append((index, valid_docs & invalid_docs))  # Record row index and overlapping docIDs

# Display validation results
if validation_errors:
    print(f"Validation errors found in {len(validation_errors)} rows:")
    for error in validation_errors:
        print(f"Row {error[0]} has overlapping docIDs: {error[1]}")
else:
    print("Validation passed: No overlapping docIDs between valid_documents and invalid_documents.")

Validation passed: No overlapping docIDs between valid_documents and invalid_documents.
