Question Variation Generation:

In [None]:
import pandas as pd
import os
import yaml
from dotenv import load_dotenv
from utils.common import read_yaml
from utils.llm import LargeLanguageModel
import random

def _extract_variations(response_text):
    """Return the non-empty, stripped lines of the first fenced block in a reply.

    If the reply contains no ``` fence at all, the whole reply is treated as
    the list of variations instead of raising.
    """
    _, fence, after_fence = response_text.partition('```')
    if fence:
        # Drop the remainder of the opening-fence line (empty, or a language
        # tag such as "text") so it is not mistaken for a variation.
        _, newline, remainder = after_fence.partition('\n')
        if newline:
            after_fence = remainder
        body = after_fence.partition('```')[0]
    else:
        body = response_text

    # Blank lines (e.g. a trailing newline before/after the closing fence)
    # are not variations and would otherwise pollute the dataset.
    return [line.strip() for line in body.splitlines() if line.strip()]


def llm_retriever(llm, question, variation_number):
    """Ask the LLM for variations of *question* and return them as a list.

    The prompt template and system message are read from
    './prompts/generate_variation_prompt.yml'; the template is formatted with
    ``query`` and ``number_variations``.

    Args:
        llm: Model wrapper passed through to ``generate_variations``.
        question: The original question to rephrase.
        variation_number: How many variations the prompt asks for.

    Returns:
        A list of variation strings, one per non-empty line of the fenced
        block in the model reply (or of the whole reply when no fence is
        present).

    Note:
        Assumes ``generate_variations`` returns a plain string -- the parsing
        relies on string methods. TODO confirm against the runnable's output
        type.
    """
    variation_prompt = read_yaml('./prompts/generate_variation_prompt.yml')
    system_message = variation_prompt['system_message']
    prompt = variation_prompt['prompt'].format(
        query=question, number_variations=variation_number
    )

    response_text = generate_variations(llm, prompt, system_message)
    return _extract_variations(response_text)

def generate_variations(llm, prompt, system_message):
    """Run the LLM once and return whatever its runnable produces.

    Args:
        llm: Object exposing an ``llm_runnable`` with an ``invoke`` method.
        prompt: The fully formatted user prompt.
        system_message: The system message sent alongside the prompt.

    Returns:
        The value returned by ``llm.llm_runnable.invoke``.
    """
    payload = {'system_message': system_message, 'prompt': prompt}
    return llm.llm_runnable.invoke(payload)

def read_input_file(file_path):
    """Load a CSV or XLSX file into a DataFrame, picking the reader by extension.

    The extension check is case-insensitive, and *file_path* may be a string
    or any ``os.PathLike`` object.

    Args:
        file_path: Path to a '.csv' or '.xlsx' file.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` or
        ``pd.read_excel``.

    Raises:
        ValueError: If the path ends with neither '.csv' nor '.xlsx'.
    """
    # Normalise only for the extension test; the reader gets the original path.
    normalized = os.fspath(file_path).lower()
    if normalized.endswith('.csv'):
        return pd.read_csv(file_path)
    if normalized.endswith('.xlsx'):
        return pd.read_excel(file_path)
    raise ValueError("Unsupported file format.")

def main():
    """Generate question variations and write train/test splits to Excel.

    Steps: load './.env.development', read the YAML model config whose path is
    in the 'model' environment variable, build the LLM, read the questions
    from 'test_input.xlsx', request variations per question, split each
    question's variations 80/20 (in order) and save the two sets to
    'train_set.xlsx' and 'test_set.xlsx'.
    """
    load_dotenv('./.env.development')

    # The 'model' environment variable points at the YAML model configuration.
    with open(os.environ['model'], 'r') as config_file:
        model_config = yaml.safe_load(config_file)
    llm = LargeLanguageModel(**model_config)

    # Input sheet is expected to provide 'Questions' and 'Table_Description'.
    source_path = "test_input.xlsx"  # Update input file
    questions_data = read_input_file(source_path)

    variation_number = 5  # Adjust the number of variations to generate

    train_pairs = []
    test_pairs = []
    for _, record in questions_data.iterrows():
        source_question = record['Questions']
        description = record['Table_Description']
        candidates = llm_retriever(llm, source_question, variation_number)

        # First 80% of the returned variations go to train, the rest to test.
        cut = int(0.8 * len(candidates))
        train_pairs.extend((description, text) for text in candidates[:cut])
        test_pairs.extend((description, text) for text in candidates[cut:])

    # Each pair becomes one row: the table description and one variation.
    pair_columns = ['Table_Description', 'Positive']
    df_train_set = pd.DataFrame(train_pairs, columns=pair_columns)
    df_test_set = pd.DataFrame(test_pairs, columns=pair_columns)

    train_path = "train_set.xlsx"  # Update output file path for train_set
    test_path = "test_set.xlsx"  # Update output file path for test_set
    df_train_set.to_excel(train_path, index=False)
    df_test_set.to_excel(test_path, index=False)

    print(f"Variations saved to {train_path} (train_set) and {test_path} (test_set)")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Making Triplet Dataset:

In [None]:
import pandas as pd

# Load the (Table_Description, Positive) pairs from the training workbook.
input_file = "train_set.xlsx"
df = pd.read_excel(input_file)

# Map each Table_Description to the list of Positive questions that share it.
positives_grouped = df.groupby('Table_Description')['Positive']
positive_dict = positives_grouped.apply(list).to_dict()

def get_negative(row, positives_by_table=None):
    """Return every Positive question that belongs to a different table.

    Args:
        row: Mapping-like row providing 'Table_Description'.
        positives_by_table: Optional mapping of table description to its list
            of positive questions. Defaults to the module-level
            ``positive_dict`` so ``df.apply(get_negative, axis=1)`` keeps
            working unchanged.

    Returns:
        A flat list of the questions of all other table descriptions, in the
        mapping's iteration order; empty when no other description exists.
    """
    if positives_by_table is None:
        positives_by_table = positive_dict

    own_table = row['Table_Description']
    return [
        question
        for table, questions in positives_by_table.items()
        if table != own_table
        for question in questions
    ]

# Attach to every row the list of questions that belong to other tables.
df['Negative'] = df.apply(get_negative, axis=1)

# One output row per (Positive, Negative) combination.
df_exploded = df.explode('Negative')

# Keep only the triplet columns and renumber the rows from zero.
triplet_columns = ['Table_Description', 'Positive', 'Negative']
result_df = df_exploded[triplet_columns].reset_index(drop=True)

# Write the triplets to a new workbook.
output_file = "output_test2.xlsx"
result_df.to_excel(output_file, index=False)
print(f"Result saved to {output_file}")
