Question Variation Generation:

In [9]:
import pandas as pd
import os
import yaml
from dotenv import load_dotenv
from utils.common import read_yaml
from utils.llm import LargeLanguageModel
import random

def llm_retriever(llm, question, variation_number):
    """Ask the LLM for paraphrases of ``question`` and return them as a list.

    The system message and prompt template are read from
    ``./prompts/generate_variation_prompt.yml``; the template is formatted
    with ``query`` and ``number_variations``.

    Args:
        llm: Model wrapper forwarded unchanged to ``generate_variations``.
        question: The question to generate variations for.
        variation_number: Number of variations requested in the prompt. The
            returned list is not truncated or padded to this size.

    Returns:
        list[str]: The stripped, non-empty lines of the model's answer.
    """
    variation_prompt = read_yaml('./prompts/generate_variation_prompt.yml')
    system_message = variation_prompt['system_message']
    prompt_template = variation_prompt['prompt']

    prompt = prompt_template.format(query=question, number_variations=variation_number)

    # Generate variations for the current question
    raw_response = generate_variations(llm, prompt, system_message)
    return _extract_variations(raw_response)


def _extract_variations(raw_response):
    """Return the non-empty lines of the first ``` fenced block in the text."""
    import re

    # Accept an optional language tag after the opening fence and a missing
    # closing fence (generation cut off before the model closed the block).
    fenced = re.search(r"```[^\n]*\n(.*?)(?:```|\Z)", raw_response, flags=re.DOTALL)

    # Without any fence, fall back to the whole response instead of raising
    # IndexError: results are only saved at the end of a run, so one
    # oddly formatted answer should not discard everything generated so far.
    body = fenced.group(1) if fenced else raw_response

    # Blank or whitespace-only lines are not variations.
    return [line.strip() for line in body.splitlines() if line.strip()]

# Run LLM
def generate_variations(llm, prompt, system_message):
    """Invoke ``llm.llm_runnable`` with the given messages and return its result.

    Args:
        llm: Object exposing an ``llm_runnable`` attribute with an ``invoke`` method.
        prompt: Fully formatted user prompt.
        system_message: System message sent alongside the prompt.

    Returns:
        Whatever ``llm.llm_runnable.invoke`` returns, unmodified.
    """
    payload = {'system_message': system_message, 'prompt': prompt}
    return llm.llm_runnable.invoke(payload)

def read_input_file(file_path):
    """Load a tabular input file into a pandas DataFrame.

    The format is chosen from the file name's suffix, compared
    case-insensitively so that names such as ``DATA.CSV`` or ``Report.XLSX``
    are accepted.

    Args:
        file_path: Path to a ``.csv`` or ``.xlsx`` file, as ``str`` or
            ``os.PathLike``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.

    Raises:
        ValueError: If the name ends in neither ``.csv`` nor ``.xlsx``.
    """
    path = os.fspath(file_path)
    # Lower-case only for the comparison; the file is opened by its real name.
    lowered = path.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(path)
    if lowered.endswith('.xlsx'):
        return pd.read_excel(path)
    raise ValueError(f"Unsupported file format. Expected .csv or .xlsx, got: {path!r}")

def main():
    """Generate question variations with the LLM and save train/test splits.

    Steps, in order: load ``./.env.development``; read the model config from
    the YAML file named by the ``model`` environment variable; build the LLM;
    read ``test_input.xlsx``; request variations for every ``Questions`` entry;
    put the first 80% of each question's variations in the train set and the
    rest in the test set; write ``train_set.xlsx`` and ``test_set.xlsx``.
    """
    # Environment first: the model config path comes from it
    load_dotenv('./.env.development')

    with open(os.environ['model'], 'r') as config_file:
        model_config = yaml.safe_load(config_file)
    llm = LargeLanguageModel(**model_config)

    # Questions together with the table description each one belongs to
    input_file = "test_input.xlsx"  # Update input file
    questions_data = read_input_file(input_file)

    variation_number = 5  # Adjust the number of variations to generate

    # (Table_Description, Positive) pairs collected across all questions
    train_rows = []
    test_rows = []

    for _, record in questions_data.iterrows():
        question = record['Questions']
        table_description = record['Table_Description']
        variations = llm_retriever(llm, question, variation_number)

        # Leading 80% of the variations go to train, the remainder to test
        cut = int(0.8 * len(variations))
        train_rows.extend((table_description, text) for text in variations[:cut])
        test_rows.extend((table_description, text) for text in variations[cut:])

    column_names = ['Table_Description', 'Positive']
    output_file_train_set = "train_set.xlsx"  # Update output file path for train_set
    output_file_test_set = "test_set.xlsx"  # Update output file path for test_set

    # Write each split to its own workbook
    for rows, destination in (
        (train_rows, output_file_train_set),
        (test_rows, output_file_test_set),
    ):
        pd.DataFrame(rows, columns=column_names).to_excel(destination, index=False)

    print(f"Variations saved to {output_file_train_set} (train_set) and {output_file_test_set} (test_set)")

# Run the generation pipeline only when executed as a script / notebook cell,
# not when this code is imported as a module.
if __name__ == "__main__":
    main()


                offload_kqv was transferred to model_kwargs.
                Please confirm that offload_kqv is what you intended.
llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from models/openchat-3.5-0106.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = openchat_openchat-3.5-0106
llama_model_loader: - kv   2:                       llama.context_length u32              = 8192
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - k

 ```
What is the total amount due for cash transactions from last month?
Please provide the outstanding balance for cash sales during the previous month.
Can you give me the remaining cash payment balance for the last month's sales?
Could you tell me the current outstanding cash sales balance from the past month?
Kindly inform me of the total cash sales balance that is still pending from last month.
```


llama_print_timings:        load time =    1329.74 ms
llama_print_timings:      sample time =      25.25 ms /    86 runs   (    0.29 ms per token,  3405.40 tokens per second)
llama_print_timings: prompt eval time =   29445.06 ms /   170 tokens (  173.21 ms per token,     5.77 tokens per second)
llama_print_timings:        eval time =  113505.89 ms /    85 runs   ( 1335.36 ms per token,     0.75 tokens per second)
llama_print_timings:       total time =  143396.21 ms /   255 tokens
Llama.generate: prefix-match hit


 ```
What is the total amount of cash sales not yet paid this month?
How much money from cash sales remains unpaid for this month?
Can you provide the sum of cash sales that are still outstanding for this month?
Could you tell me the current balance of cash sales that haven't been settled for this month?
What is the total amount of cash transactions pending payment this month?
```


llama_print_timings:        load time =    1329.74 ms
llama_print_timings:      sample time =      24.10 ms /    86 runs   (    0.28 ms per token,  3568.46 tokens per second)
llama_print_timings: prompt eval time =   24354.53 ms /   140 tokens (  173.96 ms per token,     5.75 tokens per second)
llama_print_timings:        eval time =  113184.15 ms /    85 runs   ( 1331.58 ms per token,     0.75 tokens per second)
llama_print_timings:       total time =  137924.57 ms /   225 tokens
Llama.generate: prefix-match hit


 ```
Which company has the largest invoice amount?
What is the company with the highest invoice total?
Which business has the greatest invoiced sum?
Which enterprise possesses the most significant invoice figure?
Which organization boasts the highest invoice value?
```Variations saved to train_set.xlsx (train_set) and test_set.xlsx (test_set)



llama_print_timings:        load time =    1329.74 ms
llama_print_timings:      sample time =      12.59 ms /    61 runs   (    0.21 ms per token,  4846.66 tokens per second)
llama_print_timings: prompt eval time =   24442.12 ms /   140 tokens (  174.59 ms per token,     5.73 tokens per second)
llama_print_timings:        eval time =   81871.56 ms /    60 runs   ( 1364.53 ms per token,     0.73 tokens per second)
llama_print_timings:       total time =  106591.69 ms /   200 tokens


Making Triplet Dataset:

In [10]:
import pandas as pd

# Load the (Table_Description, Positive) pairs from the Excel workbook
input_file = "train_set.xlsx"
df = pd.read_excel(input_file)

# Map every Table_Description to the list of Positive values recorded for it
positive_dict = {
    description: list(positives)
    for description, positives in df.groupby('Table_Description')['Positive']
}

# Create a function to get the Negative values based on the Positive values and Table_Description
def get_negative(row, positives_by_description=None):
    """Collect the positives of every table description other than the row's own.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a
            ``'Table_Description'`` entry.
        positives_by_description: Optional mapping from table description to
            its list of positive values. When omitted, the module-level
            ``positive_dict`` is used, so ``df.apply(get_negative, axis=1)``
            keeps working unchanged.

    Returns:
        list: All values listed under descriptions different from the row's
        description, in the mapping's iteration order. Empty when no other
        description exists.
    """
    lookup = positive_dict if positives_by_description is None else positives_by_description
    table_desc = row['Table_Description']
    return [val for desc, vals in lookup.items() if desc != table_desc for val in vals]

# Give each row the list of negatives computed by get_negative
df['Negative'] = df.apply(get_negative, axis=1)

# Expand the lists so that every negative gets a row of its own
df_exploded = df.explode('Negative')

# Keep the three triplet columns in a fixed order and renumber the rows from 0
result_df = df_exploded.loc[:, ['Table_Description', 'Positive', 'Negative']].reset_index(drop=True)

# Write the triplets to a new Excel workbook
output_file = "output_test2.xlsx"
result_df.to_excel(output_file, index=False)
print(f"Result saved to {output_file}")


Result saved to output_test2.xlsx
