In [None]:
import pandas as pd
import os

In [None]:
# Read all CSV files in the directory
# Raw string: in a normal literal "\R", "\M" and "\D" are invalid escape
# sequences, which Python warns about. The resulting path is unchanged.
directory = r"D:\Repos\My-Digital-Clone\Data"

# sorted() makes the load order reproducible; os.listdir returns entries in
# arbitrary order.
data_frames = [
    pd.read_csv(os.path.join(directory, filename))
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".csv")
]

# pd.concat raises a generic ValueError on an empty list; fail with a message
# that names the directory instead (same exception type).
if not data_frames:
    raise ValueError(f"No .csv files found in {directory!r}")

# Concatenate all data frames
data = pd.concat(data_frames, ignore_index=True)

In [None]:
def augment_data(df, batch_size=5):
    """Expand a question/answer frame into (question, answer, context) rows.

    The expansion happens in three steps:

    1. Rows are processed in consecutive batches of ``batch_size``. Within a
       batch every question is repeated ``batch_size`` times and lined up
       against the batch's answers cycled ``batch_size`` times, so in a full
       batch each question is paired with each answer of that batch.
    2. Every pair from step 1 is repeated once per answer in ``df``; the
       ``context`` column holds all of ``df``'s answers, in their original
       order, for each pair.
    3. The ``question``/``answer`` columns of step 2 are appended a second
       time with no context (``context`` is NaN in those rows).

    Args:
        df: DataFrame with ``'question'`` and ``'answer'`` columns.
        batch_size: Number of consecutive rows per batch, and the number of
            times each question in a batch is repeated. Defaults to 5.

    Returns:
        A new DataFrame with columns ``question``, ``answer``, ``context``
        and ``2 * batch_size * len(df) ** 2`` rows. ``df`` is not modified.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``df`` lacks a ``'question'`` or ``'answer'`` column.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Extract all answers from the original DataFrame
    all_answers = df['answer'].tolist()

    # Step 1: collect the pairs in plain lists and build the frame once,
    # rather than growing a DataFrame with pd.concat on every iteration.
    questions = []
    answers = []
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        # NOTE: a short final batch still repeats each question batch_size
        # times, so its pairs are not an even cross product.
        questions.extend(batch['question'].repeat(batch_size).tolist())
        answers.extend(batch['answer'].tolist() * batch_size)
    pairs = pd.DataFrame({'question': questions, 'answer': answers})

    # Step 2: repeat each pair once per answer (row i becomes a run of
    # len(all_answers) identical rows) and cycle the answers as context.
    with_context = (
        pairs.loc[pairs.index.repeat(len(all_answers))]
        .reset_index(drop=True)
        .assign(context=all_answers * len(pairs))
    )

    # Step 3: append the same pairs again without a context value.
    without_context = with_context.drop(columns=['context'])
    return pd.concat([with_context, without_context], ignore_index=True)


In [None]:
# Fixed seed so the shuffle below gives the same row order on every run.
SHUFFLE_SEED = 42

# Augment each source frame separately
augmented_data_frames = [augment_data(df) for df in data_frames]

# Concatenate the augmented frames, shuffle the rows, and renumber the index.
# Assigned once, so re-running the cell always starts from the fresh concat.
augmented_data = (
    pd.concat(augmented_data_frames, ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [None]:
# Print a concise summary of the augmented frame (columns, non-null counts,
# dtypes, memory usage) as a sanity check before saving.
augmented_data.info()

In [None]:
# Save the augmented dataset to a CSV file
# Build the path with os.path.join: the old literal 'Data\data.csv' contained
# the invalid escape "\d", and its backslash is only a separator on Windows.
# NOTE(review): if the working directory is the parent of the input directory,
# this file lands beside the source CSVs and would be re-read on the next
# run. Confirm the intended location.
output_dir = 'Data'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
augmented_data.to_csv(os.path.join(output_dir, 'data.csv'), index=False)