In [2]:
import pandas as pd
import os
import numpy as np

In [4]:
def create_multi_dataset(directory_path, output_path, sample_size=335,
                         random_state=1):
    """Combine human-written and sampled AI-written rows from many CSVs.

    Every file in ``directory_path`` whose name ends with ``.csv`` is read
    and reduced to:

    * all rows whose ``write_by`` is ``'Human'``; and
    * a sample of the rows whose ``write_by`` is ``'Ai'``. AI rows are first
      restricted by ``domain`` and by the whitespace-delimited word count of
      ``text`` (``'BBC'``: 350-500 words, ``'Wikipedia'``: 60-100 words, both
      bounds inclusive). Then up to ``sample_size`` rows are drawn for each
      of the ``method`` values ``'Continue'`` and ``'Topic'``.

    The per-file results are concatenated in sorted file-name order and
    written to ``output_path`` as CSV without the index. A confirmation
    message is printed afterwards.

    Parameters
    ----------
    directory_path : str or path-like
        Directory holding the input CSV files. Each file needs the columns
        ``text``, ``write_by`` and ``domain``; ``method`` is also needed
        whenever at least one AI row passes the domain/word-count filter.
    output_path : str or path-like
        Destination CSV file. Its parent directory is created if missing.
    sample_size : int, default 335
        Maximum number of AI rows sampled per file and per method.
    random_state : int, default 1
        Seed passed to ``DataFrame.sample`` so the selection is repeatable.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a required column is absent from an input file.
    """
    # Inclusive word-count window an AI row must fall in, keyed by domain.
    word_count_ranges = {'BBC': (350, 500), 'Wikipedia': (60, 100)}

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the output file reproducible between runs and machines.
    csv_files = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.csv'))

    per_file_frames = []
    for filename in csv_files:
        df = pd.read_csv(os.path.join(directory_path, filename))

        # Helper column used only for filtering; it is dropped again below.
        # str() is applied first, so non-string cells are counted as text.
        df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))

        human_rows = df[df['write_by'] == 'Human']
        ai_rows = df[df['write_by'] == 'Ai']

        # Keep AI rows whose word count is inside the window of their domain.
        # The pieces are stacked in dict order (BBC first, then Wikipedia),
        # which fixes the row order that the seeded sampling operates on.
        ai_filtered = pd.concat([
            ai_rows[(ai_rows['domain'] == domain)
                    & ai_rows['word_count'].between(low, high)]
            for domain, (low, high) in word_count_ranges.items()
        ])

        selected = [human_rows]
        if not ai_filtered.empty:
            for method in ('Continue', 'Topic'):
                candidates = ai_filtered[ai_filtered['method'] == method]
                # Never request more rows than exist for this method.
                selected.append(candidates.sample(
                    n=min(sample_size, len(candidates)),
                    random_state=random_state))

        # drop() without inplace returns a new frame, so the filtered slices
        # taken from `df` are never mutated.
        per_file_frames.append(
            pd.concat(selected).drop(columns=['word_count']))

    # Concatenate once at the end instead of re-copying the growing result on
    # every iteration; fall back to an empty frame when no CSV was found.
    if per_file_frames:
        final_df = pd.concat(per_file_frames, ignore_index=True)
    else:
        final_df = pd.DataFrame()

    # Make sure the destination folder exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the final DataFrame to a new CSV file
    final_df.to_csv(output_path, index=False)
    print(f"Filtered data has been saved to {output_path}.")


# Usage: read the CSV files found in `directory_path`, filter and sample their
# rows with create_multi_dataset, and write the combined result to
# `output_path`.
# NOTE(review): both paths are absolute and point into one user's Windows
# profile; they have to be edited before this cell can run on another machine.
directory_path = 'C:/Users/amirm/Desktop/MONASH/Thesis/Working/Prepare/Code/2_Gen_Ai_Text/Combine_Datasets/Output_Files'
output_path = 'C:/Users/amirm/Desktop/MONASH/Thesis/Working/Prepare/Code/3_Final_Dataset/Multi_Model_Multi_Lingual.csv'
create_multi_dataset(directory_path, output_path)


Filtered data has been saved to C:/Users/amirm/Desktop/MONASH/Thesis/Working/Prepare/Code/3_Final_Dataset/Multi_Model_Multi_Lingual.csv.
