This notebook compares results generated by OpenAI GPT and Amazon Bedrock models. It reads the results CSV files (pipe-delimited, one file per model), merges them on a common column (the article URL, `link`), and selects one evaluation column (e.g., 'is_happened') from each file so that the models' answers can be compared side by side.
The final comparison is saved to an output CSV file.

In [7]:
import glob
import os
import re

import pandas as pd

In [40]:
def _model_name_from_path(path, file_prefix):
    """Return the base name of ``path`` without its extension and without ``file_prefix``."""
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.startswith(file_prefix):
        stem = stem[len(file_prefix):]
    return stem


def merge_csv_files(input_folder, output_file, file_prefix, merge_column, selected_column):
    """
    Merge one column from several per-model results files into a single comparison table.

    Every file matching ``<input_folder>/<file_prefix>*.csv`` is read as a
    pipe-delimited ('|') CSV. Files that do not contain ``selected_column`` are
    skipped. From each remaining file only ``merge_column`` and
    ``selected_column`` are kept; the latter is renamed to
    ``<model_name>_<selected_column>``, where ``model_name`` is the file name
    without ``file_prefix``, without the ``.csv`` extension and without a
    trailing ``_YYYY-MM-DD`` date. The per-file frames are outer-merged on
    ``merge_column`` and the result is written to ``output_file`` as a
    comma-separated CSV without the index.

    Args:
        input_folder (str): Path to the folder containing the CSV files.
        output_file (str): Path to save the merged CSV file.
        file_prefix (str): Prefix that the CSV filenames should match.
        merge_column (str): The column to merge on.
        selected_column (str): The column to select from each CSV file.

    Returns:
        pd.DataFrame: Dataframe with merged data from the input CSV files.

    Raises:
        FileNotFoundError: If no file matches the prefix in ``input_folder``.
        ValueError: If none of the matching files contains ``selected_column``.
        KeyError: If a file contains ``selected_column`` but not ``merge_column``.
    """
    # Sort the matches: glob returns them in arbitrary order, which would make
    # the column order of the comparison table differ between runs/machines.
    csv_files = sorted(glob.glob(os.path.join(input_folder, f"{file_prefix}*.csv")))
    if not csv_files:
        raise FileNotFoundError(
            f"No CSV files matching '{file_prefix}*.csv' found in '{input_folder}'"
        )

    merged_df = None
    used_columns = set()

    for file in csv_files:
        # Read each CSV file (results files are pipe-delimited)
        df = pd.read_csv(file, sep='|')

        if selected_column not in df.columns:
            continue

        # Derive the model name from the file name itself (via os.path, so it
        # works with either path separator) instead of searching for a fixed
        # folder name and a fixed run date.
        run_label = _model_name_from_path(file, file_prefix)
        model_name = re.sub(r"_\d{4}-\d{2}-\d{2}$", "", run_label)
        column_name = f"{model_name}_{selected_column}"
        if column_name in used_columns:
            # Same model, different run date: keep the date so the two columns
            # stay distinguishable instead of colliding in the merge.
            column_name = f"{run_label}_{selected_column}"
        used_columns.add(column_name)

        # Keep the merge key plus the one compared column, renamed per model
        df = df[[merge_column, selected_column]].set_axis(
            [merge_column, column_name], axis="columns"
        )

        # Outer merge keeps links that only some of the models produced
        if merged_df is None:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on=merge_column, how='outer')

    if merged_df is None:
        raise ValueError(
            f"None of the {len(csv_files)} matching files contains column '{selected_column}'"
        )

    # Save the merged dataframe to a new CSV
    merged_df.to_csv(output_file, index=False)
    print(f"Merged file saved as {output_file}")

    return merged_df

In [45]:
# Settings for the 'is_happened' comparison
input_folder = '../../output/'                                      # directory holding the per-model results files
output_file = '../../output/nlp_models_comparison-ishappened.csv'   # where the merged comparison is written
file_prefix = 'nlp_results_'                                        # only files whose name starts with this are read
merge_column = 'link'                                               # column the files are joined on
selected_column = 'is_happened'                                     # column taken from every file and compared

# Merge the selected column across all results files; the last line displays the table
comparison_df = merge_csv_files(
    input_folder=input_folder,
    output_file=output_file,
    file_prefix=file_prefix,
    merge_column=merge_column,
    selected_column=selected_column,
)
comparison_df

Merged file saved as ../../output/nlp_models_comparison-ishappened.csv


Unnamed: 0,link,meta-llama3-70b_is_happened,meta-llama3-8b_is_happened,mistral-7b_is_happened,mistral-large_is_happened,mixtral-8x7b_is_happened,openai-gpt35_is_happened
0,https://www.thestar.com/news/world/asia/north-...,Yes,Yes,Yes,Yes,Yes,No
1,https://www.thespec.com/life/debby-finally-mov...,Yes,Yes,Yes,Yes,,
2,https://montreal.ctvnews.ca/live-updates-as-he...,Yes,Yes,Yes,Yes,Yes,Yes
3,https://toronto.citynews.ca/2024/08/10/north-k...,Yes,Yes,Yes,Yes,Yes,Yes
4,https://www.thestar.com/news/world/united-stat...,Yes,Yes,Yes,Yes,Yes,No
5,https://montreal.citynews.ca/video/2024/08/09/...,No,No,,No,No,
6,https://www.lakelandtoday.ca/environment-news/...,Yes,Yes,Yes,Yes,Yes,Yes
7,https://www.lakelandtoday.ca/world-news/debby-...,Yes,Yes,Yes,Yes,Yes,Yes
8,https://halton.insauga.com/curled-up-in-balls-...,Yes,Yes,Yes,Yes,Yes,Yes
9,https://www.thestar.com/news/world/united-stat...,Yes,Yes,Yes,Yes,Yes,No


In [46]:
# Settings for the 'country' comparison
input_folder = '../../output/'                                      # directory holding the per-model results files
# NOTE(review): 'coountry' looks like a typo for 'country'; kept unchanged here
# because other code may already read the file under this name - confirm and rename.
output_file = '../../output/nlp_models_comparison-coountry.csv'     # where the merged comparison is written
file_prefix = 'nlp_results_'                                        # only files whose name starts with this are read
merge_column = 'link'                                               # column the files are joined on
selected_column = 'country'                                         # column taken from every file and compared

# Merge the selected column across all results files; the last line displays the table
comparison_df = merge_csv_files(
    input_folder=input_folder,
    output_file=output_file,
    file_prefix=file_prefix,
    merge_column=merge_column,
    selected_column=selected_column,
)
comparison_df

Merged file saved as ../../output/nlp_models_comparison-coountry.csv


Unnamed: 0,link,meta-llama3-70b_country,meta-llama3-8b_country,mistral-7b_country,mistral-large_country,mixtral-8x7b_country,openai-gpt35_country
0,https://www.thestar.com/news/world/asia/north-...,North Korea,North Korea,North Korea,North Korea,North Korea,
1,https://www.thespec.com/life/debby-finally-mov...,USA,"United States, Canada",United States,"United States, Canada",,
2,https://montreal.ctvnews.ca/live-updates-as-he...,Canada,Unknown,Canada,Canada,Canada,Canada
3,https://toronto.citynews.ca/2024/08/10/north-k...,North Korea,South Korea,North Korea,North Korea,North Korea,North Korea
4,https://www.thestar.com/news/world/united-stat...,USA,United States,United States,United States,Unknown,
5,https://montreal.citynews.ca/video/2024/08/09/...,,,Canada,,,
6,https://www.lakelandtoday.ca/environment-news/...,USA,United States,United States,United States,USA,United States
7,https://www.lakelandtoday.ca/world-news/debby-...,USA,United States,United States,United States,United States,United States
8,https://halton.insauga.com/curled-up-in-balls-...,Unknown,Unknown,Canada,Unknown,,Unknown
9,https://www.thestar.com/news/world/united-stat...,USA,United States,United States,United States,United States,
