In [1]:
import os
import sys

import pandas as pd

In [2]:
# Directory that holds the pipeline CSVs. Set the RAG_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used, so the three paths below keep their old values.
DATA_DIR = os.environ.get("RAG_DATA_DIR", "C:/Users/asaha/Downloads/RAG Chatbot").rstrip("/\\")

source_csv = f"{DATA_DIR}/summarized_output.csv"  # Source CSV (contains 'final_summary')
target_csv = f"{DATA_DIR}/sample_dataset.csv"     # Target CSV ('description_text' gets replaced)
output_csv = f"{DATA_DIR}/final_dataset.csv"      # Where the merged CSV will be written

In [3]:
def merge_summary(source_path: str, target_path: str, output_path: str) -> None:
    """
    Overwrite `description_text` in the target CSV with `final_summary` from the source CSV.

    Rows are matched on `publication_number`. Target rows with no matching
    summary (no such key in the source, a blank key, or a blank summary)
    keep their original `description_text`. All other target columns and
    the row order are written out unchanged.

    Parameters
    ----------
    source_path : str
        CSV that **contains** `publication_number` and `final_summary` columns.
    target_path : str
        CSV whose `description_text` column will be replaced; must also
        contain `publication_number`.
    output_path : str
        Where the merged CSV will be written (UTF-8, no index column).

    Raises
    ------
    KeyError
        If a required column is missing from the source or target CSV.
    """
    key_col = "publication_number"

    # 1. Load data (keep everything as text to avoid type surprises)
    src = pd.read_csv(source_path, dtype=str)
    tgt = pd.read_csv(target_path, dtype=str)

    # Fail early with a readable message instead of a bare pandas KeyError.
    for label, frame, required in (
        ("source", src, (key_col, "final_summary")),
        ("target", tgt, (key_col, "description_text")),
    ):
        missing = [col for col in required if col not in frame.columns]
        if missing:
            raise KeyError(f"{label} CSV is missing required column(s): {missing}")

    # 2. Build a Series that maps publication_number -> final_summary.
    #    Rows with a blank publication_number are dropped first: a NaN key in
    #    the map would otherwise match every blank-key row in the target and
    #    overwrite its description with an unrelated summary.
    #    If a publication_number appears more than once in the source,
    #    we keep the first occurrence; drop duplicates to be explicit.
    summary_map = (
        src.dropna(subset=[key_col])
           .drop_duplicates(subset=key_col)
           .set_index(key_col)["final_summary"]
    )

    # 3. Replace description_text in the target, when we have a match.
    #    Where there's no match, we keep the original description_text.
    tgt["description_text"] = (
        tgt[key_col]
           .map(summary_map)
           .fillna(tgt["description_text"])
    )

    # 4. Save
    tgt.to_csv(output_path, index=False, encoding="utf-8")

    print(f"✅  Updated CSV written to: {output_path}")

In [7]:
# Merge the summaries into the target dataset and write the result,
# using the three paths set in the configuration cell.
merge_summary(
    source_path=source_csv,
    target_path=target_csv,
    output_path=output_csv,
)


✅  Updated CSV written to: C:/Users/asaha/Downloads/RAG Chatbot/final_dataset.csv


In [None]:
# Read the source and target CSVs again, every column as text
src_df = pd.read_csv(source_csv, dtype=str)
tgt_df = pd.read_csv(target_csv, dtype=str)

# Distinct publication numbers on each side
src_pub_numbers = set(src_df["publication_number"])
tgt_pub_numbers = set(tgt_df["publication_number"])

# Keys that appear on one side only, in each direction
unmatched_pub_numbers = tgt_pub_numbers.difference(src_pub_numbers)
unmatched_in_source = src_pub_numbers.difference(tgt_pub_numbers)

# Report both counts; zero in both directions means the key sets are identical
print(f"Number of publication_numbers in target not found in source: {len(unmatched_pub_numbers)}")
print(f"Number of publication_numbers in source not found in target: {len(unmatched_in_source)}")


Number of publication_numbers in target not found in source: 0
Number of publication_numbers in source not found in target: 0


In [None]:
# SECURITY: this key used to be hard-coded in the notebook. Treat the old value
# as compromised and rotate it. The key is now read from the MIXTRAL_API_KEY
# environment variable; it is an empty string when the variable is not set.
Mixtral_API_KEY = os.environ.get("MIXTRAL_API_KEY", "")