# In [None]:
import pandas as pd
import os

# Per-file column mapping.
#   outer key   -> input file name (joined with data_dir when loading)
#   inner key   -> standardized column name
#   inner value -> header of the matching column in that file
# Headers are compared case-insensitively after stripping whitespace, and only
# inner keys that also appear in final_columns are copied to the output.
file_map = {
    "group1.csv": {
        "OutputTitle": "title",
        "Abstract": "abstract",
        "ProjectPI": "project_pi",
        "ProjectStatus": "project_status",
        "OutputYear": "year",
        "DOI": "doi",
    },
    "group2.csv": {
        "OutputTitle": "title",
        "Abstract": "abstract",
        "ProjectPI": "researcher",
        "OutputYear": "year",
        "OutputType": "type_crossref",
        "OutputVenue": "source_display_name",
        "DOI": "doi",
    },
    "group3.csv": {
        "OutputTitle": "Title",
        "Abstract": "Abstract",
        "ProjectPI": "PI",
        "OutputYear": "publication_year",
        "ProjectRDC": "RDC",
        "ProjectYearStarted": "Start Year",
        "ProjectYearEnded": "End Year",
        "OutputVenue": "host_organization_name",
        "FRDC": "Has FSRDC Evidence?",
        "DOI": "doi",
    },
    "group4.csv": {
        "OutputTitle": "title",
        "Abstract": "abstract",
        "ProjectPI": "researcher",
        "OutputYear": "year",
        "OutputVenue": "source",
    },
    "group5.csv": {
        "OutputTitle": "title",
        "ProjectPI": "pi",
        "OutputYear": "year",
        "DOI": "doi",
    },
    "group6.csv": {
        "OutputTitle": "Title",
        "Abstract": "Abstract",
        "ProjectPI": "Researchers",
        "OutputYear": "Year",
        # NOTE(review): OutputVenue and ProjectRDC both read the "RDC" column
        # for this file -- confirm that is intended.
        "OutputVenue": "RDC",
        "OutputBiblio": "Keywords",
        "DOI": "DOI",
        "ProjectRDC": "RDC",
    },
    "group7.csv": {
        "OutputTitle": "title",
        "Abstract": "abstract",
        "ProjectStatus": "is_fsrdc",
        "OutputVenue": "source",
    },
    # group8 already uses the standardized names as its headers.
    "group8.csv": {
        "ProjID": "ProjectID",
        "ProjectTitle": "ProjectTitle",
        "ProjectRDC": "ProjectRDC",
        "ProjectPI": "ProjectPI",
        "OutputTitle": "OutputTitle",
        "OutputBiblio": "OutputBiblio",
        "OutputYear": "OutputYear",
        "DOI": "DOI",
        "Abstract": "Abstract",
        "Authors": "Authors",
    },
}

# Output schema: every standardized frame gets exactly these columns, in this
# order.
final_columns = [
    "ProjID",
    "ProjectTitle",
    "ProjectRDC",
    "ProjectPI",
    "Authors",
    "OutputTitle",
    "OutputBiblio",
    "OutputYear",
    "DOI",
    "Abstract",
    "source_file",
]

# Folder holding the input files; "./" resolves against the working directory.
data_dir = "./"

# Collects one standardized DataFrame per successfully loaded file.
standardized_dfs = []

# Read every mapped file and project it onto the final_columns schema.
for file, mapping in file_map.items():
    path = os.path.join(data_dir, file)
    try:
        # Excel workbooks need read_excel; everything else is parsed as CSV.
        if file.endswith(".xlsx"):
            df = pd.read_excel(path)
        else:
            df = pd.read_csv(path)

        # Lower-case and trim both the file's headers and the mapped header
        # names so the lookup below ignores case and surrounding whitespace.
        df.columns = [header.strip().lower() for header in df.columns]
        normalized_mapping = {
            target: source.lower().strip() for target, source in mapping.items()
        }

        # NOTE(review): df_renamed starts without an index, so its row count
        # comes from the first real column assigned below -- confirm the
        # result when a file has none of its mapped columns.
        df_renamed = pd.DataFrame()

        for final_col in final_columns:
            src_col = normalized_mapping.get(final_col)
            has_source = bool(src_col) and src_col in df.columns
            # Columns the file cannot supply are still created, filled with None.
            df_renamed[final_col] = df[src_col] if has_source else None

        # Record which file each row came from (overwrites the None placeholder).
        df_renamed["source_file"] = file
        standardized_dfs.append(df_renamed)

    except Exception as e:
        # Best effort: report the failing file and carry on with the others.
        print(f"Error processing {file}: {e}")
# Stack all standardized frames into one table with a fresh 0..n-1 index.
combined_final_df = pd.concat(standardized_dfs, ignore_index=True)

# Normalize column names just in case (this also lower-cases the headers that
# end up in the output file).
combined_final_df.columns = [col.strip().lower() for col in combined_final_df.columns]

if 'doi' in combined_final_df.columns:
    # Drop rows whose DOI is missing or blank.
    combined_final_df = combined_final_df[
        ~combined_final_df['doi'].isna() & (combined_final_df['doi'].str.strip() != "")
    ]

    # Drop duplicate DOIs, keeping the first occurrence. This must act on the
    # combined table: the previous code deduplicated `df` (the last file read
    # by the loading loop) and discarded the result, so duplicates were still
    # written out. Rows follow the order of standardized_dfs, so the copy
    # from the earliest frame in that list is the one kept.
    combined_final_df = combined_final_df.drop_duplicates(subset='doi', keep='first')
else:
    print("Warning: 'DOI' column not found. No filtering applied.")

# Save the cleaned table as CSV, without the positional index column.
combined_final_df.to_csv("Final_ResearchOutputs_Cleaned.csv", index=False)
