# CIT5900 Project 3 - Task 2: Data Enrichment Without Metadata Merge

In [None]:

import pandas as pd
import os

# Step 1: Load main cleaned CSV
# Every column is read as text (dtype=str); numeric conversion is done
# explicitly in a later step.
main_path = "Final_ResearchOutputs_Cleaned.csv"
try:
    main_df = pd.read_csv(main_path, dtype=str, encoding='utf-8')
except FileNotFoundError as e:
    # The file really is missing: keep the specific exception type and add
    # the tagged message, chaining the original error for the traceback.
    raise FileNotFoundError("[ERROR] Cannot load {}: {}".format(main_path, e)) from e
except Exception as e:
    # The file exists but could not be read or parsed (permissions, encoding,
    # malformed CSV, ...). Reporting that as "file not found" would mislead.
    raise RuntimeError("[ERROR] Cannot load {}: {}".format(main_path, e)) from e
else:
    # Only reached when the load succeeded; kept outside the try body so a
    # failure here is never reported as a load error.
    print("[INFO] Loaded main file with {} records.".format(main_df.shape[0]))

# Step 2: Normalize column names
# Each header is coerced to text, trimmed of surrounding whitespace and
# lower-cased so later steps can refer to columns by one canonical spelling.
normalized_headers = [str(header).strip().lower() for header in main_df.columns]
main_df.columns = normalized_headers
print("Main CSV Columns:", list(main_df.columns))

# Step 3: Define required output columns
# Project-level fields first, then the per-output fields. The two groups are
# concatenated so the resulting list order is project fields, then output fields.
_project_columns = [
    'projid',
    'projectstatus',
    'projecttitle',
    'projectrdc',
    'projectyearstarted',
    'projectyearended',
    'projectpi',
]
_output_columns = [
    'outputtitle',
    'outputbiblio',
    'outputtype',
    'outputstatus',
    'outputvenue',
    'outputyear',
    'outputmonth',
    'outputvolume',
    'outputnumber',
    'outputpages',
]
required_columns = _project_columns + _output_columns

# Step 4: Ensure all required columns exist
# Any required column that the loaded frame lacks is added as a column of
# empty strings, in the order the names appear in required_columns.
absent_columns = [name for name in required_columns if name not in main_df.columns]
for name in absent_columns:
    main_df[name] = ""

# Step 5: Metadata enrichment fallback (no merge, fill defaults)
# read_csv parses empty cells as NaN even when dtype=str is requested, so
# replacing only "" would leave every blank that came from the CSV unfilled.
# A cell counts as blank when it is NaN, "" or whitespace-only.
default_values = {
    'projectstatus': "Unknown",
    'outputstatus': "Published",
    'outputtype': "Unknown",
}
for column_name, default_value in default_values.items():
    # fillna("") folds NaN into the empty-string case before the comparison.
    is_blank = main_df[column_name].fillna("").astype(str).str.strip() == ""
    main_df.loc[is_blank, column_name] = default_value

# Step 6: Filter to likely FSRDC-related records
def is_likely_fsrdc(title, rdc):
    """Return True when the title or RDC text contains an FSRDC-related keyword.

    Both arguments are converted with ``str`` and lower-cased, then each
    keyword is looked up as a plain (case-insensitive) substring in either
    text. Returns False when no keyword occurs in either one.
    """
    haystacks = (str(title).lower(), str(rdc).lower())
    for needle in ('fsrdc', 'census', 'research data center', 'rdc'):
        for text in haystacks:
            if needle in text:
                return True
    return False

def _row_is_relevant(row):
    """Apply is_likely_fsrdc to one row's project title and RDC fields."""
    return is_likely_fsrdc(row.get('projecttitle', ''), row.get('projectrdc', ''))


# Flag each row, keep the flagged ones, then drop the helper column from the
# filtered copy (main_df keeps the 'is_relevant' column).
main_df['is_relevant'] = main_df.apply(_row_is_relevant, axis=1)
# Explicit comparison with True yields a boolean mask whatever dtype the
# flag column ended up with.
relevant_mask = main_df['is_relevant'].eq(True)
filtered_df = main_df.loc[relevant_mask].drop(columns=['is_relevant'])

relevant_count = filtered_df.shape[0]
print("[INFO] Filtered dataset contains {} likely FSRDC records.".format(relevant_count))

# Step 7: Convert year and month fields to numeric
# Values that cannot be parsed as numbers become NaN (errors='coerce').
for numeric_field in ('outputyear', 'outputmonth'):
    filtered_df[numeric_field] = pd.to_numeric(filtered_df[numeric_field], errors='coerce')

# Step 8: Strip whitespace from all string cells
# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map)
# and emits a FutureWarning. Mapping each column with Series.map is also
# element-wise and is available on both older and newer pandas versions.
def _strip_if_str(value):
    """Return *value* with surrounding whitespace removed if it is a str.

    Non-string cells (NaN, numbers) are returned unchanged.
    """
    return value.strip() if isinstance(value, str) else value


filtered_df = filtered_df.apply(lambda column: column.map(_strip_if_str))

# Step 9: Save the result
# Resolve the destination first, then create the directory (a no-op when it
# already exists) and write the frame without its index column.
output_dir = './data/processed'
output_filename = 'ResearchOutputs_GroupX_Task2_Enriched.csv'
output_path = os.path.join(output_dir, output_filename)

os.makedirs(output_dir, exist_ok=True)
filtered_df.to_csv(output_path, index=False)

print("[SUCCESS] Final enriched data saved to: {}".format(output_path))
