# O-ISAC Systematic Review - Search and Deduplication

This notebook implements the deduplication pipeline described in the protocol (Section 7.1).

**Steps:**
1. Load search results from `data/raw_search_results/` (CSV/RIS).
2. Normalize titles and authors.
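3. Identify duplicates using DOI, Title similarity, and Year (the current simplified implementation matches on exact normalized DOI, then on exact normalized title).
4. Merge duplicates, prioritizing peer-reviewed versions (the current simplified implementation keeps the first occurrence of each duplicate).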
5. Export unique records to `screening/screening_log.csv`.

In [None]:
import glob
import os
import re
import unicodedata

import pandas as pd

# Configuration
# Both paths are relative, so they are resolved against the current working
# directory of the process running this code.
# Directory that load_data() scans for "*.csv" search exports.
RAW_DATA_DIR = "../../data/raw_search_results"
# CSV file that export_screening_log() writes the screening log to.
SCREENING_LOG_PATH = "../../screening/screening_log.csv"

def normalize_text(text):
    """Return a canonical lowercase ASCII form of *text* for duplicate matching.

    The result contains only ``a-z``, ``0-9`` and single spaces, with no
    leading or trailing whitespace. Accented and compatibility characters are
    folded to their base letters first, so "Café" and "Cafe" normalize to the
    same string instead of the accented letter being deleted.

    Args:
        text: The raw value (typically a title). Anything that is not a
            ``str`` (e.g. ``None`` or a NaN float from pandas) is treated as
            missing.

    Returns:
        The normalized string, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    # NFKD splits an accented letter into its base letter plus a combining
    # mark; the mark is then removed by the ASCII filter below while the base
    # letter survives. casefold() is lower() for ASCII but also maps
    # characters such as "ß" to "ss".
    text = unicodedata.normalize("NFKD", text).casefold()
    # Drop everything that is not an ASCII letter, digit or whitespace.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse whitespace runs and trim both ends.
    return " ".join(text.split())

def load_data(data_dir=None):
    """Load every ``*.csv`` search export in a directory into one DataFrame.

    Files are read in sorted path order. ``glob`` itself returns paths in
    arbitrary order, and downstream steps keep the *first* occurrence of a
    duplicate and number records sequentially, so a fixed order is needed for
    reproducible output.

    This is still placeholder loading logic: RIS files and database-specific
    export formats are not parsed. The CSVs are assumed to carry the columns
    Title, Authors, Year, DOI, Venue, Abstract.

    Args:
        data_dir: Directory to scan. Defaults to ``RAW_DATA_DIR``.

    Returns:
        The concatenation of all readable CSVs with a fresh 0..n-1 index and
        an added ``source_file`` column holding each row's originating file
        name. If no file could be read, an empty DataFrame with the assumed
        columns is returned.
    """
    if data_dir is None:
        data_dir = RAW_DATA_DIR

    csv_paths = sorted(glob.glob(os.path.join(data_dir, "*.csv")))

    frames = []
    for csv_path in csv_paths:
        try:
            frame = pd.read_csv(csv_path)
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Error reading {csv_path}: {e}")
            continue
        frame['source_file'] = os.path.basename(csv_path)
        frames.append(frame)

    if not frames:
        print("No data found. Creating empty DataFrame for demonstration.")
        return pd.DataFrame(columns=['Title', 'Authors', 'Year', 'DOI', 'Venue', 'Abstract'])

    return pd.concat(frames, ignore_index=True)

def deduplicate(df):
    """Drop duplicate records, matching first on DOI and then on title.

    This is a simplified strategy (exact matches only, first occurrence wins;
    no merging of information between duplicates):

    1. Records with a usable DOI are deduplicated on the normalized DOI.
    2. The DOI-unique records followed by the records without a usable DOI
       are deduplicated on the normalized title, so a DOI record also
       absorbs a non-DOI record with the same title.

    Records whose normalized title is empty are never treated as title
    duplicates of each other: an empty title carries no identifying
    information, and collapsing them would silently discard records.

    Args:
        df: Records with at least the columns ``DOI`` and ``Title``. The
            frame is not modified.

    Returns:
        A new DataFrame with the surviving rows (DOI records first, then
        non-DOI records), the original index labels, and two helper columns
        ``norm_doi`` and ``norm_title``.

    Raises:
        KeyError: If ``DOI`` or ``Title`` is missing from *df*.
    """
    print(f"Initial records: {len(df)}")

    missing = [col for col in ('DOI', 'Title') if col not in df.columns]
    if missing:
        raise KeyError(f"deduplicate() requires column(s) {missing}; got {list(df.columns)}")

    # Work on a copy so the caller's frame does not gain helper columns.
    work = df.copy()

    # Normalize DOI: missing values become "", case and surrounding blanks are
    # ignored, and resolver/label prefixes are removed so that
    # "https://doi.org/10.1/x", "doi:10.1/x" and "10.1/x" all compare equal.
    norm_doi = work['DOI'].fillna('').astype(str).str.lower().str.strip()
    norm_doi = norm_doi.str.replace(
        r'^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)', '', regex=True
    )
    work['norm_doi'] = norm_doi

    work['norm_title'] = work['Title'].apply(normalize_text)

    # Heuristic: anything of 5 characters or fewer is not a usable DOI.
    has_doi = work['norm_doi'].str.len() > 5

    doi_unique = work[has_doi].drop_duplicates(subset=['norm_doi'], keep='first')
    no_doi = work[~has_doi]

    # DOI records go first so that they win title ties against non-DOI records.
    combined = pd.concat([doi_unique, no_doi])

    repeated_title = combined.duplicated(subset=['norm_title'], keep='first')
    blank_title = combined['norm_title'] == ''
    final_df = combined[~(repeated_title & ~blank_title)]

    print(f"Final unique records: {len(final_df)}")
    return final_df

def export_screening_log(df, path=None):
    """Write one screening-log row per record in *df* to a CSV file.

    The log has the columns ``record_id`` (1..n in the row order of *df*),
    ``title``, ``year``, ``source`` (the ``Venue`` column, or ``'Unknown'``
    for every row when *df* has no such column), ``status`` (always
    ``'unsure'``) and ``exclusion_reason`` (always empty).

    Args:
        df: Records with at least the columns ``Title`` and ``Year``. Its
            index labels are ignored; only the row order matters.
        path: Destination CSV path. Defaults to ``SCREENING_LOG_PATH``.
    """
    if path is None:
        path = SCREENING_LOG_PATH

    # Column assignment aligns on index labels. A deduplicated frame has a
    # non-contiguous, reordered index, so without this reset titles would be
    # attached to the wrong record_id, turned into NaN, or dropped.
    records = df.reset_index(drop=True)

    # Prepare columns for screening log
    log_df = pd.DataFrame(index=records.index)
    log_df['record_id'] = range(1, len(records) + 1)
    log_df['title'] = records['Title']
    log_df['year'] = records['Year']
    log_df['source'] = records.get('Venue', 'Unknown')
    log_df['status'] = 'unsure'  # Default status
    log_df['exclusion_reason'] = ''

    if os.path.exists(path):
        # NOTE(review): the existing log is overwritten below, so any
        # screening decisions already recorded in it are lost. Merging in
        # only the new records is not implemented yet.
        print("Screening log exists. Appending new records not implemented in this demo.")

    log_df.to_csv(path, index=False)
    print(f"Exported {len(log_df)} records to {path}")

# Main Execution
# Pipeline: load -> deduplicate -> export. When nothing was loaded, only a
# notice is printed and no log is written.
df = load_data()
if df.empty:
    print("No data to process.")
else:
    clean_df = deduplicate(df)
    export_screening_log(clean_df)