# **양자과학기술 저자명 통일**

### Hybrid version

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# Base directory that holds the input CSV files
dir = 'D:/GD_awekimm/[YU]/[Project]/[Quantum]/Quantum_2nd/04_Analysis/QuanTech_R2/R file/'

# =================================================================
# Step 1: Data Loading and ID Unification
# =================================================================
print("[Step 1] ...")

# Load the author-level table and the institution-level table
df = pd.read_csv(dir+'quant_author_ed_eu_val.csv')
inst = pd.read_csv(dir+'quant_inst_ed_eu_val_cleaned.csv')

# Keep only the columns used below and drop exact duplicate rows
required_columns = [
    'pubid', 'city', 'country', 'pubyear', 'author_id_te',
    'full_name', 'matched', 'organization_cleaned'
]
df = df[required_columns].drop_duplicates()
# Strip one trailing period from each author name
df['full_name'] = df['full_name'].str.replace(r'\.$', '', regex=True)
inst = inst[['pubid', 'organization_cleaned']]

# Split by match status; 'rest' rows take their organizations from `inst`
# (one row per organization listed for the publication)
df_matched = df.loc[df['matched'].eq('matched')].copy()
df_rest = df.loc[df['matched'].eq('rest')].copy()
df_rest_for_id = df_rest.drop(columns=['organization_cleaned']).merge(
    inst, how='left', on='pubid'
)
df_for_id_unification = pd.concat([df_matched, df_rest_for_id], ignore_index=True)

# Canonical ID per (name, organization): the first ID seen on a 'matched'
# row when there is one, otherwise the smallest ID in the group
name_org_keys = ['full_name', 'organization_cleaned']
matched_rows = df_for_id_unification.loc[df_for_id_unification['matched'].eq('matched')]
min_ids = df_for_id_unification.groupby(name_org_keys)['author_id_te'].min()
matched_ids = matched_rows.groupby(name_org_keys)['author_id_te'].first()
canonical_s = matched_ids.combine_first(min_ids)
canonical_ids = canonical_s.reset_index(name='canonical_id')
df_for_id_unification = pd.merge(
    df_for_id_unification, canonical_ids, how='left', on=name_org_keys
)

# Final ID per author name: again prefer the ID carried by 'matched' rows,
# falling back to the smallest canonical ID for that name
matched_rows = df_for_id_unification.loc[df_for_id_unification['matched'].eq('matched')]
final_id_from_matched = matched_rows.groupby('full_name')['canonical_id'].first()
default_final_id = df_for_id_unification.groupby('full_name')['canonical_id'].min()
final_id_map = final_id_from_matched.combine_first(default_final_id)
df_for_id_unification['author_id_te_cleaned'] = df_for_id_unification['full_name'].map(final_id_map)

# Attach the unified IDs to the 'rest' rows
unified_id_lookup = df_for_id_unification[
    ['pubid', 'author_id_te', 'author_id_te_cleaned']
].drop_duplicates()
df_rest = df_rest.merge(unified_id_lookup, on=['pubid', 'author_id_te'])
print("ID unification complete.")

# =================================================================
# Step 2. Rule-Based Precise Matching (PASS 1)
# =================================================================
print("\n[Step 2] PASS 1: Rule-based matching...")

# Lookup tables for the rule-based matcher:
#   author_history     - unified author ID -> list of distinct organizations
#   pub_orgs           - pubid -> list of distinct organizations
#   forbidden_orgs_map - pubid -> first organization found on a 'matched' row
orgs_by_author = df_for_id_unification.groupby('author_id_te_cleaned')['organization_cleaned']
author_history = orgs_by_author.unique().apply(list)

orgs_by_pub = df_for_id_unification.groupby('pubid')['organization_cleaned']
pub_orgs = orgs_by_pub.unique().apply(list)

matched_only = df_for_id_unification.loc[df_for_id_unification['matched'].eq('matched')]
forbidden_orgs_map = matched_only.groupby('pubid')['organization_cleaned'].first()

def find_likely_org_with_rule(row, author_history_map, pub_orgs_map, forbidden_map):
    """Return the one organization of a publication that the author is known for.

    The publication's organizations (minus the publication's "forbidden"
    organization, if any) are intersected with the author's organization
    history. The match is accepted only when exactly one organization remains.

    Missing organizations (NaN/None) are never treated as candidates, so a
    placeholder for "no organization" cannot make an otherwise unique match
    look ambiguous.

    Args:
        row: Record indexable by 'author_id_te_cleaned' and 'pubid'.
        author_history_map: Object with ``.get(author_id, default)`` giving
            the organizations associated with an author.
        pub_orgs_map: Object with ``.get(pub_id, default)`` giving the
            organizations listed for a publication.
        forbidden_map: Object with ``.get(pub_id)`` giving the organization to
            exclude for a publication, or None when there is none.

    Returns:
        The single matching organization, or ``np.nan`` when there is no
        match or more than one.
    """
    author_id, pub_id = row['author_id_te_cleaned'], row['pubid']
    history_orgs = author_history_map.get(author_id, [])
    forbidden_org = forbidden_map.get(pub_id)

    # Build a fresh list (the caller's list is left untouched) and skip
    # missing values so they cannot take part in the intersection.
    candidate_orgs = [
        org for org in pub_orgs_map.get(pub_id, []) if not pd.isna(org)
    ]
    if pd.notna(forbidden_org) and forbidden_org in candidate_orgs:
        candidate_orgs.remove(forbidden_org)

    intersection = [org for org in candidate_orgs if org in history_orgs]
    return intersection[0] if len(intersection) == 1 else np.nan

# Run the rule-based matcher on every 'rest' row
df_rest['likely_organization'] = df_rest.apply(
    find_likely_org_with_rule,
    axis=1,
    args=(author_history, pub_orgs, forbidden_orgs_map),
)

# Rows that got an organization are PASS 1 successes; the others go to PASS 2
has_rule_match = df_rest['likely_organization'].notna()
rest_pass1_success = df_rest[has_rule_match].assign(
    organization_cleaned=lambda frame: frame['likely_organization'],
    match_method='Rule-Based',
)
rest_still_unmatched = df_rest[~has_rule_match].copy()
print(f"PASS 1 결과: {len(rest_pass1_success)} 건 매칭 성공.")

# =================================================================
# Step 3.  Inference-based Matching (Pass 2)
# =================================================================
print("\n[Step 3] PASS 2: Inference-based Matching...")

# organization -> {'city', 'country'} taken from the first 'matched' record
inst_location_map = df_matched.groupby('organization_cleaned')[['city', 'country']].first().to_dict('index')
# pubid -> distinct organizations seen on 'matched' records of that publication
pub_org_map_inference = df_matched.groupby('pubid')['organization_cleaned'].unique().apply(list).to_dict()
# pubid -> distinct organizations listed in `inst`; built once so the loop
# below does not rescan `inst` for every row
inst_orgs_by_pub = inst.groupby('pubid')['organization_cleaned'].unique().to_dict()

# Collect candidate organizations for every row that PASS 1 left unmatched
candidate_orgs_list = []
for _, row in rest_still_unmatched.iterrows():
    # Copy the stored list: it is shared by all rows of the same publication,
    # so appending to it directly would leak candidates from one row into the
    # next and inflate the vote counts below.
    candidates = list(pub_org_map_inference.get(row['pubid'], []))
    for org in inst_orgs_by_pub.get(row['pubid'], ()):
        loc = inst_location_map.get(org)
        # Keep institutions whose known city and country equal the row's
        if loc and loc['city'] == row['city'] and loc['country'] == row['country']:
            candidates.append(org)
    candidate_orgs_list.append(candidates)
rest_still_unmatched['candidate_orgs'] = candidate_orgs_list

# Author name -> every organization observed (confirmed records) or proposed
# (candidates of unmatched records); repeats are kept because they act as votes
author_profiles = {}
for _, row in pd.concat([df_matched, rest_pass1_success]).iterrows():
    author_profiles.setdefault(row['full_name'], []).append(row['organization_cleaned'])
for _, row in rest_still_unmatched.iterrows():
    author_profiles.setdefault(row['full_name'], []).extend(row['candidate_orgs'])

# Authors with at least 5 rows in `df` must have one organization covering
# >= 80% of their profile; all other authors take their most common one
inferred_org_map = {}
frequent_authors = set(df['full_name'].value_counts().loc[lambda x: x >= 5].index)
for author, org_list in author_profiles.items():
    if not org_list:
        continue
    counts = Counter(org_list)
    most_common_org, top_count = counts.most_common(1)[0]
    if author in frequent_authors:
        if (top_count / len(org_list)) >= 0.8:
            inferred_org_map[author] = most_common_org
    else:
        inferred_org_map[author] = most_common_org

rest_still_unmatched['inferred_org'] = rest_still_unmatched['full_name'].map(inferred_org_map)
rest_pass2_success = rest_still_unmatched.dropna(subset=['inferred_org']).copy()
rest_pass2_success['organization_cleaned'] = rest_pass2_success['inferred_org']
rest_pass2_success['match_method'] = 'Inference-Based'
# Explicit copy: a column is assigned on this frame later in the script
rest_final_unmatched = rest_still_unmatched[rest_still_unmatched['inferred_org'].isna()].copy()
print(f"PASS 2 Reults: {len(rest_pass2_success)} matching completed.")


# =================================================================
# Step 4. Unify Final Results and Canonical Author/Institution 
# =================================================================
print("\n[Step 4] Unify final results and canonical author/institution...")

# 4-1. Tag every subset with its unified author ID and matching method
df_matched = pd.merge(df_matched, df_for_id_unification[['pubid', 'author_id_te', 'author_id_te_cleaned']].drop_duplicates(), on=['pubid', 'author_id_te'], how='left')
df_matched['match_method'] = 'Original'
# Take an explicit copy before adding the column so the assignment never
# targets a filtered slice of another frame (SettingWithCopyWarning)
rest_final_unmatched = rest_final_unmatched.copy()
rest_final_unmatched['match_method'] = 'Unmatched'

# 4-2. Stack all subsets using one shared column layout
final_output_columns = [
    'pubid', 'city', 'country', 'pubyear', 'author_id_te',
    'full_name', 'matched', 'organization_cleaned',
    'author_id_te_cleaned', 'match_method'
]
detailed_df = pd.concat([
    df_matched.reindex(columns=final_output_columns),
    rest_pass1_success.reindex(columns=final_output_columns),
    rest_pass2_success.reindex(columns=final_output_columns),
    rest_final_unmatched.reindex(columns=final_output_columns)
], ignore_index=True)

# 4-3. Canonical institution assignment per author
# Rule 1: authors with 'matched' records take the organization of their
# most recent (highest pubyear) matched record
matched_records = detailed_df[detailed_df['matched'] == 'matched'].copy()
matched_records_sorted = matched_records.sort_values(by=['author_id_te_cleaned', 'pubyear'], ascending=[True, False])
matched_map = matched_records_sorted.drop_duplicates(subset='author_id_te_cleaned', keep='first')
matched_map = matched_map.set_index('author_id_te_cleaned')['organization_cleaned']

# Rule 2: every other author takes the most frequent organization in their
# profile (profiles are keyed by author name)
rest_map = {
    author_name: Counter(org_list).most_common(1)[0][0]
    for author_name, org_list in author_profiles.items()
    if org_list
}

# Re-key the Rule 2 mapping from author name to unified author ID
author_id_mapping = df_for_id_unification[['full_name', 'author_id_te_cleaned']].dropna().drop_duplicates()
name_to_id = author_id_mapping.set_index('full_name')['author_id_te_cleaned']
name_to_id_lookup = name_to_id.to_dict()
rest_map_by_id = {
    name_to_id_lookup[name]: org
    for name, org in rest_map.items()
    if name in name_to_id_lookup
}

# Rule 1 wins wherever both rules supply an organization
final_canonical_map = matched_map.combine_first(pd.Series(rest_map_by_id))
detailed_df['organization_cleaned'] = detailed_df['author_id_te_cleaned'].map(final_canonical_map)

# 4-4. One record per (author, publication); copied so that later column
# assignments on final_df act on an independent frame
final_df = detailed_df.drop_duplicates(subset=['author_id_te_cleaned', 'pubid']).copy()

# =================================================================
# Step 5. Unify Canonical Country and City Information 
# =================================================================
print("\nStep 5. Unify Canonical Country and City Information")


def _most_frequent_value(values):
    """Return the most frequent non-missing value, or None if there is none.

    ``Series.mode()`` ignores missing values, so a group that contains only
    NaN yields an empty result; indexing that result directly would raise.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Work on an independent frame so the column assignments below never target
# a slice of another DataFrame
final_df = final_df.copy()

# Each organization gets the most frequent country/city among its records.
# No records are removed here: rows without an organization belong to no
# group, so the mapping leaves their country and city missing.
canonical_country_map = final_df.groupby('organization_cleaned')['country'].apply(_most_frequent_value)
final_df['country'] = final_df['organization_cleaned'].map(canonical_country_map)

canonical_city_map = final_df.groupby('organization_cleaned')['city'].apply(_most_frequent_value)
final_df['city'] = final_df['organization_cleaned'].map(canonical_city_map)


# =================================================================
# Step 6. Final Data Saving
# =================================================================
print("\nStep 6. Fianal Data Saving...")

# Write the unified table next to the inputs: Parquet first, then CSV
for write_table, file_name in (
    (final_df.to_parquet, 'quant_author_ed_eu_val_final_unified.parquet'),
    (final_df.to_csv, 'quant_author_ed_eu_val_final_unified.csv'),
):
    write_table(dir + file_name, index=False)

# Report how the 'rest' records were resolved across the two passes
total_rest = len(df_rest)
p1_count = len(rest_pass1_success)
p2_count = len(rest_pass2_success)
final_fail_count = len(rest_final_unmatched)
summary_lines = [
    "\n--- Summary ---",
    f"Among {total_rest} 'rest' records,",
    f"  - rule-based matching successful: {p1_count} 건",
    f"  - inference-based matching successful: {p2_count} 건",
    f"  - final matching fail: {final_fail_count} 건",
    "\nAll completed. Final data is now at 'quant_author_ed_eu_val_final_unified.csv'.",
]
print("\n".join(summary_lines))