# **양자과학기술 저자명 통일**

- column 'matched'로 표시된 row는 정확한 정보를 의미함
- WoS에서 절반밖에 제공하지 않음
- affiliation 정보가 있는 저자만 정확한 정보라고 볼 수 있음 

### Rule-based Approach

In [3]:
import pandas as pd
import numpy as np

dir = 'H:/GD_awekimm/[YU]/[Project]/[Quantum]/Quantum_2nd/04_Analysis/QuanTech_R2/R file/'

# 1, 2, 3단계는 이전과 완전히 동일합니다.
# 1. 데이터 로딩 및 전처리
df = pd.read_csv(dir+'quant_author_ed_eu_val.csv')
inst = pd.read_csv(dir+'quant_inst_ed_eu_val_cleaned.csv')

required_columns = [
    'pubid', 'city', 'country', 'pubyear', 'author_id_te', 
    'full_name', 'matched', 'organization_cleaned', 'suborganization_cleaned'
]
df = df[required_columns]
inst = inst[['pubid', 'organization_cleaned', 'suborganization_cleaned']]

df = df.drop_duplicates()
df['full_name'] = df['full_name'].str.replace(r'\.$', '', regex=True)

# 2. 'matched'와 'rest' 데이터 분리 및 준비
df_matched = df[df['matched'] == 'matched'].copy()
df_rest = df[df['matched'] == 'rest'].copy()

df_rest = df_rest.drop(columns=['organization_cleaned', 'suborganization_cleaned'])
df_rest = pd.merge(df_rest, inst, on='pubid', how='left')

df_final = pd.concat([df_matched, df_rest], ignore_index=True)

# 3. ID 통일 로직 실행
min_ids = df_final.groupby(['full_name', 'organization_cleaned'])['author_id_te'].min()
matched_ids = df_final[df_final['matched'] == 'matched'].groupby(['full_name', 'organization_cleaned'])['author_id_te'].first()
canonical_s = matched_ids.combine_first(min_ids)
canonical_ids = canonical_s.reset_index(name='canonical_id')
canonical_ids['canonical_id'] = canonical_ids['canonical_id'].astype(df_final['author_id_te'].dtype)

df_final = df_final.merge(canonical_ids, on=['full_name','organization_cleaned'], how='left')

final_id_from_matched = df_final[df_final['matched'] == 'matched'].groupby('full_name')['canonical_id'].first()
default_final_id = df_final.groupby('full_name')['canonical_id'].min()
final_id_map = final_id_from_matched.combine_first(default_final_id)
df_final['author_id_te_cleaned'] = df_final['full_name'].map(final_id_map)

df_final['cleaned_or_not'] = np.where(
    df_final['author_id_te'] == df_final['author_id_te_cleaned'], 'original', 'cleaned'
)
df_final = df_final.drop(columns=['canonical_id'])


# --- ⭐️ 여기서부터 수정된 매칭 로직 시작 ⭐️ ---

print("ID 통일 완료. 'rest' 저자 매칭을 시작합니다...")

# 4. 'rest' 저자 매칭을 위한 준비
# 4-1. 저자 프로필 생성 (기존과 동일)
author_history = df_final.groupby('author_id_te_cleaned')['organization_cleaned'].unique().apply(list)

# 4-2. 논문별 참여 기관 목록 생성 (기존과 동일)
pub_orgs = df_final.groupby('pubid')['organization_cleaned'].unique().apply(list)

# ⭐️ (규칙 추가) 4-3. 논문별 '제외할 기관' 맵 생성
# matched 저자의 소속 기관은 다른 rest 저자들의 후보에서 제외합니다.
forbidden_orgs_map = df_final[df_final['matched'] == 'matched'].groupby('pubid')['organization_cleaned'].first()

# 4-4. 처리할 'rest' 데이터 분리 (기존과 동일)
final_matched = df_final[df_final['matched'] == 'matched'].copy()
final_rest = df_final[df_final['matched'] == 'rest'].copy()


# 5. 'rest' 저자 매칭 시도
rest_unique_authors = final_rest.drop_duplicates(subset=['pubid', 'author_id_te_cleaned']).copy()

# ⭐️ (규칙 추가) 매칭 함수에 '제외 규칙' 로직 추가
def find_likely_org_with_rule(row, author_history_map, pub_orgs_map, forbidden_map):
    author_id = row['author_id_te_cleaned']
    pub_id = row['pubid']
    
    history_orgs = author_history_map.get(author_id, [])
    # 원본 리스트 수정을 방지하기 위해 .copy() 사용
    candidate_orgs = pub_orgs_map.get(pub_id, []).copy()
    
    # 새로운 규칙 적용: 이 논문의 '제외할 기관'이 있다면 후보군에서 제거
    forbidden_org = forbidden_map.get(pub_id)
    if forbidden_org and forbidden_org in candidate_orgs:
        candidate_orgs.remove(forbidden_org)
            
    # 필터링된 후보군과 저자 이력을 비교
    intersection = [org for org in candidate_orgs if org in history_orgs]
    
    if len(intersection) == 1:
        return intersection[0]
    else:
        return np.nan

# 수정된 매칭 함수를 적용
rest_unique_authors['likely_organization'] = rest_unique_authors.apply(
    lambda row: find_likely_org_with_rule(row, author_history, pub_orgs, forbidden_orgs_map), axis=1
)

# 6. 결과 결합 (이전과 동일)
rest_matched_success = rest_unique_authors.dropna(subset=['likely_organization'])
rest_matched_success['organization_cleaned'] = rest_matched_success['likely_organization']
rest_matched_success = rest_matched_success[final_matched.columns] 

failed_keys = rest_unique_authors[rest_unique_authors['likely_organization'].isna()][['pubid', 'author_id_te_cleaned']]
rest_still_unmatched = pd.merge(final_rest, failed_keys, on=['pubid', 'author_id_te_cleaned'])

final_combined = pd.concat([final_matched, rest_matched_success, rest_still_unmatched], ignore_index=True)

# 7. 최종 저장 (이전과 동일)
final_combined = final_combined.drop_duplicates()
final_combined.to_parquet(dir + 'quant_author_ed_eu_val_final_matched.parquet')


print("모든 작업 완료. 최종 데이터가 저장되었습니다.")
matched_count = len(rest_matched_success)
total_rest_authors = len(rest_unique_authors)
print(f"총 {total_rest_authors}개의 'rest' 저자-논문 조합 중 {matched_count}개를 새로운 규칙에 따라 매칭 성공!")

ID 통일 완료. 'rest' 저자 매칭을 시작합니다...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rest_matched_success['organization_cleaned'] = rest_matched_success['likely_organization']


모든 작업 완료. 최종 데이터가 저장되었습니다.
총 41547개의 'rest' 저자-논문 조합 중 15492개를 새로운 규칙에 따라 매칭 성공!


In [4]:
final_combined.head()

Unnamed: 0,pubid,city,country,pubyear,author_id_te,full_name,matched,organization_cleaned,suborganization_cleaned,author_id_te_cleaned,cleaned_or_not
0,52091.0,LAQUILA,ITALY,1998,5945201.0,"Carelli, P",matched,UNIVERSITY OF AQUILA,DIPARTIMENTO ENERGET,5945201.0,original
1,62208.0,BERLIN,GERMANY,1998,3941773.0,"Beyer, J",matched,PHYSIKALISCH-TECHNISCHE BUNDESANSTALT (PTB),PHYS TECH BUNDESANSTALT,3941773.0,original
2,114324.0,JENA,GERMANY,1998,35695931.0,"Schmidl, F",matched,FRIEDRICH SCHILLER UNIVERSITY OF JENA,INST FESTKORPERPHYS,35695931.0,original
3,260373.0,JULICH,GERMANY,1998,6902516.0,"Chesca, B",matched,FORSCHUNGSZENTRUM JULICH,INST SCHICHT & IONENTECH,6902516.0,original
4,273661.0,JULICH,GERMANY,1998,7320318.0,"Chong, Y",matched,FORSCHUNGSZENTRUM JULICH,INST SCHICHT & IONENTECH,7320318.0,original


### Hybrid version

In [8]:
import pandas as pd
import numpy as np
from collections import Counter

dir = 'H:/GD_awekimm/[YU]/[Project]/[Quantum]/Quantum_2nd/04_Analysis/QuanTech_R2/R file/'

# =================================================================
# [1단계] 초기 데이터 준비 및 ID 통합
# =================================================================
print("[1단계] 데이터 로딩 및 ID 통합을 시작합니다...")

# 1-1. 데이터 로딩
df = pd.read_csv(dir+'quant_author_ed_eu_val.csv')
inst = pd.read_csv(dir+'quant_inst_ed_eu_val_cleaned.csv')

required_columns = [
    'pubid', 'city', 'country', 'pubyear', 'author_id_te',
    'full_name', 'matched', 'organization_cleaned', 'suborganization_cleaned'
]
df = df[required_columns]
inst = inst[['pubid', 'organization_cleaned', 'suborganization_cleaned']]
df = df.drop_duplicates()
df['full_name'] = df['full_name'].str.replace(r'\.$', '', regex=True)

# 1-2. 'matched'와 'rest' 데이터 분리 및 재결합 (ID 통일 목적)
df_matched = df[df['matched'] == 'matched'].copy()
df_rest = df[df['matched'] == 'rest'].copy()
df_rest_for_id = df_rest.drop(columns=['organization_cleaned', 'suborganization_cleaned'])
df_rest_for_id = pd.merge(df_rest_for_id, inst, on='pubid', how='left')
df_for_id_unification = pd.concat([df_matched, df_rest_for_id], ignore_index=True)

# 1-3. ID 통일 로직 실행 (전체 데이터 대상)
min_ids = df_for_id_unification.groupby(['full_name', 'organization_cleaned'])['author_id_te'].min()
matched_ids = df_for_id_unification[df_for_id_unification['matched'] == 'matched'].groupby(['full_name', 'organization_cleaned'])['author_id_te'].first()
canonical_s = matched_ids.combine_first(min_ids)
canonical_ids = canonical_s.reset_index(name='canonical_id')
canonical_ids['canonical_id'] = canonical_ids['canonical_id'].astype(df['author_id_te'].dtype)
df_for_id_unification = df_for_id_unification.merge(canonical_ids, on=['full_name','organization_cleaned'], how='left')

final_id_from_matched = df_for_id_unification[df_for_id_unification['matched'] == 'matched'].groupby('full_name')['canonical_id'].first()
default_final_id = df_for_id_unification.groupby('full_name')['canonical_id'].min()
final_id_map = final_id_from_matched.combine_first(default_final_id)
df_for_id_unification['author_id_te_cleaned'] = df_for_id_unification['full_name'].map(final_id_map)

# 1-4. 원본 'rest' 데이터에 통일된 ID 부착
df_rest = pd.merge(df_rest, df_for_id_unification[['pubid', 'author_id_te', 'author_id_te_cleaned']].drop_duplicates(), on=['pubid', 'author_id_te'])

print("ID 통합 완료.")

# =================================================================
# [2단계] PASS 1: 규칙 기반 정밀 매칭
# =================================================================
print("\n[2단계] PASS 1: 규칙 기반 정밀 매칭을 시작합니다...")

# 2-1. 매칭 준비
author_history = df_for_id_unification.groupby('author_id_te_cleaned')['organization_cleaned'].unique().apply(list)
pub_orgs = df_for_id_unification.groupby('pubid')['organization_cleaned'].unique().apply(list)
forbidden_orgs_map = df_for_id_unification[df_for_id_unification['matched'] == 'matched'].groupby('pubid')['organization_cleaned'].first()

# 2-2. 매칭 함수 정의
def find_likely_org_with_rule(row, author_history_map, pub_orgs_map, forbidden_map):
    author_id = row['author_id_te_cleaned']
    pub_id = row['pubid']
    history_orgs = author_history_map.get(author_id, [])
    candidate_orgs = pub_orgs_map.get(pub_id, []).copy()
    
    forbidden_org = forbidden_map.get(pub_id)
    if forbidden_org and forbidden_org in candidate_orgs:
        candidate_orgs.remove(forbidden_org)
        
    intersection = [org for org in candidate_orgs if org in history_orgs]
    
    if len(intersection) == 1:
        return intersection[0]
    else:
        return np.nan

# 2-3. 매칭 실행
df_rest['likely_organization'] = df_rest.apply(
    lambda row: find_likely_org_with_rule(row, author_history, pub_orgs, forbidden_orgs_map), axis=1
)

# 2-4. 결과 분리: 성공 그룹과 실패 그룹
rest_pass1_success = df_rest.dropna(subset=['likely_organization']).copy()
rest_pass1_success['organization_cleaned'] = rest_pass1_success['likely_organization']
rest_pass1_success['match_method'] = 'Rule-Based' # 매칭 방식 기록

rest_still_unmatched = df_rest[df_rest['likely_organization'].isna()].copy()

print(f"PASS 1 결과: {len(rest_pass1_success)} 건 매칭 성공.")

# =================================================================
# [3단계] PASS 2: 추론 기반 포괄적 매칭
# =================================================================
print("\n[3단계] PASS 2: 추론 기반 포괄적 매칭을 시작합니다...")

# 3-1. 추론 준비 ('matched' 데이터 기반 증거 맵 생성)
inst_location_map = df_matched.groupby('organization_cleaned')[['city', 'country']].first().to_dict('index')
pub_org_map_inference = df_matched.groupby('pubid')['organization_cleaned'].unique().apply(list).to_dict()

# 3-2. (증거 수집) PASS 1 실패 데이터 대상 후보 기관 수집
candidate_orgs_inference = []
for idx, row in rest_still_unmatched.iterrows():
    pubid = row['pubid']
    city = row['city']
    country = row['country']
    candidates = []
    
    # [증거 1: 공동 저자]
    if pubid in pub_org_map_inference:
        candidates.extend(pub_org_map_inference[pubid])
    
    # [증거 2: 지리 정보]
    possible_orgs = inst[inst['pubid'] == pubid]['organization_cleaned'].unique()
    for org in possible_orgs:
        if org in inst_location_map:
            loc = inst_location_map[org]
            if loc['city'] == city and loc['country'] == country:
                candidates.append(org)
    candidate_orgs_inference.append(candidates)

rest_still_unmatched['candidate_orgs'] = candidate_orgs_inference

# 3-3. 저자 프로파일링 및 추론
# 하이퍼파라미터
FREQUENCY_THRESHOLD = 5
HIGH_CONFIDENCE_THRESHOLD = 0.8

author_counts = df['full_name'].value_counts()
frequent_authors = set(author_counts[author_counts >= FREQUENCY_THRESHOLD].index)

author_profiles = {}
# 'matched' 데이터와 'PASS 1 성공' 데이터로 프로필 강화
temp_df_for_profile = pd.concat([df_matched, rest_pass1_success])
for _, row in temp_df_for_profile.iterrows():
    author = row['full_name']
    org = row['organization_cleaned']
    if author not in author_profiles:
        author_profiles[author] = []
    author_profiles[author].append(org)

# 'PASS 2 대상' 데이터의 후보 기관들을 프로필에 추가
for _, row in rest_still_unmatched.iterrows():
    author = row['full_name']
    if author not in author_profiles:
        author_profiles[author] = []
    author_profiles[author].extend(row['candidate_orgs'])

# 추론 실행
inferred_org_map = {}
for author, org_list in author_profiles.items():
    if not org_list: continue
    counts = Counter(org_list)
    most_common_org, top_count = counts.most_common(1)[0]
    
    if author in frequent_authors: # Safe Mode
        confidence = top_count / len(org_list)
        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            inferred_org_map[author] = most_common_org
    else: # Standard Mode
        inferred_org_map[author] = most_common_org

# 3-4. 추론 결과 적용
rest_still_unmatched['inferred_org'] = rest_still_unmatched['full_name'].map(inferred_org_map)
rest_pass2_success = rest_still_unmatched.dropna(subset=['inferred_org']).copy()
rest_pass2_success['organization_cleaned'] = rest_pass2_success['inferred_org']
rest_pass2_success['match_method'] = 'Inference-Based' # 매칭 방식 기록

rest_final_unmatched = rest_still_unmatched[rest_still_unmatched['inferred_org'].isna()]

print(f"PASS 2 결과: {len(rest_pass2_success)} 건 추가 매칭 성공.")

# =================================================================
# [4단계] 최종 결과 종합 및 저장 (수정된 코드)
# =================================================================
print("\n[4단계] 최종 결과를 종합합니다...")

# 4-1. 각 데이터 그룹에 최종 컬럼 추가 및 정리
# (수정) df_matched에도 통일된 ID(author_id_te_cleaned)를 부여합니다.
df_matched = pd.merge(
    df_matched,
    df_for_id_unification[['pubid', 'author_id_te', 'author_id_te_cleaned']].drop_duplicates(),
    on=['pubid', 'author_id_te'],
    how='left'
)
df_matched['match_method'] = 'Original'

# (수정) 최종 매칭 실패 그룹에도 match_method 값을 할당합니다.
rest_final_unmatched['match_method'] = 'Unmatched'

# 4-2. 최종적으로 사용할 컬럼 리스트를 정의합니다.
#    - suborganization_cleaned 는 추론 과정에서 사용되지 않았으므로 유지
final_output_columns = [
    'pubid', 'city', 'country', 'pubyear', 'author_id_te',
    'full_name', 'matched', 'organization_cleaned', 'suborganization_cleaned',
    'author_id_te_cleaned', 'match_method'
]

# 4-3. (수정) 모든 데이터프레임의 컬럼을 통일한 후 안전하게 결합합니다.
final_df = pd.concat([
    df_matched[final_output_columns],
    rest_pass1_success[final_output_columns],
    rest_pass2_success[final_output_columns],
    rest_final_unmatched[final_output_columns]
], ignore_index=True)


# 4-4. 최종 정리 및 저장
final_df = final_df.drop_duplicates(subset=['author_id_te_cleaned', 'pubid', 'organization_cleaned'])
final_df.to_parquet(dir + 'quant_author_ed_eu_val_hybrid_matched.parquet')


print("\n모든 작업 완료. 최종 데이터가 저장되었습니다.")
total_rest = len(df_rest)
p1_count = len(rest_pass1_success)
p2_count = len(rest_pass2_success)
final_fail_count = len(rest_final_unmatched)
print(f"총 {total_rest}개의 'rest' 레코드 중,")
print(f"  - 규칙 기반 매칭 성공: {p1_count} 건")
print(f"  - 추론 기반 매칭 성공: {p2_count} 건")
print(f"  - 최종 매칭 실패: {final_fail_count} 건")

[1단계] 데이터 로딩 및 ID 통합을 시작합니다...
ID 통합 완료.

[2단계] PASS 1: 규칙 기반 정밀 매칭을 시작합니다...
PASS 1 결과: 19056 건 매칭 성공.

[3단계] PASS 2: 추론 기반 포괄적 매칭을 시작합니다...
PASS 2 결과: 20292 건 추가 매칭 성공.

[4단계] 최종 결과를 종합합니다...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rest_final_unmatched['match_method'] = 'Unmatched'



모든 작업 완료. 최종 데이터가 저장되었습니다.
총 232025개의 'rest' 레코드 중,
  - 규칙 기반 매칭 성공: 19056 건
  - 추론 기반 매칭 성공: 20292 건
  - 최종 매칭 실패: 192677 건


In [9]:
final_df.head()

Unnamed: 0,pubid,city,country,pubyear,author_id_te,full_name,matched,organization_cleaned,suborganization_cleaned,author_id_te_cleaned,match_method
0,52091.0,LAQUILA,ITALY,1998,5945201.0,"Carelli, P",matched,UNIVERSITY OF AQUILA,DIPARTIMENTO ENERGET,5945201.0,Original
1,62208.0,BERLIN,GERMANY,1998,3941773.0,"Beyer, J",matched,PHYSIKALISCH-TECHNISCHE BUNDESANSTALT (PTB),PHYS TECH BUNDESANSTALT,3941773.0,Original
2,114324.0,JENA,GERMANY,1998,35695931.0,"Schmidl, F",matched,FRIEDRICH SCHILLER UNIVERSITY OF JENA,INST FESTKORPERPHYS,35695931.0,Original
3,260373.0,JULICH,GERMANY,1998,6902516.0,"Chesca, B",matched,FORSCHUNGSZENTRUM JULICH,INST SCHICHT & IONENTECH,6902516.0,Original
4,273661.0,JULICH,GERMANY,1998,7320318.0,"Chong, Y",matched,FORSCHUNGSZENTRUM JULICH,INST SCHICHT & IONENTECH,7320318.0,Original


### Author network

In [19]:
import pandas as pd
import networkx as nx

dir = 'H:/GD_awekimm/[YU]/[Project]/[Quantum]/Quantum_2nd/04_Analysis/QuanTech_R2/R file/'
author_data_path = dir + 'quant_author_ed_eu_val_hybrid_matched.parquet'
category_data_path = dir + 'quant_author_ed_eu_val.csv'
output_csv_path = dir + 'network_analysis_top5_authors.csv'

### Data Loading and Preparation
print("데이터를 불러오고 결합하는 중입니다...")

df_author_clean = pd.read_parquet(author_data_path)
df_category = pd.read_csv(category_data_path, usecols=['pubid', 'qc_category']).drop_duplicates()
df = pd.merge(df_author_clean, df_category, on='pubid', how='left')

required_cols = ['full_name', 'organization_cleaned', 'country', 'qc_category', 'author_id_te_cleaned']
df_net = df[required_cols].dropna()
df_net = df_net[df_net['organization_cleaned'] != '']

print("데이터 준비 완료.")

results_list = []

### Network Analysis
categories = df_net['qc_category'].unique()

for category in categories:
    print("\n" + "="*50)
    print(f" Category: {category} Start Analysis")
    print("="*50)
    
    df_cat = df_net[df_net['qc_category'] == category]
    
    if len(df_cat) < 5:
        print(" Skip -> too small sample.")
        continue
        
    G = nx.Graph()
    
    for _, row in df_cat.iterrows():
        author = row['full_name']
        org = row['organization_cleaned']
        country = row['country']
        
        G.add_node(author, type='author')
        G.add_node(org, type='organization')
        G.add_node(country, type='country')
        
        G.add_edge(author, org)
        G.add_edge(org, country)
        
    print(f" -> Network Analysis completed (Node: {G.number_of_nodes()}, Edge: {G.number_of_edges()})")
    
    print(" -> Measuring centrality indices...")
    degree_centrality = nx.degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)

    try:
        eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000, tol=1e-06)
    except nx.PowerIterationFailedConvergence:
        print(" Skip -> Eigenvector Centrality")
        eigenvector_centrality = None

    centrality_results = {
        "Degree Centrality": degree_centrality,
        "Betweenness Centrality": betweenness_centrality,
        "Eigenvector Centrality": eigenvector_centrality
    }
    
    for name, result in centrality_results.items():
        if result is None: continue
        
        author_nodes = {node: data['type'] for node, data in G.nodes(data=True) if data['type'] == 'author'}
        author_centrality = {author: result[author] for author in author_nodes}
        
        sorted_authors = sorted(author_centrality.items(), key=lambda item: item[1], reverse=True)
        
        print(f"\n--- 👑 {name} TOP 5 ---")

        for i, (author, score) in enumerate(sorted_authors[:5]):
            author_info = df_cat[df_cat['full_name'] == author]
            
            if not author_info.empty:
                top_org = author_info['organization_cleaned'].mode()[0]
                country = author_info[author_info['organization_cleaned'] == top_org]['country'].iloc[0]
                author_id = author_info['author_id_te_cleaned'].iloc[0] 
            else:
                top_org, country, author_id = "N/A", "N/A", "N/A"

            print(f"{i+1}. {author} (ID: {author_id}, Score: {score:.4f})")
            print(f"   ┖ 소속: {top_org} ({country})")
            
            results_list.append({
                'category': category,
                'centrality_type': name,
                'rank': i + 1,
                'author_id': author_id, 
                'author': author,
                'score': score,
                'organization': top_org,
                'country': country
            })

print("\n" + "="*50)
print("Analysis completed.")

results_df = pd.DataFrame(results_list)
results_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

print("Saving .csv file completed.")
print("="*50)

데이터를 불러오고 결합하는 중입니다...
데이터 준비 완료.

 Category: qc111 Start Analysis
 -> Network Analysis completed (Node: 9735, Edge: 13412)
 -> Measuring centrality indices...

--- 👑 Degree Centrality TOP 5 ---
1. Carelli, P (ID: 5945201.0, Score: 0.0020)
   ┖ 소속: UNIVERSITY OF AQUILA (ITALY)
2. Torrioli, G (ID: 39921400.0, Score: 0.0020)
   ┖ 소속: CONSIGLIO NAZIONALE DELLE RICERCHE (CNR) (ITALY)
3. Silvestrini, P (ID: 37029884.0, Score: 0.0017)
   ┖ 소속: SECONDA UNIVERSITA DEGLI STUDI DI NAPOLI (ITALY)
4. Cosmelli, C (ID: 8020268.0, Score: 0.0014)
   ┖ 소속: SAPIENZA UNIVERSITY ROME (ITALY)
5. Leoni, R (ID: 22284448.0, Score: 0.0014)
   ┖ 소속: CONSIGLIO NAZIONALE DELLE RICERCHE (CNR) (ITALY)

--- 👑 Betweenness Centrality TOP 5 ---
1. Bouwmeester, D (ID: 4801928.0, Score: 0.0115)
   ┖ 소속: UNIVERSITY OF INNSBRUCK (AUSTRIA)
2. Silvestrini, P (ID: 37029884.0, Score: 0.0097)
   ┖ 소속: SECONDA UNIVERSITA DEGLI STUDI DI NAPOLI (ITALY)
3. Michotte, S (ID: 26756541.0, Score: 0.0094)
   ┖ 소속: UNIVERSITE CATHOLIQUE L

In [17]:
import pandas as pd

dir = 'H:/GD_awekimm/[YU]/[Project]/[Quantum]/Quantum_2nd/04_Analysis/QuanTech_R2/R file/'

input_csv_path = dir + 'network_analysis_top5_authors.csv'

output_csv_path = dir + 'top_authors_master_list.csv'

try:
    print(f"입력 파일을 불러옵니다: {input_csv_path}")
    df = pd.read_csv(input_csv_path)
except FileNotFoundError:
    print(f"[에러] 입력 파일이 없습니다. 먼저 네트워크 분석 코드를 실행해주세요.")
    exit()

df_authors = df[['qc_category', 'full_name', 'author_id_te_cleaned']].copy()

# 3. 중복 제거
# 한 연구자가 여러 중심성 지표(Degree, Betweenness 등)에서 상위권에 들 수 있으므로,
# 분야별 고유한 연구자 명단을 만들기 위해 중복을 제거합니다.
print("중복된 연구자 정보를 제거하여 고유한 명단을 생성합니다.")
df_authors.drop_duplicates(inplace=True)

# 4. 결과 정렬 (가독성을 위해)
df_authors.sort_values(by=['qc_category', 'full_name'], inplace=True)

# 5. 새로운 CSV 파일로 저장
print(f"최종 명단을 CSV 파일로 저장합니다: {output_csv_path}")
# 엑셀에서 한글이 깨지지 않도록 'utf-8-sig' 인코딩 사용
df_authors.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

print("\n✨ 작업 완료!")
print(f"총 {len(df_authors)}명의 고유한 상위 연구자 명단이 생성되었습니다.")
print("\n생성된 파일의 상위 5줄 미리보기:")
print(df_authors.head())

입력 파일을 불러옵니다: H:/GD_awekimm/[YU]/[Project]/[Quantum]/Quantum_2nd/04_Analysis/QuanTech_R2/R file/network_analysis_top5_authors.csv


KeyError: "None of [Index(['qc_category', 'full_name', 'author_id_te_cleaned'], dtype='object')] are in the [columns]"

**교수님 피드백**
- 전체를 정교화하는게 아니라, 중요한 저자의 식별이 중요함 
- 분야별 논문 수 상위 10명 매뉴얼로 확인

In [1]:
import pandas as pd 
df = pd.read_csv('quant_author_ed_eu_val.csv')
df

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,ID_all,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched
0,1,52091.0,qc111,1,1.0,UNIV AQUILA,DIPARTIMENTO ENERGET,LAQUILA,ITALY,1998,52091-qc111-1-1,author,5945201.0,"Carelli, P","Carelli, P","Carelli, P",P,Carelli,matched
1,2,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,52091-qc111-2-NA,author,5945201.0,"Carelli, P","Carelli, P","Carelli, P",P,Carelli,random
2,3,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,52091-qc111-2-NA,author,6290927.0,"Castellano, MG","Castellano, MG","Castellano, MG",MG,Castellano,random
3,4,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,52091-qc111-2-NA,author,39921397.0,"Torrioli, G","Torrioli, G","Torrioli, G",G,Torrioli,random
4,5,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,52091-qc111-2-NA,author,22284443.0,"Leoni, R","Leoni, R","Leoni, R",R,Leoni,random
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350989,1350990,46599931.0,qc43,3,2.0,INFN,SEZ CATANIA,CATANIA,ITALY,2021,46599931-qc43-3-2,author,13492143.0,"Giannelli, Luigi","Giannelli, Luigi","Giannelli, L",Luigi,Giannelli,matched
1350990,1350991,46599931.0,qc43,4,3.0,UNIV COPENHAGEN,CTR HYBRID QUANTUM NETWORKS,COPENHAGEN,DENMARK,2021,46599931-qc43-4-3,author,37550262.0,"Sorensen, Anders S.","Sorensen, Anders S.","Sorensen, AS",Anders S.,Sorensen,matched
1350991,1350992,46599931.0,qc43,4,3.0,UNIV COPENHAGEN,NIELS BOHR INST,COPENHAGEN,DENMARK,2021,46599931-qc43-4-3,author,37550262.0,"Sorensen, Anders S.","Sorensen, Anders S.","Sorensen, AS",Anders S.,Sorensen,matched
1350992,1350993,46599931.0,qc43,4,3.0,NIELS BOHR INSTITUTE,CTR HYBRID QUANTUM NETWORKS,COPENHAGEN,DENMARK,2021,46599931-qc43-4-3,author,37550262.0,"Sorensen, Anders S.","Sorensen, Anders S.","Sorensen, AS",Anders S.,Sorensen,matched


In [10]:
# group by author_id_te and qc_category and count the number of unique pubid 
df_grouped = df.groupby(['author_id_te', 'qc_category']).agg({'pubid': 'nunique'}).reset_index()
df_grouped.rename(columns={'pubid': 'pub_count'}, inplace=True)
df_grouped

Unnamed: 0,author_id_te,qc_category,pub_count
0,79.0,qc41,1
1,761.0,qc43,1
2,764.0,qc224,1
3,853.0,qc133,1
4,853.0,qc134,1
...,...,...,...
334947,46522883.0,qc422,1
334948,46523096.0,qc224,1
334949,46523852.0,qc224,1
334950,46523976.0,qc43,1


In [11]:
df_grouped.sort_values(by='pub_count', ascending=False, inplace=True)
df_grouped.head(100)

Unnamed: 0,author_id_te,qc_category,pub_count
268380,36684500.0,qc211,66
330306,45827585.0,qc143,52
268386,36684500.0,qc224,51
305801,41795374.0,qc422,45
179715,24994194.0,qc143,43
...,...,...,...
131224,17954312.0,qc43,24
238182,32993380.0,qc224,24
126095,17192650.0,qc224,24
126761,17294970.0,qc422,24


In [12]:
df_grouped.describe()

Unnamed: 0,author_id_te,pub_count
count,334952.0,334952.0
mean,23096130.0,1.458967
std,13188730.0,1.417166
min,79.0,1.0
25%,11749160.0,1.0
50%,22912990.0,1.0
75%,34758290.0,1.0
max,46524040.0,66.0


In [2]:
# check number of unique author_id_te 
df['author_id_te'].nunique()

193678

In [3]:
# check rows where the same 'author_id_te' has different 'wos_standard' 

df[df.duplicated(subset=['author_id_te'], keep=False) 
   & ~df.duplicated(subset=['author_id_te', 'wos_standard'], keep=False)]

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,ID_all,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched


In [4]:
# check rows where the same 'wos_standard' has different 'author_id_te'

df[df.duplicated(subset=['wos_standard'], keep=False) 
   & ~df.duplicated(subset=['wos_standard', 'author_id_te'], keep=False)]

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,ID_all,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched
1672,1673,3602978.0,qc111,2,,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2000,3602978-qc111-2-NA,author,12991795.0,"Garcia, D","Garcia, D","Garcia, D",D,Garcia,random
1674,1675,3602978.0,qc111,2,,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2000,3602978-qc111-2-NA,author,1963866.0,"Asenjo, A","Asenjo, A","Asenjo, A",A,Asenjo,random
1676,1677,3602978.0,qc111,2,,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2000,3602978-qc111-2-NA,author,25200187.0,"Mandal, K","Mandal, K","Mandal, K",K,Mandal,random
1679,1680,3602978.0,qc111,2,,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2000,3602978-qc111-2-NA,author,15980894.0,"Hernando, A","Hernando, A","Hernando, A",A,Hernando,random
3218,3219,5739314.0,qc111,2,,TU WIEN,INST THEORET PHYS,VIENNA,AUSTRIA,2003,5739314-qc111-2-NA,author,36841768.0,"Sjoqvist, E","Sjoqvist, E","Sjoqvist, E",E,Sjoqvist,random
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350541,1350542,46564882.0,qc43,1,1.0,ETH ZURICH,INST QUANTUM ELECT,ZURICH,SWITZERLAND,2021,46564882-qc43-1-1,author,11651473.0,"Ferri, Francesco","Ferri, Francesco","Ferri, F",Francesco,Ferri,matched
1350543,1350544,46564882.0,qc43,1,3.0,ETH ZURICH,INST QUANTUM ELECT,ZURICH,SWITZERLAND,2021,46564882-qc43-1-3,author,11768669.0,"Finger, Fabian","Finger, Fabian","Finger, F",Fabian,Finger,matched
1350647,1350648,46575304.0,qc43,1,1.0,UNIV COLL DUBLIN,SCH PHYS,DUBLIN,IRELAND,2021,46575304-qc43-1-1,author,19833043.0,"Kiely, Anthony","Kiely, Anthony","Kiely, A",Anthony,Kiely,matched
1350811,1350812,46577964.0,qc43,2,4.0,WAGENINGEN UNIV & RES,HORT & PROD PHYSIOL,WAGENINGEN,NETHERLANDS,2021,46577964-qc43-2-4,author,18958659.0,"Kaiser, Elias","Kaiser, Elias","Kaiser, E",Elias,Kaiser,matched


In [5]:
df[df['wos_standard'] == 'Asenjo, A']

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,ID_all,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched
1674,1675,3602978.0,qc111,2,,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2000,3602978-qc111-2-NA,author,1963866.0,"Asenjo, A","Asenjo, A","Asenjo, A",A,Asenjo,random
1008198,1008199,10948659.0,qc422,1,1.0,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2006,10948659-qc422-1-1,author,1963902.0,"Asenjo, A.","Asenjo, A.","Asenjo, A",A.,Asenjo,matched


**국가, 도시, 기관명, 저자명은 같은데, author_id_te가 다른 경우가 있다.**

# **Author ID Merging Rule**
- 'organization_cleaned'와 'full_name'가 같은 경우, 'author_id_te'를 통합

In [27]:
df['full_name'] = df['full_name'].str.replace(r'\.$', '', regex=True)

In [28]:
df[df['wos_standard'] == 'Asenjo, A']

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,ID_all,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched
1674,1675,3602978.0,qc111,2,,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2000,3602978-qc111-2-NA,author,1963866.0,"Asenjo, A","Asenjo, A","Asenjo, A",A,Asenjo,random
1008198,1008199,10948659.0,qc422,1,1.0,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2006,10948659-qc422-1-1,author,1963902.0,"Asenjo, A.","Asenjo, A","Asenjo, A",A.,Asenjo,matched


In [29]:
# If organization_cleaned and full_name are identical but author_id_te is different, merge them with one author_id_te in a new column 'author_id_te_cleaned'
df['author_id_te_cleaned'] = (
    df.groupby(['organization_cleaned', 'full_name'])['author_id_te']
      .transform('first')
)

In [30]:
df['cleaned_or_not'] = (df['author_id_te_cleaned'] != df['author_id_te']).astype(int)
df.head()

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,...,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched,author_id_te_cleaned,cleaned_or_not
0,1,52091.0,qc111,1,1.0,UNIV AQUILA,DIPARTIMENTO ENERGET,LAQUILA,ITALY,1998,...,author,5945201.0,"Carelli, P","Carelli, P","Carelli, P",P,Carelli,matched,5945201.0,0
1,2,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,...,author,5945201.0,"Carelli, P","Carelli, P","Carelli, P",P,Carelli,random,5945201.0,0
2,3,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,...,author,6290927.0,"Castellano, MG","Castellano, MG","Castellano, MG",MG,Castellano,random,6290927.0,0
3,4,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,...,author,39921397.0,"Torrioli, G","Torrioli, G","Torrioli, G",G,Torrioli,random,39921397.0,0
4,5,52091.0,qc111,2,,CNR,IST ELETTR STATO SOLIDO,ROME,ITALY,1998,...,author,22284443.0,"Leoni, R","Leoni, R","Leoni, R",R,Leoni,random,22284443.0,0


In [34]:
# check number of unique author_id_te 
df['author_id_te'].nunique()

193678

In [33]:
# check number of unique author_id_te 
df['author_id_te_cleaned'].nunique()

177770

In [31]:
df[df['wos_standard'] == 'Asenjo, A']

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,...,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched,author_id_te_cleaned,cleaned_or_not
1674,1675,3602978.0,qc111,2,,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2000,...,author,1963866.0,"Asenjo, A","Asenjo, A","Asenjo, A",A,Asenjo,random,1963866.0,0
1008198,1008199,10948659.0,qc422,1,1.0,CSIC,INST CIENCIA MAT,MADRID,SPAIN,2006,...,author,1963902.0,"Asenjo, A.","Asenjo, A","Asenjo, A",A.,Asenjo,matched,1963866.0,1


**full_name은 같은데 기관명이 다른 경우?**

In [35]:
# check rows where full_name is identical but organization_cleaned is different 

df[df.duplicated(subset=['full_name'], keep=False) & ~df.duplicated(subset=['organization_cleaned'], keep=False)]

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,...,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched,author_id_te_cleaned,cleaned_or_not
4348,4349,7448624.0,qc111,1,1.0,UNIV GENOA GAP,APPL PHYS GRP,GENEVA,SWITZERLAND,2001,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,matched,39814970.0,0
4349,4350,7448624.0,qc111,2,2.0,UNIV VIENNA UNIVIE,INST EXPT PHYS,VIENNA,AUSTRIA,2001,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
9814,9815,13134142.0,qc111,1,1.0,QUANTUM OPT LAB ELSAGDATAMAT,,GENOA,ITALY,2007,...,author,4835077.0,"Bovino, Fabio Antonio","Bovino, Fabio Antonio","Bovino, FA",Fabio Antonio,Bovino,matched,4835077.0,0
10083,10084,13484216.0,qc111,4,5.0,NATL CTR NANOMAT TECHNOL,,POHANG,SOUTH KOREA,2008,...,author,22198874.0,"Lee, Hu-Jong","Lee, Hu-Jong","Lee, HJ",Hu-Jong,Lee,matched,22198874.0,0
10344,10345,13913831.0,qc111,3,1.0,ACCAD NAZL LINCEI,,ROME,ITALY,2008,...,author,8843932.0,"De Martini, Francesco","De Martini, Francesco","De Martini, F",Francesco,De Martini,matched,8843932.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349436,1349437,46301158.0,qc43,2,5.0,AGR RES LTD,,TROUBSKO,CZECH REPUBLIC,2021,...,author,20134164.0,"Kintl, Antonin","Kintl, Antonin","Kintl, A",Antonin,Kintl,matched,20134164.0,0
1349443,1349444,46303705.0,qc43,4,3.0,CNR IOM IST OFFICINA MOL,,TRIESTE,ITALY,2021,...,author,33746753.0,"Romanato, Filippo","Romanato, Filippo","Romanato, F",Filippo,Romanato,matched,33746753.0,0
1349851,1349852,46379898.0,qc43,5,2.0,HKU UCAS JOINT INST THEORET & COMPUTAT PHYS HONG,,HONG KONG,PEOPLES R CHINA,2021,...,author,44231961.0,"Yao, Wang","Yao, Wang","Yao, W",Wang,Yao,matched,44231961.0,0
1350217,1350218,46501668.0,qc43,2,1.0,UV BOOSTING,,BOULOGNE BILLANCOURT,FRANCE,2021,...,author,22150607.0,"Ledermann, Loic","Ledermann, Loic","Ledermann, L",Loic,Ledermann,matched,22150607.0,0


In [36]:
df[df['full_name'] == 'Tittel, Wolfgang']

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,...,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched,author_id_te_cleaned,cleaned_or_not
4348,4349,7448624.0,qc111,1,1.0,UNIV GENOA GAP,APPL PHYS GRP,GENEVA,SWITZERLAND,2001,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,matched,39814970.0,0
13767,13768,18141708.0,qc111,1,10.0,UNIV CALGARY,INST QUANTUM INFORMAT SCI,CALGARY,CANADA,2011,...,author,39814968.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,matched,39814968.0,0
13774,13775,18141708.0,qc111,2,10.0,UNIV CALGARY,DEPT PHYS & ASTRON,CALGARY,CANADA,2011,...,author,39814968.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,matched,39814968.0,0
15657,15658,20280021.0,qc111,1,10.0,UNIV CALGARY,INST QUANTUM INFORMAT SCI,CALGARY,CANADA,2012,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,matched,39814968.0,1
15664,15665,20280021.0,qc111,2,10.0,UNIV CALGARY,DEPT PHYS & ASTRON,CALGARY,CANADA,2012,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,matched,39814968.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1210831,1210832,11236815.0,qc43,2,,UNIV NICE,PHYS MAT CONDENSEE LAB,NICE,FRANCE,2006,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,random,39814970.0,0
1210838,1210839,11236815.0,qc43,3,,UNIV GENEVA,DEPT PHYS MAT CONDENSEE,GENEVA,SWITZERLAND,2006,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,random,39814970.0,0
1210845,1210846,11236815.0,qc43,4,,UNIV CALGARY,INST QUANTUM INFORMAT SCI,CALGARY,CANADA,2006,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,random,39814968.0,1
1239939,1239940,19831747.0,qc43,3,6.0,UNIV CALGARY,INST QUANTUM INFORMAT SCI,CALGARY,CANADA,2011,...,author,39814970.0,"Tittel, Wolfgang","Tittel, Wolfgang","Tittel, W",Wolfgang,Tittel,matched,39814968.0,1


- Tittel, Wolfgang이 실제로 이직한건지 알 수가 없음
- 확인해본 결과, Tittel, Wolfgang은 UNIV GENEVA에서 박사학위를 받고, UNIV CALGARY에서 교수로 재직 중
- 즉, UNIV GENEVA와 UNIV CALGARY에서 진행한 연구는 동일인물일 가능성이 높음
- 하지만, UNIV NICE는 관련 없음 -> 동명이인

In [40]:
sample = df[df['full_name'] == 'Weihs, Gregor']
sample

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,...,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched,author_id_te_cleaned,cleaned_or_not
4329,4330,7200573.0,qc111,1,2.0,UNIV VIENNA,INST EXPT PHYS,VIENNA,AUSTRIA,2003,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
4349,4350,7448624.0,qc111,2,2.0,UNIV VIENNA UNIVIE,INST EXPT PHYS,VIENNA,AUSTRIA,2001,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
11276,11277,15274405.0,qc111,1,4.0,UNIV WATERLOO,INST QUANTUM COMP,WATERLOO,CANADA,2009,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
11280,11281,15274405.0,qc111,2,4.0,UNIV WATERLOO,DEPT PHYS,WATERLOO,CANADA,2009,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
11282,11283,15274405.0,qc111,4,4.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2009,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140798,1140799,21762254.0,qc423,3,3.0,UNIV WATERLOO,DEPT PHYS & ASTRON,WATERLOO,CANADA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
1185502,1185503,44846533.0,qc423,1,6.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2021,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
1247914,1247915,22611039.0,qc43,1,6.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2013,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0
1346455,1346456,45608699.0,qc43,2,7.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2021,...,author,42547602.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,0


In [42]:
# show rows where author_id_te_cleaned is not 42547602.0
sample[sample['author_id_te'] != 42547602.0]

Unnamed: 0.1,Unnamed: 0,pubid,qc_category,addr_num,SEQ_NO,organization_cleaned,suborganization,city,country,pubyear,...,role,author_id_te,display_name,full_name,wos_standard,first_name,last_name,matched,author_id_te_cleaned,cleaned_or_not
16902,16903,21762254.0,qc111,1,3.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
16903,16904,21762254.0,qc111,2,3.0,UNIV WATERLOO,INST QUANTUM COMP,WATERLOO,CANADA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
16904,16905,21762254.0,qc111,3,3.0,UNIV WATERLOO,DEPT PHYS & ASTRON,WATERLOO,CANADA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
26413,26414,34524709.0,qc111,3,6.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2017,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
798592,798593,21762254.0,qc41,1,3.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
798593,798594,21762254.0,qc41,2,3.0,UNIV WATERLOO,INST QUANTUM COMP,WATERLOO,CANADA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
798594,798595,21762254.0,qc41,3,3.0,UNIV WATERLOO,DEPT PHYS & ASTRON,WATERLOO,CANADA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
1140796,1140797,21762254.0,qc423,1,3.0,UNIV INNSBRUCK,INST EXPT PHYS,INNSBRUCK,AUSTRIA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
1140797,1140798,21762254.0,qc423,2,3.0,UNIV WATERLOO,INST QUANTUM COMP,WATERLOO,CANADA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1
1140798,1140799,21762254.0,qc423,3,3.0,UNIV WATERLOO,DEPT PHYS & ASTRON,WATERLOO,CANADA,2012,...,author,42547652.0,"Weihs, Gregor","Weihs, Gregor","Weihs, G",Gregor,Weihs,matched,42547602.0,1


- 소속기관이 바뀌어도 author_id_te는 대부분 동일한 경우
- Weihs, Gregor는 캐나다, 도쿄, 미국에서 교수 재직함 
- 현재는 오스트리아에서 교수 재직 중
- 위 경우, 모두 같은 author_id_te_cleaned로 정제됨 

In [44]:
df.to_csv('quant_author_ed_eu_val_cleaned.csv', index=False)