In [17]:
import pandas as pd

# ID lookup table: one row per participant, read with the default header row
id_table = pd.read_excel("./data/id_table.xlsx")

# Demographics workbook: the first spreadsheet row is skipped, and the row after it
# supplies the column names
demographics = pd.read_excel("./data/demographics.xlsx", header=0, skiprows=1)

# Questionnaire workbook: kept open as an ExcelFile so individual sheets can be parsed later
scores_xls = pd.ExcelFile("./data/scores.xlsx")

# Define the function to generate fmri codes
def generate_fmri_code_demo(row):
    """Build the fMRI code for one demographics row.

    The code is a one-letter group prefix ('s' when the '참여집단' value is
    exactly 'EXP', otherwise 'c') followed by the '내부 부여 번호' value
    converted to an integer and zero-padded to at least four digits.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            '참여집단' and '내부 부여 번호'.

    Returns:
        str: The fMRI code, e.g. 's0007' or 'c0549'.
    """
    if row['참여집단'] == 'EXP':
        group_letter = 's'
    else:
        group_letter = 'c'
    subject_number = int(row['내부 부여 번호'])
    return group_letter + format(subject_number, '04d')


def generate_fmri_code_id_table(row):
    """Build the fMRI code for one row of the ID lookup table.

    The prefix is 's' when the 'Enrollment #' string starts with 'EXP' and
    'c' otherwise; the numeric part is 'Subject #' converted to an integer
    and zero-padded to at least four digits.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'Enrollment #' (a string) and 'Subject #'.

    Returns:
        str: The fMRI code, e.g. 's0007' or 'c0549'.
    """
    is_experimental = row['Enrollment #'].startswith('EXP')
    group_letter = 's' if is_experimental else 'c'
    return group_letter + format(int(row['Subject #']), '04d')


# Derive the shared join key ('fmri_code') on both tables, one row at a time
demographics['fmri_code'] = demographics.apply(generate_fmri_code_demo, axis=1)
id_table['fmri_code'] = id_table.apply(generate_fmri_code_id_table, axis=1)

# Left-merge on fmri_code: every demographics row is kept, and the id_table columns
# are attached wherever the code matches
merged_df = pd.merge(demographics, id_table, on=['fmri_code'], how='left')

# Drop rows where fmri_code is NaN.
# .copy() makes filtered_df an independent frame, so the column assignment below does
# not write into a filtered slice of merged_df (which raises SettingWithCopyWarning).
# NOTE(review): generate_fmri_code_demo always returns a string, so fmri_code is never
# NaN after a left merge and this filter removes nothing. Demographics rows with no
# id_table match carry NaN in the id_table columns instead — confirm whether the
# intended filter is on 'HAID ID'.
filtered_df = merged_df.dropna(subset=['fmri_code']).copy()

# Convert 'HAID ID' to string for consistent merging
# (astype(str) also turns a missing HAID ID into the literal text 'nan')
filtered_df['HAID ID'] = filtered_df['HAID ID'].astype(str)

# Parse one DataFrame per questionnaire worksheet of scores.xlsx.
# The trailing comment on each line lists the score columns taken from that sheet later on.
STAI_X_1 = scores_xls.parse(sheet_name="STAI_X_1")  # STAI-X-1

STAI_X_2 = scores_xls.parse(sheet_name="STAI_X_2")  # STAI-X-2

HADS = scores_xls.parse(sheet_name="HADS")  # HADS_anxiety, HADS_depression

SWLS = scores_xls.parse(sheet_name="SWLS")  # SWLS

GAD_7 = scores_xls.parse(sheet_name="GAD_7")  # GAD_7

PDSS = scores_xls.parse(sheet_name="PDSS")  # PDSS

LSAS = scores_xls.parse(sheet_name="LSAS")  # performance_lsas, social_interaction_lsas, lsas

MOCI = scores_xls.parse(sheet_name="MOCI")  # MOCI, checking, cleaning, doubting, slowness

BFNE = scores_xls.parse(sheet_name="BFNE")  # BFNE

PSWQ = scores_xls.parse(sheet_name="PSWQ")  # PSWQ

FCV_19S = scores_xls.parse(sheet_name="FCV_19S")  # FCV_19S

HANDEDNESS = scores_xls.parse(sheet_name="HANDEDNESS_수정")  # Handedness(true)

# Collect the distinct HAID IDs of the ID lookup table, compared as text
lookup_unique_ids = set(id_table['HAID ID'].astype(str))
print(f"Unique ID count in id_lookup_table: {len(lookup_unique_ids)}")

# Function to get unique ids from a sheet
def get_unique_ids(sheet):
    """Return the distinct values of the sheet's 'id' column as a set of strings.

    Args:
        sheet: DataFrame that has an 'id' column.

    Returns:
        set[str]: Every distinct 'id' value after conversion to ``str``.
    """
    ids_as_text = sheet['id'].astype(str)
    return set(ids_as_text)

# (label, DataFrame) pairs to check. Labels are score names, so a worksheet that
# provides several scores (HADS, LSAS, MOCI) is listed once per score.
score_sheets = [
    ('STAI_X_1', STAI_X_1),
    ('STAI_X_2', STAI_X_2),
    ('HADS_anxiety', HADS), ('HADS_depression', HADS),
    ('SWLS', SWLS),
    ('GAD_7', GAD_7),
    ('PDSS', PDSS),
    ('performance_lsas', LSAS), ('social_interaction_lsas', LSAS), ('lsas', LSAS),
    ('MOCI', MOCI), ('checking', MOCI), ('cleaning', MOCI), ('doubting', MOCI), ('slowness', MOCI),
    ('BFNE', BFNE),
    ('PSWQ', PSWQ),
    ('Handedness(true)', HANDEDNESS),
    ('FCV_19S', FCV_19S),
]

# For every entry, report how many distinct ids the sheet holds and how many of them
# also appear in the ID lookup table
for sheet_name, sheet in score_sheets:
    sheet_unique_ids = get_unique_ids(sheet)
    common_ids = sheet_unique_ids & lookup_unique_ids
    print(f"\nSheet: {sheet_name}")
    print(f"Unique ID count in sheet: {len(sheet_unique_ids)}")
    print(f"Common ID count with id_lookup_table: {len(common_ids)}")


Unique ID count in id_lookup_table: 167

Sheet: STAI_X_1
Unique ID count in sheet: 160
Common ID count with id_lookup_table: 159

Sheet: STAI_X_2
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: HADS_anxiety
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: HADS_depression
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: SWLS
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: GAD_7
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: PDSS
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: performance_lsas
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: social_interaction_lsas
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: lsas
Unique ID count in sheet: 159
Common ID count with id_lookup_table: 159

Sheet: MOCI
Unique ID count in sheet: 159
Common I

In [18]:
# Show the demographics column labels (includes the derived 'fmri_code' column)
demographics.columns


Index(['참여집단', '내부 부여 번호', '병록번호', '이름', '성별', '만 나이', '학력', '교육 기간', '직업',
       '자신', '부모', '정신질환\n가족/친척 유무', '참가자와 관계', '정신과입원유무', '질환명', '현재 질병 유무',
       '병명', '과거 뇌외상/\n뇌질환', '정신질환 유무', '병명 ', '입원횟수', '첫 정신과적 입원\n(몇년 전)',
       '첫 정신과적 입원\n(입원 년도)', '발병시기\n(몇년 전)', '발병시기\n(입원 년도)', '항정신병 약물',
       '향정신병약물/ \n용량(mg/tab)', '용량\n(tab)', '현 용량 시작일', '기타 약물\n(약품명/용량)',
       '총 약물 치료기간', 'fmri_code'],
      dtype='object')

In [19]:
# Convert 'HAID ID' to string for consistent merging.
# .assign returns a new frame with the column replaced in its original position, so
# nothing is written into filtered_df itself — it was produced by filtering another
# frame, and assigning a column to it directly can raise SettingWithCopyWarning.
# Re-running this cell is harmless: the values are already strings.
filtered_df = filtered_df.assign(**{'HAID ID': filtered_df['HAID ID'].astype(str)})

In [21]:
# Assuming filtered_df is already defined and the dataframes for each sheet are already loaded

# Define a function to extract scores
def extract_scores(df, columns, id):
    """Return a participant's round-1 value from one score sheet.

    IDs are compared as text, which matches how the rest of the notebook
    normalises them (``astype(str)``). A sheet whose 'id' column was parsed
    as numbers therefore still matches a string ID. Rows with a missing 'id'
    never match, so a missing ID is not mistaken for the text 'nan'.

    Args:
        df: Score sheet with 'id' and 'round' columns plus the score column(s).
        columns: Column name, or list of column names, to read. When several
            are given, the value of the first listed column is returned.
        id: Participant ID to look up (compared via ``str(id)``).

    Returns:
        The value from the first row with a matching ID and ``round == 1``,
        or the string "n/a" when there is no such row.
    """
    # Exclude missing ids before the text comparison: astype(str) would otherwise
    # render them as 'nan' / 'None' and let them match an ID with that spelling
    id_matches = df['id'].notna() & (df['id'].astype(str) == str(id))
    is_first_round = df['round'] == 1
    matched = df.loc[id_matches & is_first_round, columns]
    if len(matched) == 0:
        return "n/a"
    # ravel() flattens row by row, so element 0 is the first row's first column.
    # It also handles the 1-D result produced when `columns` is a single name.
    return matched.to_numpy().ravel()[0]

# List collecting one record (dict) per participant
extracted_scores = []

# Output key -> source column in filtered_df for the demographic / bookkeeping fields
demo_fields = {
    'GROUP': '참여집단',
    'Exp No.': '내부 부여 번호',
    '1. SEX': '성별',
    '2.AGE': '만 나이',
    '3-2. YR_EDU': '교육 기간',
    'fmri_code': 'fmri_code',
    'Screening #': 'Screening #',
    'Enrollment #': 'Enrollment #',
}

# Output key -> (score sheet, column in that sheet) for every questionnaire score.
# Column names are kept verbatim, including odd spellings such as 'BFNE(ture)' and
# 'FCV(ttrue)' — presumably they mirror the worksheet headers; verify against scores.xlsx
score_fields = {
    'STAI-X-1': (STAI_X_1, 'STAI-X-1'),
    'STAI-X-2': (STAI_X_2, 'STAI-X-2(true)'),
    'HADS_anxiety': (HADS, 'HADS_anxiety'),
    'HADS_depression': (HADS, 'HADS_depression'),
    'SWLS': (SWLS, 'SWLS(true)'),
    'GAD-7': (GAD_7, 'GAD-7(true변환)'),
    'PDSS': (PDSS, 'PDSS'),
    'LSAS_performance': (LSAS, 'performance_lsas'),
    'LSAS_social_interaction': (LSAS, 'social_interaction_lsas'),
    'LSAS': (LSAS, 'lsas'),
    'MOCI': (MOCI, 'MOCI'),
    'MOCI_checking': (MOCI, 'checking'),
    'MOCI_cleaning': (MOCI, 'cleaning'),
    'MOCI_doubting': (MOCI, 'doubting'),
    'MOCI_slowness': (MOCI, 'slowness'),
    'BFNE': (BFNE, 'BFNE(ture)'),
    'PSWQ': (PSWQ, 'PSWQ(true)'),
    'Handedness(true)': (HANDEDNESS, 'Handedness(true)'),
    'FCV-19S': (FCV_19S, 'FCV(ttrue)'),
}

# Loop through each unique HAID ID and build its record.
# The loop variable is named haid_id rather than id, so the builtin id() is not
# shadowed for the rest of the kernel session.
for haid_id in filtered_df['HAID ID'].unique():
    target_filtered_df = filtered_df[filtered_df['HAID ID'] == haid_id]

    score_dict = {'HAID ID': haid_id}
    # Demographic fields come from the first row that carries this HAID ID
    for out_key, src_col in demo_fields.items():
        score_dict[out_key] = target_filtered_df[src_col].values[0]
    # Questionnaire scores: the sheet's round-1 value, or "n/a" when it has none
    for out_key, (score_sheet, score_col) in score_fields.items():
        score_dict[out_key] = extract_scores(score_sheet, [score_col], haid_id)

    extracted_scores.append(score_dict)

In [24]:
# Turn the per-participant records into a table: one row per record, one column per key
extracted_scores_df = pd.DataFrame(data=extracted_scores)

In [25]:
# Display the assembled participant table (rich notebook output)
extracted_scores_df

Unnamed: 0,HAID ID,GROUP,Exp No.,1. SEX,2.AGE,3-2. YR_EDU,fmri_code,Screening #,Enrollment #,STAI-X-1,...,LSAS,MOCI,MOCI_checking,MOCI_cleaning,MOCI_doubting,MOCI_slowness,BFNE,PSWQ,Handedness(true),FCV-19S
0,0827jck,EXP,7,2,20,12.0,s0007,1.0,EXP-0001,56.0,...,98,19,4,8,5,4,56,58,48,7
1,na0840,EXP,5,2,20,12.0,s0005,2.0,EXP-0002,67.0,...,80,15,5,6,3,2,57,62,40,18
2,spwls915,EXP,4,2,21,15.0,s0004,3.0,EXP-0003,65.0,...,95,18,6,10,1,5,59,60,40,5
3,wkddbswns,HC,9,1,25,15.0,c0009,4.0,HC-0001,35.0,...,18,24,7,11,3,7,35,47,47,2
4,bje5409,EXP,12,2,30,12.0,s0012,5.0,EXP-0004,55.0,...,57,26,8,11,4,7,51,50,47,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,kimhg2004,EXP,574,2,19,12.0,s0574,160.0,EXP-0086,42.0,...,65,8,4,0,3,2,47,38,36,5
160,miss20c,EXP,562,2,39,20.0,s0562,161.0,EXP-0087,58.0,...,35,13,5,2,6,1,44,51,44,4
161,goun4238,EXP,583,1,19,12.0,s0583,162.0,EXP-0088,70.0,...,105,20,8,2,6,6,53,67,48,6
162,yyyyjg,HC,549,2,35,,c0549,167.0,HC-0076,34.0,...,24,4,1,0,2,1,41,45,46,0


In [27]:
# Write the combined demographic + clinical table to CSV.
# NOTE(review): the default index=True is kept, so the row index is written as an
# unnamed first column — confirm that downstream readers expect it.
extracted_scores_df.to_csv(path_or_buf="./data/participant_demo_clinical_all.csv")