This notebook generates the table for the thesis draft showing the distribution of Pitt Corpus participants by age group and gender, for the dementia (AD) and control (HC) groups.

In [1]:
# standard libraries
import os
import sys

# third-party libraries
import pandas as pd

# local module
sys.path.append(os.path.abspath(".."))
from project_config import from_root

In [2]:
def count_cha_files(directory):
    """Count the files whose name ends in '.cha' anywhere under ``directory``.

    The search is recursive (``os.walk``). With ``os.walk``'s default error
    handling, a directory that cannot be listed contributes nothing, so a
    missing directory yields 0 rather than an exception.

    Parameters
    ----------
    directory : str or path-like
        Root folder to scan.

    Returns
    -------
    int
        Number of matching files.
    """
    return sum(
        name.endswith('.cha')
        for _, _, names in os.walk(directory)
        for name in names
    )


# Control group ("Pitt/Control/cookie")
path = from_root("Pitt", "Control", "cookie")
cha_files = count_cha_files(path)
print(f'Total number of .cha files: {cha_files}')


# Dementia group ("Pitt/Dementia/cookie")
path_D = from_root("Pitt", "Dementia", "cookie")
cha_files_D = count_cha_files(path_D)

print(f'Total number of .cha files: {cha_files_D}')

Total number of .cha files: 243
Total number of .cha files: 306


In [3]:
def _sort_by_recording_id(df):
    """Return ``df`` ordered numerically by the two hyphen-separated parts of ``ID``.

    Both parts are converted to integers before sorting, so the order is
    numeric rather than lexicographic. The temporary key columns are removed
    again; the original index labels are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``ID`` of the form ``"<part1>-<part2>"``.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with the same columns as ``df``.
    """
    id_parts = df['ID'].str.split('-', expand=True).astype(int)
    # Fails loudly if the IDs do not split into exactly two parts.
    id_parts.columns = ['ID_part1', 'ID_part2']
    return (
        df.join(id_parts)
        .sort_values(by=['ID_part1', 'ID_part2'])
        .drop(columns=['ID_part1', 'ID_part2'])
    )


# Load the per-recording tables (one row per file) for both groups.
pitt_ad = pd.read_csv(from_root("data", "pitt_ad.tsv"), sep='\t')
pitt_control = pd.read_csv(from_root("data", "pitt_control.tsv"), sep='\t')

print(f'Total number of files in pitt_ad: {len(pitt_ad)}')
print(f'Total number of files in pitt_control: {len(pitt_control)}')
print(pitt_ad.columns.tolist())


# Order the recordings by ID; later cells that keep the first row per
# participant depend on this order.
pitt_ad = _sort_by_recording_id(pitt_ad)
pitt_control = _sort_by_recording_id(pitt_control)

print(pitt_ad[['ID']].head())
print(pitt_control[['ID']].head())


# The participant identifier is the part of the ID before the hyphen
# (kept as a string, so leading zeros are preserved).
pitt_ad['participant'] = pitt_ad['ID'].str.split('-').str[0]
pitt_control['participant'] = pitt_control['ID'].str.split('-').str[0]

unique_ad_count = pitt_ad['participant'].nunique()
unique_control_count = pitt_control['participant'].nunique()

print(f'Unique participants in pitt_ad: {unique_ad_count}')
print(f'Unique participants in pitt_control: {unique_control_count}')


Total number of files in pitt_ad: 306
Total number of files in pitt_control: 243
['ID', 'dataset', 'label', 'gender', 'age', 'mmse', 'transcription', 'disfluencies', 'pause_count', 'utterance_count']
       ID
1   001-0
32  001-2
30  003-0
58  005-0
38  005-2
        ID
70   002-0
64   002-1
83   002-2
90   002-3
106  006-2
Unique participants in pitt_ad: 193
Unique participants in pitt_control: 99


In [4]:
def _first_row_per_participant(df, bins, labels):
    """Keep one row per participant and attach its age bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``participant`` and ``age`` columns. The first row found
        for each participant is kept, so the row order of ``df`` decides
        which recording's age is used.
    bins : list of int
        Bin edges passed to ``pandas.cut``; intervals are left-closed.
    labels : list of str
        One label per interval.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the de-duplicated rows with an extra ``age_bin``
        column. Ages that are missing or outside the edges get NaN there.
    """
    unique = df.drop_duplicates(subset='participant').copy()
    unique['age_bin'] = pd.cut(unique['age'], bins=bins, labels=labels, right=False)
    return unique


# Participant identifier = part of the recording ID before the hyphen.
# Derived here so this cell does not rely on the column already existing.
pitt_ad['participant'] = pitt_ad['ID'].str.split('-').str[0]
pitt_control['participant'] = pitt_control['ID'].str.split('-').str[0]

# Five-year, left-closed age intervals. The labels are generated from the
# edges so the two can never get out of sync.
bins = [45, 50, 55, 60, 65, 70, 75, 80, 85, 90]
labels = [f'[{lower}, {upper})' for lower, upper in zip(bins[:-1], bins[1:])]

pitt_ad_unique = _first_row_per_participant(pitt_ad, bins, labels)
pitt_control_unique = _first_row_per_participant(pitt_control, bins, labels)

# Rows without an age bin are dropped by the later group-by, so report them
# instead of letting participants vanish from the table unnoticed.
for _group_name, _group in (('pitt_ad', pitt_ad_unique),
                            ('pitt_control', pitt_control_unique)):
    _n_unbinned = int(_group['age_bin'].isna().sum())
    if _n_unbinned:
        print(f'Warning: {_n_unbinned} participant(s) in {_group_name} have a missing age '
              f'or an age outside [{bins[0]}, {bins[-1]}) and will not be counted')

def get_counts(df):
    """Tabulate the number of rows per age bin (rows) and gender (columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``age_bin`` and ``gender`` columns.

    Returns
    -------
    pandas.DataFrame
        Counts indexed by ``age_bin`` with one column per ``gender`` value.
        Only combinations that occur in ``df`` are grouped; a missing
        bin/gender pair inside the resulting grid is filled with 0.
    """
    grouped = df.groupby(['age_bin', 'gender'], observed=True)
    pair_counts = grouped.size()
    # Pivot the inner index level (gender) into columns.
    return pair_counts.unstack(level=-1, fill_value=0)

# Per-group count tables; the prefix marks which group each gender column
# belongs to once the tables sit side by side.
ad_table = get_counts(pitt_ad_unique).add_prefix('AD_')
hc_table = get_counts(pitt_control_unique).add_prefix('HC_')

# Align the two tables on age bin. A bin present in only one group shows up
# as NaN for the other, which is a count of zero.
side_by_side = pd.concat([ad_table, hc_table], axis=1)
final_table = side_by_side.fillna(0).astype(int)

# Append the column sums as a last row labelled 'Total'.
total_row = final_table.sum().rename('Total')
final_table = pd.concat([final_table, total_row.to_frame().T])

print(final_table)

gender    AD_f  AD_m  HC_f  HC_m
[45, 50)     1     0     3     0
[50, 55)     1     4     4     5
[55, 60)     8     6    13     6
[60, 65)    13    10     8     8
[65, 70)    24    12    15    10
[70, 75)    26    12    11     8
[75, 80)    29    19     4     3
[80, 85)    14     4     0     1
[85, 90)    10     0     0     0
Total      126    67    58    41
