Source: https://asd.talkbank.org/access/English/Eigsti.html

Associated dataset paper link: https://asd.talkbank.org/access/0docs/Eigsti2007.pdf

- The files consist of transcribed 30-minute free play sessions of children ages 3-6 years with ASD, non-ASD developmental delay, and typical development (n=16 per group; 48 files in total).
- The first 100 utterances for each participant were included. 

In [None]:
import pylangacq as pla
import pandas as pd
import os

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Relative path to the zipped Rollins corpus.
url = '../../../data/talkbank/Rollins.zip'
# Parse the archive with pylangacq; `rollins` is the reader object used by the cells below.
rollins = pla.read_chat(url)
rollins.n_files() #how many chat files are present for this data

# Extract Raw Data

In [None]:
# Base names (without extension) of the CHAT files; used later to name the CSV files.
# A path has the format eigsti/101.cha (or eigsti\101.cha on Windows). The previous
# code split on '\\' and took component [1], which raised IndexError for
# '/'-separated paths. Normalise both separators, keep the last path component
# ('101.cha'), then keep the part before the first '.' ('101').
file_names = [
    file_path.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
    for file_path in rollins.file_paths()
]

In [None]:
# Preview the first five extracted base names.
file_names[:5]

An utterance is a line of a conversation together with its associated metadata, such as tokens, timestamps, and grammatical info. Calling the `utterances(by_files=True)` method returns a list of lists of the form **(number_of_chat_files, number_of_utterance_objects_in_that_file)**.

In [None]:
# Utterance objects grouped per CHAT file (by_files=True): utterrances[i] holds the utterances of file i.
utterrances = rollins.utterances( by_files=True)

As an example, we take the utterances of the first chat file (index 0). Each utterance has a **participant** attribute indicating who is speaking and a dictionary named **tiers** that holds the original text and some additional grammatical info. We can thus extract the original text from **tiers** using the participant name as the key.

In [None]:
# Print every utterance of the first file as "<participant> : <text>".
first_file_utterrances = utterrances[0]
for utterance in first_file_utterrances:
    speaker = utterance.participant
    # The main tier of an utterance is stored under the speaker's own code.
    text = utterance.tiers[speaker]
    print(f'{speaker} : {text}')

We will save each conversation to a CSV file named after its chat file. So, for chat file **1010.cha** we will have **1010.csv**. The CSV columns will be:
- participant
- sentence

In [None]:
# Write one CSV per CHAT file, with one row per utterance.
column_names = ['participant', 'sentence']
# NOTE(review): the utterances in this notebook are loaded from Rollins.zip but
# are written under 'eigsti/' -- confirm this is the intended folder.
save_dir = 'eigsti/'
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(save_dir, exist_ok=True)
chat_file_index = 0
for chat_file in utterrances:
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [(utter.participant, utter.tiers[utter.participant]) for utter in chat_file]
    chat_df = pd.DataFrame(rows, columns=column_names)
    file_name = file_names[chat_file_index] + '.csv'  # CSV name mirrors the CHAT file name
    chat_file_index += 1
    chat_df.to_csv(os.path.join(save_dir, file_name), index=False)
    print(f'{chat_file_index}: {file_name}')

In [None]:
def save_chat_to_csv(utterances, file_names, save_dir='eigsti/', column_names=None):
    """Write one CSV per chat file, with one row per utterance.

    Args:
        utterances: Iterable of per-file utterance sequences. Each utterance
            must expose ``participant`` and a ``tiers`` mapping keyed by
            participant.
        file_names: Base names (no extension) for the output files;
            ``file_names[i]`` names the CSV for the i-th entry of
            ``utterances``.
        save_dir: Output directory; created if it does not exist.
        column_names: Names of the two CSV columns (speaker, text). Defaults
            to ``['participant', 'sentence']``.

    Prints ``"<n>: <name>.csv"`` after each file is written (n is 1-based).
    """
    # None sentinel instead of a mutable list default.
    if column_names is None:
        column_names = ['participant', 'sentence']
    # DataFrame.to_csv does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)

    for chat_file_index, chat_file in enumerate(utterances):
        # Build the frame in one go: DataFrame.append was removed in
        # pandas 2.0 and re-copied the frame for every appended row.
        rows = [(utter.participant, utter.tiers[utter.participant]) for utter in chat_file]
        chat_df = pd.DataFrame(rows, columns=list(column_names))
        file_name = file_names[chat_file_index] + '.csv'
        chat_df.to_csv(os.path.join(save_dir, file_name), index=False)
        print(f'{chat_file_index + 1}: {file_name}')

In [None]:
# Relative path to the zipped Flusberg corpus.
url = '../../../data/talkbank/Flusberg.zip'
# Parse the archive with pylangacq; `flusberg` is the reader object used by the cells below.
flusberg = pla.read_chat(url)
flusberg.n_files() #how many chat files are present for this data

In [None]:
# Base names (without extension) of the Flusberg CHAT files; used later to name the CSV files.
# The previous code split on '\\' and took component [1], which raised
# IndexError for '/'-separated paths. Normalise both separators, keep the last
# path component (e.g. '101.cha'), then keep the part before the first '.' ('101').
file_names2 = [
    file_path.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
    for file_path in flusberg.file_paths()
]

In [None]:
# Preview the first five extracted Flusberg base names.
file_names2[:5]

In [None]:
# Flusberg utterance objects grouped per CHAT file (by_files=True).
utterrances2 = flusberg.utterances( by_files=True)

In [None]:
# Print every utterance of the first Flusberg file as "<participant> : <text>".
first_file_utterrances2 = utterrances2[0]
for utterance in first_file_utterrances2:
    speaker = utterance.participant
    # The main tier of an utterance is stored under the speaker's own code.
    text = utterance.tiers[speaker]
    print(f'{speaker} : {text}')

In [None]:
# Write one CSV per Flusberg CHAT file, with one row per utterance.
column_names = ['participant', 'sentence']
save_dir = 'flusberg/'
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(save_dir, exist_ok=True)
chat_file_index = 0
for chat_file in utterrances2:
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [(utter.participant, utter.tiers[utter.participant]) for utter in chat_file]
    chat_df = pd.DataFrame(rows, columns=column_names)
    # Use the Flusberg names (file_names2). The previous code indexed
    # file_names, i.e. the names extracted from the Rollins corpus, so the
    # CSVs were written under the wrong names.
    file_name = file_names2[chat_file_index] + '.csv'
    chat_file_index += 1
    chat_df.to_csv(os.path.join(save_dir, file_name), index=False)
    print(f'{chat_file_index}: {file_name}')

In [None]:
# Display the header information pylangacq returns for the Rollins reader.
rollins.headers()

In [None]:
# No earlier cell assigns `eigsti`, so a top-to-bottom run raised NameError here
# and in every later `eigsti.*` cell. Load the Eigsti corpus explicitly.
eigsti = pla.read_chat('../../../data/talkbank/Eigsti.zip')
print(type(eigsti))

In [None]:
# Ages reported by the reader, requested in months (months=True).
eigsti.ages(months=True)

In [None]:
# Words of the whole corpus as a single collection.
words = eigsti.words()
# The same words, grouped per CHAT file (by_files=True).
words_by_files = eigsti.words(by_files=True)

In [None]:
# Total number of words across the corpus.
len(words)

In [None]:
# Report the word count of each file, one number per line.
for per_file_words in words_by_files:
    word_count = len(per_file_words)
    print(word_count)

In [None]:
# Preview the first eight words.
words[:8]

In [None]:
# Inspect a single utterance object (index 5) from the flat, corpus-wide list.
eigsti.utterances()[5]

In [None]:
# Show the reader's summary information with verbose output enabled.
eigsti.info(verbose=True)

In [None]:
# Word frequency counts over the whole corpus.
word_freq = eigsti.word_frequencies()

In [None]:
# The 15 most frequent words with their counts.
word_freq.most_common(15)

In [None]:
# MLU (mean length of utterance) as computed by pylangacq -- see its docs for the exact unit.
eigsti.mlu()

In [None]:
# Display the header information pylangacq returns for the Eigsti reader.
eigsti.headers()

In [None]:
def save_zip_to_csv(url, folder_name):
    """Convert every CHAT file of a zipped corpus into a two-column CSV.

    Args:
        url: Location of the ``.zip`` archive, passed unchanged to
            ``pla.read_chat``.
        folder_name: Output directory; created if it does not exist.

    Each CHAT file yields ``<folder_name>/<base name>.csv`` with the columns
    ``participant`` and ``sentence`` (one row per utterance). After each file
    is written, ``"<n>: <base name>.csv"`` is printed (n is 1-based).
    """
    chat = pla.read_chat(url)

    # Last path component without its extension, e.g. 'eigsti/101.cha' -> '101'.
    # Both separators are normalised: the previous split('\\')[1] raised
    # IndexError on '/'-separated paths.
    file_names = [
        file_path.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
        for file_path in chat.file_paths()
    ]

    utterances_by_file = chat.utterances(by_files=True)
    column_names = ['participant', 'sentence']
    save_dir = folder_name

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)

    for chat_file_index, chat_file in enumerate(utterances_by_file):
        # Build the frame in one go: DataFrame.append was removed in
        # pandas 2.0 and re-copied the frame for every appended row.
        rows = [(utter.participant, utter.tiers[utter.participant]) for utter in chat_file]
        chat_df = pd.DataFrame(rows, columns=column_names)
        file_name = file_names[chat_file_index] + '.csv'
        chat_df.to_csv(os.path.join(save_dir, file_name), index=False)
        print(f'{chat_file_index + 1}: {file_name}')


In [None]:
# Export the Eigsti corpus: one CSV per CHAT file, written to the 'eigsti' folder.
save_zip_to_csv('../../../data/talkbank/Eigsti.zip', 'eigsti')

In [None]:
# Export the Nadig corpus: one CSV per CHAT file, written to the 'nadig' folder.
save_zip_to_csv('../../../data/talkbank/Nadig.zip', 'nadig')

In [None]:
# Export the QuigleyMcNally corpus: one CSV per CHAT file, written to the 'quigley_mcNally' folder.
save_zip_to_csv('../../../data/talkbank/QuigleyMcNally.zip', 'quigley_mcNally')

In [None]:
# Export the Flusberg corpus: one CSV per CHAT file. The folder was misspelled
# 'flusbeg'; use 'flusberg' to match the archive name and the 'flusberg/'
# folder used by the manual export earlier in this notebook.
save_zip_to_csv('../../../data/talkbank/Flusberg.zip', 'flusberg')

In [None]:
# Export the Rollins corpus: one CSV per CHAT file, written to the 'rollins' folder.
save_zip_to_csv('../../../data/talkbank/Rollins.zip', 'rollins')