In [5]:
import csv
import zipfile
import os
import pandas as pd
import glob

In [6]:
def extract_files(zip_file, out_dir, delete_zip=False):
    """
    Extract the ``*CLNF_AUs.txt`` members of a zip archive into a
    ``CLNF_AUs`` folder inside a user specified directory.

    Parameters
    ----------
    zip_file : filepath
        path to a single zip archive (e.g. one DAIC-WOZ subject archive)
    out_dir : filepath
        path to the directory under which the ``CLNF_AUs`` folder is
        created (if it does not exist yet)
    delete_zip : bool
        If true, deletes the zip file once the relevant files are extracted

    Returns
    -------
    None
        The matching members are written to ``<out_dir>/CLNF_AUs``; any
        directory prefix a member has inside the archive is preserved.
    """
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent
    txt_dir = os.path.join(out_dir, 'CLNF_AUs')
    os.makedirs(txt_dir, exist_ok=True)

    # the context manager closes the archive even when extraction fails, and
    # guarantees it is closed before the optional os.remove below
    with zipfile.ZipFile(zip_file) as zip_ref:
        for member in zip_ref.namelist():  # iterate through files in zip file
            if member.endswith('CLNF_AUs.txt'):
                zip_ref.extract(member, txt_dir)

    if delete_zip:
        os.remove(zip_file)




In [30]:
# folder that holds the DAIC-WOZ zip archives
dir_name = './Subjects'

# root folder; extract_files creates its 'CLNF_AUs' sub-folder in here
out_dir = './SubjectsExtract/'

# keep the archives on disk after extraction
delete_zip = False

# walk dir_name and extract the CLNF_AUs text file(s) from every zip archive
for file in os.listdir(dir_name):
    if not file.endswith('.zip'):
        continue  # anything that is not a zip archive is ignored
    zip_file = os.path.join(dir_name, file)
    extract_files(zip_file, out_dir, delete_zip=delete_zip)

In [31]:
# directory containing the extracted *CLNF_AUs.txt files
in_dir = './SubjectsExtract/CLNF_AUs/'

# directory where the converted .csv files will be written
out_dir = './SubjectsExtract/CSV_CLNF_AUs'
os.makedirs(out_dir, exist_ok=True)  # idempotent, no check-then-create race

txt_files = os.path.join(in_dir, '*.txt')

for txt_file in glob.glob(txt_files):
    with open(txt_file, "r") as input_file:
        # lazily strip surrounding whitespace from each line, then split it
        # on commas; whitespace around individual fields is kept as-is
        stripped = (line.strip() for line in input_file)
        lines = (line.split(",") for line in stripped)
        # same base name as the input file, with a .csv extension
        filename = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
        # newline='' is required for csv.writer: without it the writer's
        # '\r\n' row terminator becomes '\r\r\n' on Windows (blank rows)
        with open(os.path.join(out_dir, filename), 'w', newline='') as output_file:
            out_csv = csv.writer(output_file)
            out_csv.writerows(lines)