In [17]:
#This notebook converts the ATC0's transcriptions into a modernized format similar to that of the NIH corpus

#This file was originally stored in ra/wdolan/atc0-sandbox, the 'filename'
#field may need to be updated for different file hierarchy if used elsewhere

#proposed csv format:
#location, year, month, day, file start time, speaker, recipient, start, end, transcription, comments, file, transcriber

import re
import pandas as pd
from os.path import exists
pd.options.mode.chained_assignment = None

from io import StringIO

In [18]:
# pandas treats every newline as the start of a new row, which we do not want here.
# Workaround: read the file as one string, remove all newline characters, and then re-add them only where we want a row break.

def readFile(filename):
    """Read one transcript file and group its text one utterance per row.

    Parameters
    ----------
    filename : str
        Path of the transcript text file to read.

    Returns
    -------
    pandas.DataFrame
        A frame with a single column, 'Grouped Strings', holding one
        non-empty text group per row. Groups that are not utterances
        (for example header text) are still present.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(filename) as transcript:
        allStrings = transcript.read()

    # Utterances are typically separated by a blank line, which may contain
    # stray spaces or tabs (seen in bos_log_lw_2 and dca_f2_3). Mark each such
    # boundary with a placeholder that cannot be confused with a newline.
    allStrings = re.sub(r'\n *\t*\n *', "`*`", allStrings)

    # dfw utterances are not always split by blank lines, so also break
    # wherever one closing "))" is directly followed by an opening "((".
    allStrings = re.sub(r' *\)\n*\) *\n\(\n*\(', "))`*`((", allStrings)

    # Remove the remaining line breaks, then re-add a newline only at the
    # marked boundaries.
    allStrings = allStrings.replace('\n', '')
    allStrings = allStrings.replace("`*`", "\n")

    # Build the frame directly instead of routing the text through a CSV
    # reader: every group stays a plain string (no NaN conversion, no quote
    # handling). Empty groups are dropped.
    groups = [group for group in allStrings.split("\n") if group]
    return pd.DataFrame({'Grouped Strings': groups})

In [19]:
# Anything written as (QUOTE ...) is not spoken(?) and thus unnecessary for our purposes and can be removed
def removeQuote(string):
    """Strip every "(QUOTE ...)" group from an utterance string.

    For each group, the text from "(QUOTE" through the next ")" is removed,
    together with the single character just before it (normally the
    separating space).

    Parameters
    ----------
    string : str
        Utterance text that may contain "(QUOTE ...)" groups.

    Returns
    -------
    str
        The text with all terminated quote groups removed. A quote group
        with no closing ")" is left in place.
    """
    while True:
        quoteIndex = string.find("(QUOTE")
        if quoteIndex == -1:
            return string
        endQuoteIndex = string.find(")", quoteIndex)
        if endQuoteIndex == -1:
            # Unterminated group: nothing safe to cut, so stop here.
            return string
        # max() keeps the slice end at 0 when the quote opens the string;
        # a plain quoteIndex - 1 would become -1 and keep almost everything.
        string = string[:max(quoteIndex - 1, 0)] + string[endQuoteIndex + 1:]

# replace sequence of 4 spaces with one space
def removeTab(string):
    """Collapse each run of four consecutive spaces into a single space."""
    # Splitting on the four-space run and re-joining with one space performs
    # the same left-to-right, non-overlapping substitution.
    pieces = string.split("    ")
    return " ".join(pieces)

def prepareString(string):
    """Clean one grouped string: remove quote groups first, then four-space runs."""
    return removeTab(removeQuote(string))

#df['Grouped Strings'] = df['Grouped Strings'].apply(prepareString)

In [20]:
#drop all strings that are not utterances
#utterances will always start with "((FROM"
def isUtterance(string):
    """Return True when the string begins with the "((FROM" utterance prefix."""
    # Anything shorter than the six-character prefix cannot match.
    return len(string) >= 6 and string[:6] == "((FROM"

#df = df[df['Grouped Strings'].apply(isUtterance) == True]

In [21]:
def getSpeaker(string):
    """Return the speaker named in the leading "((FROM <speaker>)" field.

    Parameters
    ----------
    string : str
        An utterance string; its first 7 characters are expected to be
        "((FROM " (this is not re-checked here).

    Returns
    -------
    str
        The text between index 7 and the next ")". If there is no ")" the
        rest of the string is returned, which is "" for a string shorter
        than the prefix.
    """
    startIndex = 7  # length of the "((FROM " prefix
    endIndex = string.find(")", startIndex)
    if endIndex == -1:
        endIndex = len(string)
    return string[startIndex:endIndex]

#df['Speaker'] = df['Grouped Strings'].apply(getSpeaker)

def getRecipient(string):
    """Return the recipient named in the "(TO <recipient>)" field.

    Parameters
    ----------
    string : str
        An utterance string.

    Returns
    -------
    str
        The text between "(TO " and the next ")". Returns "" when the
        string has no "(TO" field. If the field has no closing ")", the
        rest of the string is returned.
    """
    markerIndex = string.find("(TO")
    if markerIndex == -1:
        # Without this guard find() == -1 would point the slice at index 3
        # and return part of the FROM field.
        return ""
    recipIndex = markerIndex + 4  # skip "(TO" plus the separating space
    recipEndIndex = string.find(")", recipIndex)
    if recipEndIndex == -1:
        recipEndIndex = len(string)
    return string[recipIndex:recipEndIndex]

#df['Recipient'] = df['Grouped Strings'].apply(getRecipient)

In [22]:
def getTimes(string):
    """Return the contents of the "(TIMES ...)" field with whitespace normalized.

    Parameters
    ----------
    string : str
        An utterance string.

    Returns
    -------
    str
        The field's whitespace-separated tokens joined by single spaces,
        e.g. "1.49 6.57". Returns "" when the string has no "(TIMES"
        field. If the field has no closing ")", the rest of the string is
        used.
    """
    markerIndex = string.find("(TIMES")
    if markerIndex == -1:
        # Without this guard find() == -1 would point the slice at index 6
        # and return unrelated text.
        return ""
    timeIndex = markerIndex + 7  # skip "(TIMES" plus the separating space
    timeEndIndex = string.find(")", timeIndex)
    if timeEndIndex == -1:
        timeEndIndex = len(string)
    # split()/join collapses any run of whitespace between the values.
    return " ".join(string[timeIndex:timeEndIndex].split())

def startTime(string):
    """Return the text before the first space of a "start end" string.

    When the string contains no space, the whole string is returned.
    """
    # partition() avoids slicing with find() == -1, which would drop the
    # last character of a single-value string.
    return string.partition(' ')[0]

def endTime(string):
    """Return the text after the first space of a "start end" string.

    When the string contains no space, the whole string is returned.
    """
    # A single split at the first space leaves the remainder as the last item.
    return string.split(' ', 1)[-1]

#df['start'] = df['Grouped Strings'].apply(getTimes)
#df['end'] = df['start'].apply(endTime)
#df['start'] = df['start'].apply(startTime)

In [23]:
def getText(string):
    """Return the contents of the "(TEXT ...)" field with whitespace normalized.

    One level of nested parentheses inside the text, such as
    "(UNINTELLIGIBLE)", is kept as part of the transcription.

    Parameters
    ----------
    string : str
        An utterance string.

    Returns
    -------
    str
        The transcription with runs of whitespace collapsed to single
        spaces. Returns "" when the string has no "(TEXT" field. If the
        field is never closed, the rest of the string is used.
    """
    markerIndex = string.find("(TEXT")
    if markerIndex == -1:
        # Without this guard find() == -1 would point the slice at index 5
        # and return unrelated text.
        return ""
    textIndex = markerIndex + 6  # skip "(TEXT" plus the separating space
    textEndIndex = textIndex
    length = len(string)
    while textEndIndex < length and string[textEndIndex] != ")":
        # A "(" opens a nested group whose closing ")" belongs to the text,
        # so jump over it instead of stopping there.
        if string[textEndIndex] == "(":
            nestedClose = string.find(")", textEndIndex)
            if nestedClose == -1:
                textEndIndex = length
                break
            textEndIndex = nestedClose
        textEndIndex = textEndIndex + 1
    return " ".join(string[textIndex:textEndIndex].split())

#df['Transcription'] = df['Grouped Strings'].apply(getText)

In [24]:
def getComment(string):
    """Return the text of the "(COMMENT ...)" field, or "" when there is none.

    The slice starts 10 characters after the start of "(COMMENT" and runs
    to the next ")"; double-quote characters are removed from the result.
    """
    markerIndex = string.find("(COMMENT")
    if markerIndex == -1:
        return ""
    commentIndex = markerIndex + 10
    commentEndIndex = commentIndex
    while string[commentEndIndex] != ")":
        commentEndIndex += 1
    return string[commentIndex:commentEndIndex].replace('"', "")
    
#df['Comment'] = df['Grouped Strings'].apply(getComment)

In [25]:
def prepareDataFrame():
    """Convert the currently configured transcript into a per-utterance frame.

    Takes no arguments: the module-level names filename, filepath,
    location, year, month, day, startTranscriptTime and transcriber must
    be assigned before the call.

    Returns
    -------
    pandas.DataFrame
        One row per utterance with the columns location, year, month, day,
        fileStartTime, speaker, recipient, start, end, transcription,
        comment, filePath, transcriber (in that order).
    """
    grouped = readFile(filename)
    grouped['Grouped Strings'] = grouped['Grouped Strings'].apply(prepareString)

    # Keep only the rows that are utterances and renumber them from 0.
    keep = grouped['Grouped Strings'].apply(isUtterance) == True
    df = grouped[keep].reset_index(drop = True)
    utterances = df['Grouped Strings']

    # Per-file metadata, identical on every row.
    df['location'] = location
    df['year'] = year
    df['month'] = month
    df['day'] = day
    df['fileStartTime'] = startTranscriptTime

    # Fields parsed out of each utterance.
    df['speaker'] = utterances.apply(getSpeaker)
    df['recipient'] = utterances.apply(getRecipient)
    times = utterances.apply(getTimes)
    df['start'] = times.apply(startTime)
    df['end'] = times.apply(endTime)
    df['transcription'] = utterances.apply(getText)
    df['comment'] = utterances.apply(getComment)

    df['filePath'] = filepath
    df['transcriber'] = transcriber

    # The raw grouped text is no longer needed once the fields are extracted.
    return df.drop('Grouped Strings', axis = 1)


# NOTE: the `df = prepareDataFrame()` call that stood here is disabled.
# It ran before filename/filepath/location/year/month/day/
# startTranscriptTime/transcriber were assigned by the per-file cells below,
# so on a fresh kernel (Restart & Run All) it raised NameError, and `df`
# was never used afterwards.
#df = prepareDataFrame()

In [89]:
# Globals read by prepareDataFrame() for bos/log_f1_1.
filename, filepath = "bos/log_f1_1.txt", "atc0_bos/data/audio/log_f1_1.sph"
location, year, month, day = "bos", "1991", "06", "26"
startTranscriptTime, transcriber = "2012 UTC", "FR"

df_bos_f1 = prepareDataFrame()

In [90]:
# Globals read by prepareDataFrame() for bos/log_f1_2.
# Judging by the start time, I would expect this transcript to overlap with
# f1_1, but I did not observe that when viewing the files.
filename, filepath = "bos/log_f1_2.txt", "atc0_bos/data/audio/log_f1_2.sph"
location, year, month, day = "bos", "1991", "06", "26"
startTranscriptTime, transcriber = "2130 UTC", "FR"

df_bos_f2 = prepareDataFrame()

In [91]:
# Globals read by prepareDataFrame() for bos/log_f1_3.
# No transcriber is listed, so that field is left empty.
filename, filepath = "bos/log_f1_3.txt", "atc0_bos/data/audio/log_f1_3.sph"
location, year, month, day = "bos", "1991", "06", "27"
startTranscriptTime, transcriber = "0900 EDT", ""

df_bos_f3 = prepareDataFrame()

In [92]:
# Globals read by prepareDataFrame() for bos/log_f1_4.
# No transcriber is listed, so that field is left empty.
filename, filepath = "bos/log_f1_4.txt", "atc0_bos/data/audio/log_f1_4.sph"
location, year, month, day = "bos", "1991", "06", "27"
startTranscriptTime, transcriber = "1445 UTC", ""

df_bos_f4 = prepareDataFrame()

In [93]:
# Globals read by prepareDataFrame() for bos/log_id_1.
filename, filepath = "bos/log_id_1.txt", "atc0_bos/data/audio/log_id_1.sph"
location, year, month, day = "bos", "1991", "06", "26"
startTranscriptTime, transcriber = "2340 EDT", "FR"

df_bos_id1 = prepareDataFrame()

In [94]:
# Globals read by prepareDataFrame() for bos/log_id_2.
filename, filepath = "bos/log_id_2.txt", "atc0_bos/data/audio/log_id_2.sph"
location, year, month, day = "bos", "1991", "06", "27"
startTranscriptTime, transcriber = "0636 EDT", "FR"

df_bos_id2 = prepareDataFrame()

In [95]:
# Globals read by prepareDataFrame() for bos/log_id_3.
# No transcriber is listed, so that field is left empty.
filename, filepath = "bos/log_id_3.txt", "atc0_bos/data/audio/log_id_3.sph"
location, year, month, day = "bos", "1991", "06", "27"
startTranscriptTime, transcriber = "1325 EDT", ""

df_bos_id3 = prepareDataFrame()

In [96]:
# Globals read by prepareDataFrame() for bos/log_lw_1.
# Unlike the other files, this audio path has a different extension (.wav).
filename, filepath = "bos/log_lw_1.txt", "atc0_bos/data/audio/log_lw_1.wav"
location, year, month, day = "bos", "1991", "06", "26"
startTranscriptTime, transcriber = "1945 EDT", "JLO"

df_bos_lw1 = prepareDataFrame()

In [97]:
# Globals read by prepareDataFrame() for bos/log_lw_2.
# The start time is not given, so that field is left empty.
filename, filepath = "bos/log_lw_2.txt", "atc0_bos/data/audio/log_lw_2.sph"
location, year, month, day = "bos", "1991", "06", "27"
startTranscriptTime, transcriber = "", "JO"

df_bos_lw2 = prepareDataFrame()

In [98]:
# Globals read by prepareDataFrame() for bos/log_sm_1.
filename, filepath = "bos/log_sm_1.txt", "atc0_bos/data/audio/log_sm_1.sph"
location, year, month, day = "bos", "1991", "06", "27"
startTranscriptTime, transcriber = "0636 EDT", "FR"

df_bos_sm1 = prepareDataFrame()

In [99]:
# Globals read by prepareDataFrame() for bos/log_sm_2.
filename, filepath = "bos/log_sm_2.txt", "atc0_bos/data/audio/log_sm_2.sph"
location, year, month, day = "bos", "1991", "06", "27"
startTranscriptTime, transcriber = "1300 EDT", "FR"

df_bos_sm2 = prepareDataFrame()

In [121]:
# Stack the eleven bos frames, in file order, under a fresh 0..n-1 index.
df_bos = pd.concat(
    objs = [
        df_bos_f1, df_bos_f2, df_bos_f3, df_bos_f4,
        df_bos_id1, df_bos_id2, df_bos_id3,
        df_bos_lw1, df_bos_lw2,
        df_bos_sm1, df_bos_sm2,
    ],
    ignore_index = True,
)
#df_bos #8327

In [101]:
# Globals read by prepareDataFrame() for dca/dca_d1_1.
filename, filepath = "dca/dca_d1_1.txt", "atc0_dca/data/audio/dca_d1_1.sph"
location, year, month, day = "dca", "1992", "05", "26"
startTranscriptTime, transcriber = "1542 EDT", "JLO"

df_dca_d1_1 = prepareDataFrame()

In [102]:
# Globals read by prepareDataFrame() for dca/dca_d1_2.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_d1_2.txt", "atc0_dca/data/audio/dca_d1_2.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_d1_2 = prepareDataFrame()

In [103]:
# Globals read by prepareDataFrame() for dca/dca_d1_3.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_d1_3.txt", "atc0_dca/data/audio/dca_d1_3.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_d1_3 = prepareDataFrame()

In [104]:
# Globals read by prepareDataFrame() for dca/dca_d1_4.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_d1_4.txt", "atc0_dca/data/audio/dca_d1_4.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_d1_4 = prepareDataFrame()

In [105]:
# Globals read by prepareDataFrame() for dca/dca_d2_1.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_d2_1.txt", "atc0_dca/data/audio/dca_d2_1.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_d2_1 = prepareDataFrame()

In [106]:
# Globals read by prepareDataFrame() for dca/dca_d2_2.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_d2_2.txt", "atc0_dca/data/audio/dca_d2_2.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_d2_2 = prepareDataFrame()

In [107]:
# Globals read by prepareDataFrame() for dca/dca_f1_1.
filename, filepath = "dca/dca_f1_1.txt", "atc0_dca/data/audio/dca_f1_1.sph"
location, year, month, day = "dca", "1992", "05", "26"
startTranscriptTime, transcriber = "1424 EDT", "JLO"

df_dca_f1_1 = prepareDataFrame()

In [108]:
# Globals read by prepareDataFrame() for dca/dca_f1_2.
filename, filepath = "dca/dca_f1_2.txt", "atc0_dca/data/audio/dca_f1_2.sph"
location, year, month, day = "dca", "1992", "05", "26"
startTranscriptTime, transcriber = "2005 EDT", "JLO"

df_dca_f1_2 = prepareDataFrame()

In [109]:
# Globals read by prepareDataFrame() for dca/dca_f2_1.
filename, filepath = "dca/dca_f2_1.txt", "atc0_dca/data/audio/dca_f2_1.sph"
location, year, month, day = "dca", "1992", "07", "16"
startTranscriptTime, transcriber = "1900 EDT", "JLO"

df_dca_f2_1 = prepareDataFrame()

In [110]:
# Globals read by prepareDataFrame() for dca/dca_f2_2.
filename, filepath = "dca/dca_f2_2.txt", "atc0_dca/data/audio/dca_f2_2.sph"
location, year, month, day = "dca", "1992", "05", "27"
startTranscriptTime, transcriber = "1115 EDT", "JLO"

df_dca_f2_2 = prepareDataFrame()

In [111]:
# Globals read by prepareDataFrame() for dca/dca_f2_3.
filename, filepath = "dca/dca_f2_3.txt", "atc0_dca/data/audio/dca_f2_3.sph"
location, year, month, day = "dca", "1992", "05", "28"
startTranscriptTime, transcriber = "1845 EDT", "JLO"

df_dca_f2_3 = prepareDataFrame()

In [112]:
# Globals read by prepareDataFrame() for dca/dca_lc_1.
filename, filepath = "dca/dca_lc_1.txt", "atc0_dca/data/audio/dca_lc_1.sph"
location, year, month, day = "dca", "1992", "05", "26"
startTranscriptTime, transcriber = "1845 UTC", "FAR"

df_dca_lc_1 = prepareDataFrame()

In [113]:
# Globals read by prepareDataFrame() for dca/dca_lc_2.
filename = "dca/dca_lc_2.txt"
filepath = "atc0_dca/data/audio/dca_lc_2.sph"
location = "dca"
year = "1992"
month = "05"
day = "27"
# Zero-padded to four digits (was "740 EDT") so the value has the same
# HHMM layout as every other fileStartTime, e.g. "0636 EDT".
startTranscriptTime = "0740 EDT"
transcriber = "FAR"

df_dca_lc_2 = prepareDataFrame()

In [114]:
# Globals read by prepareDataFrame() for dca/dca_lc_3.
filename, filepath = "dca/dca_lc_3.txt", "atc0_dca/data/audio/dca_lc_3.sph"
location, year, month, day = "dca", "1992", "05", "27"
startTranscriptTime, transcriber = "2000 EDT", "FAR"

df_dca_lc_3 = prepareDataFrame()

In [115]:
# Globals read by prepareDataFrame() for dca/dca_lc_4.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_lc_4.txt", "atc0_dca/data/audio/dca_lc_4.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_lc_4 = prepareDataFrame()

In [116]:
# Globals read by prepareDataFrame() for dca/dca_lc_5.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_lc_5.txt", "atc0_dca/data/audio/dca_lc_5.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_lc_5 = prepareDataFrame()

In [117]:
# Globals read by prepareDataFrame() for dca/dca_lc_6.
# No year, month, day, or time was given, so those fields are left empty.
filename, filepath = "dca/dca_lc_6.txt", "atc0_dca/data/audio/dca_lc_6.sph"
location, year, month, day = "dca", "", "", ""
startTranscriptTime, transcriber = "", "FAR"

df_dca_lc_6 = prepareDataFrame()

In [119]:
# Stack the seventeen dca frames, in file order, under a fresh 0..n-1 index.
df_dca = pd.concat(
    objs = [
        df_dca_d1_1, df_dca_d1_2, df_dca_d1_3, df_dca_d1_4,
        df_dca_d2_1, df_dca_d2_2,
        df_dca_f1_1, df_dca_f1_2,
        df_dca_f2_1, df_dca_f2_2, df_dca_f2_3,
        df_dca_lc_1, df_dca_lc_2, df_dca_lc_3,
        df_dca_lc_4, df_dca_lc_5, df_dca_lc_6,
    ],
    ignore_index = True,
)
#df_dca #9741

In [122]:
# Globals read by prepareDataFrame() for dfw/dfw_a1_1 (no transcriber listed).
filename, filepath = "dfw/dfw_a1_1.txt", "atc0_dfw/data/audio/dfw_a1_1.sph"
location, year, month, day = "dfw", "1990", "03", "01"
startTranscriptTime, transcriber = "1400 CST", ""

df_dfw_a1_1 = prepareDataFrame()

In [123]:
# Globals read by prepareDataFrame() for dfw/dfw_a1_2.
filename = "dfw/dfw_a1_2.txt"
filepath = "atc0_dfw/data/audio/dfw_a1_2.sph"
location = "dfw"
# Was "2001", which cannot be right for this corpus; the other dfw files
# stamped 01/07 at 1550 CST (dfw_d1_1, dfw_d3_2, dfw_fe_2) all use 1991.
year = "1991"
month = "01"
day = "07"
startTranscriptTime = "1550 CST"
transcriber = "" #no transcriber listed

df_dfw_a1_2 = prepareDataFrame()

In [124]:
# Globals read by prepareDataFrame() for dfw/dfw_a2_1 (no transcriber listed).
filename, filepath = "dfw/dfw_a2_1.txt", "atc0_dfw/data/audio/dfw_a2_1.sph"
location, year, month, day = "dfw", "1990", "03", "28"
startTranscriptTime, transcriber = "1345 CST", ""

df_dfw_a2_1 = prepareDataFrame()

In [125]:
# Globals read by prepareDataFrame() for dfw/dfw_a2_2 (no transcriber listed).
filename, filepath = "dfw/dfw_a2_2.txt", "atc0_dfw/data/audio/dfw_a2_2.sph"
location, year, month, day = "dfw", "1991", "01", "07"
startTranscriptTime, transcriber = "1800 CST", ""

df_dfw_a2_2 = prepareDataFrame()

In [126]:
# Globals read by prepareDataFrame() for dfw/dfw_a2_3 (no transcriber listed).
filename, filepath = "dfw/dfw_a2_3.txt", "atc0_dfw/data/audio/dfw_a2_3.sph"
location, year, month, day = "dfw", "1991", "02", "13"
startTranscriptTime, transcriber = "1037 CST", ""

df_dfw_a2_3 = prepareDataFrame()

In [127]:
# Globals read by prepareDataFrame() for dfw/dfw_d1_1 (no transcriber listed).
filename, filepath = "dfw/dfw_d1_1.txt", "atc0_dfw/data/audio/dfw_d1_1.sph"
location, year, month, day = "dfw", "1991", "01", "07"
startTranscriptTime, transcriber = "1550 CST", ""

df_dfw_d1_1 = prepareDataFrame()

In [128]:
# Globals read by prepareDataFrame() for dfw/dfw_d1_2 (no transcriber listed).
filename, filepath = "dfw/dfw_d1_2.txt", "atc0_dfw/data/audio/dfw_d1_2.sph"
location, year, month, day = "dfw", "1991", "01", "07"
startTranscriptTime, transcriber = "1700 CST", ""

df_dfw_d1_2 = prepareDataFrame()

In [129]:
# Globals read by prepareDataFrame() for dfw/dfw_d1_3 (no transcriber listed).
filename, filepath = "dfw/dfw_d1_3.txt", "atc0_dfw/data/audio/dfw_d1_3.sph"
location, year, month, day = "dfw", "1991", "02", "13"
startTranscriptTime, transcriber = "1117 CST", ""

df_dfw_d1_3 = prepareDataFrame()

In [130]:
# Globals read by prepareDataFrame() for dfw/dfw_d2_1 (no transcriber listed).
filename, filepath = "dfw/dfw_d2_1.txt", "atc0_dfw/data/audio/dfw_d2_1.sph"
location, year, month, day = "dfw", "1990", "05", "18"
startTranscriptTime, transcriber = "1815 CST", ""

df_dfw_d2_1 = prepareDataFrame()

In [131]:
# Globals read by prepareDataFrame() for dfw/dfw_d2_2 (no transcriber listed).
filename, filepath = "dfw/dfw_d2_2.txt", "atc0_dfw/data/audio/dfw_d2_2.sph"
location, year, month, day = "dfw", "1991", "01", "07"
startTranscriptTime, transcriber = "1800 CST", ""

df_dfw_d2_2 = prepareDataFrame()

In [132]:
# Globals read by prepareDataFrame() for dfw/dfw_d3_2 (no transcriber listed).
filename, filepath = "dfw/dfw_d3_2.txt", "atc0_dfw/data/audio/dfw_d3_2.sph"
location, year, month, day = "dfw", "1991", "01", "07"
startTranscriptTime, transcriber = "1550 CST", ""

df_dfw_d3_2 = prepareDataFrame()

In [133]:
# Globals read by prepareDataFrame() for dfw/dfw_fe_1 (no transcriber listed).
filename, filepath = "dfw/dfw_fe_1.txt", "atc0_dfw/data/audio/dfw_fe_1.sph"
location, year, month, day = "dfw", "1990", "03", "01"
startTranscriptTime, transcriber = "1400 CST", ""

df_dfw_fe_1 = prepareDataFrame()

In [134]:
# Globals read by prepareDataFrame() for dfw/dfw_fe_2 (no transcriber listed).
filename, filepath = "dfw/dfw_fe_2.txt", "atc0_dfw/data/audio/dfw_fe_2.sph"
location, year, month, day = "dfw", "1991", "01", "07"
startTranscriptTime, transcriber = "1550 CST", ""

df_dfw_fe_2 = prepareDataFrame()

In [135]:
# Globals read by prepareDataFrame() for dfw/dfw_fw_1 (no transcriber listed).
filename, filepath = "dfw/dfw_fw_1.txt", "atc0_dfw/data/audio/dfw_fw_1.sph"
location, year, month, day = "dfw", "1990", "03", "28"
startTranscriptTime, transcriber = "1345 CST", ""

df_dfw_fw_1 = prepareDataFrame()

In [136]:
# Globals read by prepareDataFrame() for dfw/dfw_fw_2 (no transcriber listed).
filename, filepath = "dfw/dfw_fw_2.txt", "atc0_dfw/data/audio/dfw_fw_2.sph"
location, year, month, day = "dfw", "1990", "05", "19"
startTranscriptTime, transcriber = "0930 CST", ""

df_dfw_fw_2 = prepareDataFrame()

In [137]:
# Globals read by prepareDataFrame() for dfw/dfw_fw_3 (no transcriber listed).
filename, filepath = "dfw/dfw_fw_3.txt", "atc0_dfw/data/audio/dfw_fw_3.sph"
location, year, month, day = "dfw", "1991", "01", "07"
startTranscriptTime, transcriber = "1800 CST", ""

df_dfw_fw_3 = prepareDataFrame()

In [139]:
# Globals read by prepareDataFrame() for dfw/dfw_ge_1 (no transcriber listed).
filename, filepath = "dfw/dfw_ge_1.txt", "atc0_dfw/data/audio/dfw_ge_1.sph"
location, year, month, day = "dfw", "1990", "03", "01"
startTranscriptTime, transcriber = "1400 CST", ""

df_dfw_ge_1 = prepareDataFrame()

In [141]:
# Globals read by prepareDataFrame() for dfw/dfw_gw_1 (no transcriber listed).
filename, filepath = "dfw/dfw_gw_1.txt", "atc0_dfw/data/audio/dfw_gw_1.sph"
location, year, month, day = "dfw", "1990", "03", "28"
startTranscriptTime, transcriber = "1345 CST", ""

df_dfw_gw_1 = prepareDataFrame()

In [143]:
# Globals read by prepareDataFrame() for dfw/dfw_gw_2 (no transcriber listed).
filename, filepath = "dfw/dfw_gw_2.txt", "atc0_dfw/data/audio/dfw_gw_2.sph"
location, year, month, day = "dfw", "1990", "05", "18"
startTranscriptTime, transcriber = "1815 CST", ""

df_dfw_gw_2 = prepareDataFrame()

In [145]:
# Globals read by prepareDataFrame() for dfw/dfw_le_1 (no transcriber listed).
filename, filepath = "dfw/dfw_le_1.txt", "atc0_dfw/data/audio/dfw_le_1.sph"
location, year, month, day = "dfw", "1990", "03", "01"
startTranscriptTime, transcriber = "1400 CST", ""

df_dfw_le_1 = prepareDataFrame()

In [147]:
# Globals read by prepareDataFrame() for dfw/dfw_le_2 (no transcriber listed).
# NOTE(review): month "07" / day "01" may be swapped -- dfw_d1_2 has the same
# 1700 CST start time on 1991-01-07. Values are kept as found; confirm
# against the transcript header.
filename, filepath = "dfw/dfw_le_2.txt", "atc0_dfw/data/audio/dfw_le_2.sph"
location, year, month, day = "dfw", "1991", "07", "01"
startTranscriptTime, transcriber = "1700 CST", ""

df_dfw_le_2 = prepareDataFrame()

In [149]:
# Globals read by prepareDataFrame() for dfw/dfw_lw_1 (no transcriber listed).
filename, filepath = "dfw/dfw_lw_1.txt", "atc0_dfw/data/audio/dfw_lw_1.sph"
location, year, month, day = "dfw", "1990", "03", "28"
startTranscriptTime, transcriber = "1345 CST", ""

df_dfw_lw_1 = prepareDataFrame()

In [151]:
# Globals read by prepareDataFrame() for dfw/dfw_lw_2 (no transcriber listed).
filename, filepath = "dfw/dfw_lw_2.txt", "atc0_dfw/data/audio/dfw_lw_2.sph"
location, year, month, day = "dfw", "1990", "05", "18"
startTranscriptTime, transcriber = "1815 CST", ""

df_dfw_lw_2 = prepareDataFrame()

In [153]:
# Globals read by prepareDataFrame() for dfw/dfw_lw_3 (no transcriber listed).
filename, filepath = "dfw/dfw_lw_3.txt", "atc0_dfw/data/audio/dfw_lw_3.sph"
location, year, month, day = "dfw", "1990", "05", "19"
startTranscriptTime, transcriber = "0930 CST", ""

df_dfw_lw_3 = prepareDataFrame()

In [155]:
# Stack the twenty-four dfw frames, in file order, under a fresh 0..n-1 index.
df_dfw = pd.concat(
    objs = [
        df_dfw_a1_1, df_dfw_a1_2,
        df_dfw_a2_1, df_dfw_a2_2, df_dfw_a2_3,
        df_dfw_d1_1, df_dfw_d1_2, df_dfw_d1_3,
        df_dfw_d2_1, df_dfw_d2_2, df_dfw_d3_2,
        df_dfw_fe_1, df_dfw_fe_2,
        df_dfw_fw_1, df_dfw_fw_2, df_dfw_fw_3,
        df_dfw_ge_1,
        df_dfw_gw_1, df_dfw_gw_2,
        df_dfw_le_1, df_dfw_le_2,
        df_dfw_lw_1, df_dfw_lw_2, df_dfw_lw_3,
    ],
    ignore_index = True,
)
#df_dfw #12369

In [158]:
# Combine the three per-airport frames (bos, dca, dfw) under one fresh index.
df_atc0 = pd.concat(
    objs = [df_bos, df_dca, df_dfw],
    ignore_index = True,
)
#df_atc0 #30437

In [159]:
# Write the combined table to atc0.csv without the row index column.
df_atc0.to_csv(path_or_buf = 'atc0.csv', index = False)

In [160]:
# Read the CSV back for a round-trip check against df_atc0.
# dtype = str keeps every column as text: without it pandas re-infers
# numeric dtypes for any column whose values all look numeric, and that
# column would no longer match the string values held in df_atc0.
# keep_default_na = False keeps empty fields as "" instead of NaN.
check = pd.read_csv('atc0.csv', keep_default_na = False, dtype = str)
check

Unnamed: 0,location,year,month,day,fileStartTime,speaker,recipient,start,end,transcription,comment,filePath,transcriber
0,bos,1991,06,26,2012 UTC,NERA3788,F1-1,1.49,6.57,THOUSAND ONE NINETY WE GIVE YOU THAT ON THE SP...,,atc0_bos/data/audio/log_f1_1.sph,FR
1,bos,1991,06,26,2012 UTC,F1-1,NERA3788 GAA329,6.59,11.17,THANKS BIZEX THREE TWENTY NINE TURN LEFT HEADI...,CONTROLLER TALKED TO TWO AIRCRAFT IN SAME TRAN...,atc0_bos/data/audio/log_f1_1.sph,FR
2,bos,1991,06,26,2012 UTC,GAA329,F1-1,11.21,13.57,ZERO NINE ZERO THREE TWENTY NINE ROGER THAT SIR,,atc0_bos/data/audio/log_f1_1.sph,FR
3,bos,1991,06,26,2012 UTC,F1-1,USA268,14.31,20.34,U S AIR TWO SIXTY EIGHT YOU OVER LONER CLEARED...,,atc0_bos/data/audio/log_f1_1.sph,FR
4,bos,1991,06,26,2012 UTC,USA268,F1-1,20.74,22.75,CLEARED THE I L S TWO SEVEN U S AIR TWO SIXTY ...,,atc0_bos/data/audio/log_f1_1.sph,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30432,dfw,1990,05,19,0930 CST,LCW-1,UNK,3571.58,3573.33,ONE EIGHT ZERO AT ONE FOUR,,atc0_dfw/data/audio/dfw_lw_3.sph,
30433,dfw,1990,05,19,0930 CST,LCW-1,MRA663,3574.54,3578.99,MARTINAIRE SIX SIX THREE START YOUR RIGHT TURN...,,atc0_dfw/data/audio/dfw_lw_3.sph,
30434,dfw,1990,05,19,0930 CST,MRA663,LCW-1,3579.12,3580.15,SIX SIX THREE WILL DO,,atc0_dfw/data/audio/dfw_lw_3.sph,
30435,dfw,1990,05,19,0930 CST,LCW-1,AAL289,3580.31,3583.09,AMERICAN TWO EIGHTY NINE RUNWAY ONE EIGHT LEFT...,,atc0_dfw/data/audio/dfw_lw_3.sph,


In [161]:
# Round-trip check: the frame read back from disk should equal the one written.
print(df_atc0.equals(check))

True


In [162]:
# Preview the first 20 rows of the dca_d1_1 frame.
df_dca_d1_1.iloc[:20]

Unnamed: 0,location,year,month,day,fileStartTime,speaker,recipient,start,end,transcription,comment,filePath,transcriber
0,dca,1992,5,26,1542 EDT,DR1-1,DAL209,63.04,66.01,DELTA TWO ZERO NINE TURN LEFT HEADING TWO EIGH...,,atc0_dca/data/audio/dca_d1_1.sph,JLO
1,dca,1992,5,26,1542 EDT,DAL209,DR1-1,66.65,69.08,LEFT TO TWO EIGHTY DELTA TWO OH NINE,,atc0_dca/data/audio/dca_d1_1.sph,JLO
2,dca,1992,5,26,1542 EDT,AAL1581,DR1-1,93.55,96.11,APPROACH AMERICAN EIGHT AH FIFTEEN EIGHTY ONE ...,,atc0_dca/data/audio/dca_d1_1.sph,JLO
3,dca,1992,5,26,1542 EDT,DR1-1,AAL1581,96.63,101.06,AMERICAN FIFTEEN EIGHTY ONE WASHINGTON DEPARTU...,,atc0_dca/data/audio/dca_d1_1.sph,JLO
4,dca,1992,5,26,1542 EDT,AAL1581,DR1-1,101.3,103.46,UP TO ONE SEVEN THOUSAND AMERICAN FIFTEEN EIGH...,,atc0_dca/data/audio/dca_d1_1.sph,JLO
5,dca,1992,5,26,1542 EDT,DR1-1,DAL209,106.51,109.48,DELTA TWO OH NINE TURN LEFT HEADING TWO FOUR ZERO,,atc0_dca/data/audio/dca_d1_1.sph,JLO
6,dca,1992,5,26,1542 EDT,DAL209,DR1-1,109.85,112.06,LEFT TO TWO FORTY DELTA TWO OH NINE,,atc0_dca/data/audio/dca_d1_1.sph,JLO
7,dca,1992,5,26,1542 EDT,DR1-1,DAL209,195.95,201.46,DELTA TWO OH NINE TURN RIGHT AH HEADING TWO SE...,,atc0_dca/data/audio/dca_d1_1.sph,JLO
8,dca,1992,5,26,1542 EDT,DAL209,DR1-1,201.98,204.77,OKAY WE'LL GO DIRECT LINDEN RIGHT NOW DELTA TW...,,atc0_dca/data/audio/dca_d1_1.sph,JLO
9,dca,1992,5,26,1542 EDT,DR1-1,AAL1581,205.16,208.22,AMERICAN FIFTEEN EIGHTY ONE TURN LEFT HEADING ...,,atc0_dca/data/audio/dca_d1_1.sph,JLO


In [174]:
# Spot check: which audio file does the row labelled 27000 belong to?
df_atc0.loc[27000, 'filePath']

'atc0_dfw/data/audio/dfw_gw_2.sph'