In [1]:
import warnings
warnings.filterwarnings('ignore')
from os import listdir
from os.path import join, isfile
import csv
import pandas as pd
import re

'''
    read_file_lines: Helper function for batch_import(). Reads a file
        line by line and returns a list of its lines.
        Note: It filters out empty lines.
'''
def read_file_lines(**kwargs):
    """Read one text file and return its non-empty lines.

    Keyword Args:
        path: Directory that contains the file.
        file: Name of the file inside ``path``.
        encoding: Optional text encoding forwarded to ``open``. When omitted
            (or None) the platform default is used.

    Returns:
        list[str]: The file's lines in order, each with trailing newline
        characters removed. Lines that are empty after that are dropped;
        whitespace-only lines are kept.
    """
    file_path = join(kwargs['path'], kwargs['file'])

    # The context manager closes the handle even if reading raises, instead of
    # leaving the file open until garbage collection.
    with open(file_path, encoding=kwargs.get('encoding')) as handle:
        stripped = (line.rstrip('\n') for line in handle)
        return [line for line in stripped if line != '']

'''
    batch_import: Import all files of the given type in a directory and read each file line by line.
        Returns a dict keyed by the part of each filename matched by 'regex_filename';
        each value is a dict holding that file's lines as a list under the key 'lines'.
'''
def batch_import(**kwargs):
    """Read every matching file in a directory into a dict of line lists.

    Keyword Args:
        path: Directory to scan (not recursive).
        file_type: Suffix a filename must end with to be imported.
        regex_filename: Pattern searched in each filename; the matched text
            becomes that file's key in the result.

    Returns:
        dict: ``{matched_name: {'lines': [...]}}`` with entries inserted in
        sorted filename order. Hidden entries (names starting with '.') and
        non-files are ignored. Files whose name does not match
        ``regex_filename`` are skipped. If two files produce the same key,
        the later one in sorted order wins.
    """
    directory = kwargs['path']
    file_type = kwargs['file_type']
    re_filename = kwargs['regex_filename']

    # Visible regular files with the requested suffix, in deterministic order.
    list_of_files = sorted(
        f for f in listdir(directory)
        if not f.startswith('.')
        and f.endswith(file_type)
        and isfile(join(directory, f))
    )

    dict_files = {}
    for f in list_of_files:
        event_filename = re.search(re_filename, f)
        # A name can pass the suffix filter yet not match the pattern
        # (e.g. 'notestxt' vs. a pattern requiring '.txt'); skip it rather
        # than fail on indexing a missing match.
        if event_filename is None:
            continue
        dict_files[event_filename[0]] = {
            'lines': read_file_lines(path=directory, file=f)
        }

    return dict_files


In [3]:
# Import every visible file in the events directory whose name ends in 'txt'.
# The look-ahead pattern keys each entry by the filename text that precedes
# its last '.txt'.
dict_event_lines = batch_import(
    regex_filename=r".{1,}(?=\.txt)",
    path='../data/organized_by_event',
    file_type='txt'
)
# Preview the first three lines read for one event.
dict_event_lines['01-Feb_26_Coronavirus_Updates']['lines'][:3]

['Donald Trump: <05:39>',
 'Thank you very much everybody. Thank you very much. Before I begin, I’d like to extend my deepest condolences to the victims and families in Milwaukee, Wisconsin. Earlier today, a wicked murderer opened fire at a Molson-Coors Brewing Company plant, taking the lives of five people. A number of people were wounded, some badly wounded. Our hearts break for them and their loved ones. We send our condolences, we’ll be with them and it’s a terrible thing. Terrible thing. So our hearts go out to the people of Wisconsin and to the families. Thank you very much. I’ve just received another briefing from a great group of talented people on the virus that is going around to various parts of the world. We have, through some very good early decisions, decisions that were actually ridiculed at the beginning, we closed up our borders to flights coming in from certain areas. Areas that were hit by the coronavirus and hit pretty hard, and we did it very early.',
 'Donald Trum

In [4]:
'''
    Speaker + Timestamp Header:
        - Identifies header for text
        - (^[a-zA-Z]{1,}\:\s\<(\d{2}\:){1,2}\d{2}\>)|([a-zA-Z]{1,}\s\d{1,}\:\s\<(\d{2}\:){1,2}\d{2}\>)|([a-zA-Z]{1,}\s[a-zA-Z]{1,}\:\s\<(\d{2}\:){1,2}\d{2}\>)
        Speaker Tag:
            - when found, cut off the last 3 characters [:-3]: 'Donald Trump: <', 'Crowd: <'
            - (^[a-zA-Z]{1,}\:\s\<)|([a-zA-Z]{1,}\s\d{1,}\:\s\<)|([a-zA-Z]{1,}\s[a-zA-Z]{1,}\:\s\<)
        Timestamp Tag:
            - when found, cut off the first 3 characters and the last character [3:-1]: ': <28:05>', ': <01:00:44>'
            - \:\s\<(\d{2}\:){1,2}\d{2}\>
    Text:
        - when found, cut off the first 2 characters ('>\n') and the last character ('\n'): [2:-1]
        - \>\n.+\n
    Remove Tags in Text:
        - \<[a-z]{1,}\s(\d{2}\:){1,2}\d{2}\>
'''
def data_organizer(**kwargs):
    """Group each event's transcript lines by speaker and by timestamp.

    A line counts as a header when ``re_header`` matches it; the line
    immediately after a header is taken as that header's text.

    Keyword Args:
        data: dict mapping event name -> ``{'lines': [str, ...]}``.
        re_header: Pattern identifying a "speaker + timestamp" header line.
        re_speaker: Pattern whose match, minus its last 3 characters, is the
            speaker name (e.g. 'Donald Trump: <' -> 'Donald Trump').
        re_ts: Pattern whose match, minus its first 3 characters and last
            character, is the timestamp (e.g. ': <28:05>' -> '28:05').
        re_tags: May be supplied by callers, but is not read by this function.

    Returns:
        dict: ``{event: {'speakers': {speaker: [text, ...]},
        'temporal': [[(timestamp, speaker, text)], ...]}}``.
        Each 'temporal' entry is a one-element list wrapping the tuple.
        The speaker slot is None when ``re_speaker`` did not match the header.
        A header on the last line of an event has no text and is skipped.
    """
    data = kwargs['data']
    re_header = kwargs['re_header']
    re_speaker = kwargs['re_speaker']
    re_ts = kwargs['re_ts']

    new_dict = {}
    for event in data:
        lines = data[event]['lines']
        speakers = {}
        temporal = []
        new_dict[event] = {'speakers': speakers, 'temporal': temporal}

        for index, line in enumerate(lines):
            if re.search(re_header, line) is None:
                continue
            # A trailing header has no following text line; indexing past the
            # end of the list would raise IndexError.
            if index + 1 >= len(lines):
                continue
            text = lines[index + 1]

            # Resolve the speaker on every header so the timestamp entry can
            # never reuse a name left over from an earlier header.
            speaker = re.search(re_speaker, line)
            sp = speaker[0][:-3] if speaker else None
            if sp is not None:
                speakers.setdefault(sp, []).append(text)

            ts = re.search(re_ts, line)
            if ts:
                timestamp = ts[0][3:-1]
                temporal.append([(timestamp, sp, text)])

    return new_dict

In [5]:
# Organize every imported event by speaker and by timestamp.
# NOTE(review): re_tags is passed here, but data_organizer as defined in this
# notebook never reads kwargs['re_tags'], so inline tags are not removed.
test = data_organizer(
    data=dict_event_lines,
    re_header=r"(^[a-zA-Z]{1,}\:\s\<(\d{2}\:){1,2}\d{2}\>)|([a-zA-Z]{1,}\s\d{1,}\:\s\<(\d{2}\:){1,2}\d{2}\>)|([a-zA-Z]{1,}\s[a-zA-Z]{1,}\:\s\<(\d{2}\:){1,2}\d{2}\>)",
    re_speaker=r"(^[a-zA-Z]{1,}\:\s\<)|([a-zA-Z]{1,}\s\d{1,}\:\s\<)|([a-zA-Z]{1,}\s[a-zA-Z]{1,}\:\s\<)",
    re_ts=r"\:\s\<(\d{2}\:){1,2}\d{2}\>",
    re_tags=r"\<[a-z]{1,}\s(\d{2}\:){1,2}\d{2}\>"
)
# Preview the first three text entries stored for one speaker in one event.
test['01-Feb_26_Coronavirus_Updates']['speakers']['Alex Azar'][:3]

['Well, thank you Mr. Vice President, and thank you, Mr. President for gathering your public health experts here today and for your strong leadership in keeping America safe, and I just want to say I could not be more delighted that you’ve asked the Vice President, my old friend and colleague, to lead this whole of government approach with us under the emergency support function number eight. As of today, we have 15 cases of COVID-19 that have been detected in the United States, with only one new case detected in the last two weeks. We also have three cases among Americans repatriated from Wuhan, and 42 cases among Americans repatriated who had been stuck on the Diamond Princess in Japan. The President’s early and decisive actions, including travel restrictions, have succeeded in buying us incredibly valuable time. This has helped us contain the spread of the virus, handle the cases that we have, and prepare for the possibility that we will need to mitigate broader spread of infections

In [6]:
# Preview the first three entries of the same event's 'temporal' list
# (entries are appended in the order their headers appear in the file).
test['01-Feb_26_Coronavirus_Updates']['temporal'][:3]

[[('05:39',
   'Donald Trump',
   'Thank you very much everybody. Thank you very much. Before I begin, I’d like to extend my deepest condolences to the victims and families in Milwaukee, Wisconsin. Earlier today, a wicked murderer opened fire at a Molson-Coors Brewing Company plant, taking the lives of five people. A number of people were wounded, some badly wounded. Our hearts break for them and their loved ones. We send our condolences, we’ll be with them and it’s a terrible thing. Terrible thing. So our hearts go out to the people of Wisconsin and to the families. Thank you very much. I’ve just received another briefing from a great group of talented people on the virus that is going around to various parts of the world. We have, through some very good early decisions, decisions that were actually ridiculed at the beginning, we closed up our borders to flights coming in from certain areas. Areas that were hit by the coronavirus and hit pretty hard, and we did it very early.')],
 [('