#### Modification of Segun's code

In [1]:
import os, datetime, json, time
from collections import OrderedDict
import pandas as pd
from xml.etree import ElementTree as ET

In [47]:
# Show the working directory: the relative paths used in later cells
# ('./congressional-record/...', './crec2023_2024.csv') are resolved against it.
print("Current Directory:", os.getcwd())

Current Directory: c:\Users\agust\Documents\SEDS\SEDS_thesis\1_congress_data_proc\US_Congressional_speeches_EMI


In [60]:
# Inclusive date range to process, written as ISO yyyy-mm-dd strings
start_date, end_date = '2023-01-04', '2024-12-31'

# `day` is the cursor advanced by the collection loop; `end_day` is its upper bound
day, end_day = (datetime.datetime.strptime(s, '%Y-%m-%d')
                for s in (start_date, end_date))

# Set to True to write the assembled dataframe to CSV
DO_SAVE = False

# Accumulator: one mapping per speech, filled by the collection loop
data = list()

In [2]:
def remove_extra_spaces(s):
    """Collapse every run of consecutive spaces into one and trim the ends.

    Only the literal space character is collapsed; other whitespace (tabs,
    newlines) is left alone inside the string, but leading and trailing
    whitespace of any kind is removed by the final ``strip()``.

    Args:
        s: The string to normalise.

    Returns:
        The normalised string.
    """
    # Splitting on a single space yields empty pieces wherever spaces were
    # adjacent; dropping those pieces and re-joining leaves single spaces only.
    pieces = [piece for piece in s.split(' ') if piece]
    return ' '.join(pieces).strip()

In [61]:
# Collect one row per speech for every calendar day in [start_date, end_date].
#
# The cursor and the accumulator are re-initialised here so that re-running this
# cell reproduces the same result instead of silently doing nothing (cursor
# already past `end_day`), appending duplicate rows, or failing because `data`
# was deleted by the dataframe cell.
day  = datetime.datetime.strptime(start_date, '%Y-%m-%d')
data = []

# Mapping from the chamber name found in the JSON header to the one-letter code
chamber_codes = {'House': 'H', 'Senate': 'S'}

# Location of the congress number inside the MODS metadata file
congress_xpath = './{http://www.loc.gov/mods/v3}extension/{http://www.loc.gov/mods/v3}congress'

while day <= end_day:
    cur_date_str = day.strftime('%Y-%m-%d')

    base_dir_xml  = os.path.join('.', 'congressional-record', 'output', str(day.year), f'CREC-{cur_date_str}')
    base_dir_json = os.path.join(base_dir_xml, 'json')

    # Days without a json directory are skipped
    if os.path.exists(base_dir_json):

        start_time = time.time()

        # Get congress number from XML file
        xmlfile = os.path.join(base_dir_xml, 'mods.xml')
        if not os.path.exists(xmlfile):
            raise Exception('Metadata XML file not found for %s' % base_dir_xml)

        congress_elem = ET.parse(xmlfile).getroot().find(congress_xpath)
        if congress_elem is None or congress_elem.text is None:
            # Fail with a readable message rather than an AttributeError on None
            raise Exception('Congress number not found in %s' % xmlfile)
        congress_num = int(congress_elem.text)

        num_speeches = 0
        # Sorted so that the row order does not depend on the filesystem's listing order
        for fname in sorted(os.listdir(base_dir_json)):
            if fname.endswith('.xml'):  # report any xml files found next to the json files
                print(fname)

            if not fname.endswith('.json'):  # only json files are parsed
                continue

            # Context manager closes the handle; JSON text is read as UTF-8
            with open(os.path.join(base_dir_json, fname), 'r', encoding='utf-8') as json_file:
                doc = json.load(json_file)
            h = doc['header']

            if h['chamber'] not in chamber_codes:
                raise Exception('Unknown chamber %s' % h['chamber'])
            chamber = chamber_codes[h['chamber']]

            for ndx, c in enumerate(doc['content']):

                # Only 'speech' items become rows; ndx still counts every content item
                if c['kind'] != 'speech':
                    continue

                num_speeches += 1

                speaker = c['speaker'].strip()
                # NOTE(review): this check is case-sensitive, so speakers written in
                # upper case (e.g. 'The CLERK') do not trigger it — confirm intent.
                if 'clerk' in speaker:
                    raise Exception('Found a speech by the clerk')

                text = c['text'].strip()

                # Remove speaker's name from beginning of speech
                if text.startswith(speaker + '.'):
                    text = text[len(speaker + '.'):]
                elif text.startswith(speaker + ' .'):
                    text = text[len(speaker + ' .'):]
                else:
                    print(speaker + '/' + text[:100])
                    raise Exception('Text doesn\'t start with speaker name')

                # Flatten to a single line with single spaces
                text = remove_extra_spaces(text.replace('\n', ' '))

                # Document id plus the item's position within the document's content
                speech_id = '%s-%d' % (doc['id'], ndx)

                # This creates another row that will go into the final dataframe
                row = OrderedDict(
                    speech_id    = speech_id,
                    speech       = text,
                    chamber      = chamber,
                    is_extension = h['extension'],
                    date         = cur_date_str,
                    speaker      = speaker,
                    speaker_bioguide = c['speaker_bioguide'],
                    vol          = h['vol'],
                    num          = h['num'],
                    congress_num = congress_num,
                    pages        = h['pages'],
                    doc_title    = doc['doc_title'],
                    title        = doc['title'])
                data.append(row)

        load_time = time.time() - start_time
        print("Loaded %4d speeches from %s (time: %0.2f s tot, %4.1f ms/speech)" %
              (num_speeches, cur_date_str, load_time, 1000*load_time/num_speeches if num_speeches > 0 else 0) )

    day += datetime.timedelta(days=1)

Loaded    5 speeches from 2023-01-04 (time: 0.01 s tot,  2.2 ms/speech)
Loaded   51 speeches from 2023-01-05 (time: 0.01 s tot,  0.2 ms/speech)
Loaded  270 speeches from 2023-01-09 (time: 0.06 s tot,  0.2 ms/speech)
Loaded  225 speeches from 2023-01-10 (time: 0.09 s tot,  0.4 ms/speech)
Loaded  299 speeches from 2023-01-11 (time: 0.04 s tot,  0.1 ms/speech)
Loaded  151 speeches from 2023-01-12 (time: 0.03 s tot,  0.2 ms/speech)
Loaded    9 speeches from 2023-01-13 (time: 0.04 s tot,  4.4 ms/speech)
Loaded   20 speeches from 2023-01-17 (time: 0.03 s tot,  1.6 ms/speech)
Loaded   19 speeches from 2023-01-20 (time: 0.02 s tot,  1.0 ms/speech)
Loaded   60 speeches from 2023-01-23 (time: 0.01 s tot,  0.2 ms/speech)
Loaded  225 speeches from 2023-01-24 (time: 0.04 s tot,  0.2 ms/speech)
Loaded  335 speeches from 2023-01-25 (time: 0.11 s tot,  0.3 ms/speech)
Loaded 1187 speeches from 2023-01-26 (time: 0.28 s tot,  0.2 ms/speech)
Loaded  342 speeches from 2023-01-27 (time: 0.39 s tot,  1.1 ms/

In [62]:
# Assemble the collected rows into a typed dataframe and optionally save it as CSV.
# NOTE(review): casting with `str` may turn missing values (e.g. a null
# speaker_bioguide or title) into literal text rather than NA — confirm this is
# acceptable for downstream use.
if not data:
    print("No data found!")
else:
    df = pd.DataFrame(data).astype(dict(
        speech_id        = str,
        speech           = str,
        chamber          = str,
        is_extension     = bool,
        date             = 'datetime64[ns]',
        speaker          = str,
        speaker_bioguide = str,
        vol              = int,
        num              = int,
        congress_num     = int,
        pages            = str,
        doc_title        = str,
        title            = str,
    ))
    # The row list is no longer needed once the dataframe exists
    del data
    if DO_SAVE:
        df.to_csv('./crec2023_2024.csv', index=False)

In [69]:
# Independent copy of the assembled dataframe; later edits to `parsed_df` do not touch `df`
parsed_df = df.copy()

In [70]:
# Quick sanity check: total row count and the number of rows flagged as
# extensions (sum of the boolean column), followed by a display of the frame
print("Rows:", len(parsed_df))
print("Extensions:", parsed_df["is_extension"].sum())
parsed_df

Rows: 98187
Extensions: 8660


Unnamed: 0,speech_id,speech,chamber,is_extension,date,speaker,speaker_bioguide,vol,num,congress_num,pages,doc_title,title
0,CREC-2023-01-04-pt1-PgH17-2-0,"Madam Clerk, I move that the House do now adjo...",H,False,2023-01-04,Mr. COLE,C001053,169,3,118,H17-H18,MOTION TO ADJOURN,MOTION TO ADJOURN
1,CREC-2023-01-04-pt1-PgH17-2-1,The question is on the motion of the gentleman...,H,False,2023-01-04,The CLERK,,169,3,118,H17-H18,MOTION TO ADJOURN,MOTION TO ADJOURN
2,CREC-2023-01-04-pt1-PgH17-2-5,"Madam Clerk, on that I demand the yeas and nays.",H,False,2023-01-04,Mr. AGUILAR,A000371,169,3,118,H17-H18,MOTION TO ADJOURN,MOTION TO ADJOURN
3,CREC-2023-01-04-pt1-PgH17-1,The Chair has examined the Journal of the last...,H,False,2023-01-04,The CLERK,,169,3,118,H17,Senate,
4,CREC-2023-01-04-pt1-PgH17-2,Representatives-elect are invited to join in t...,H,False,2023-01-04,The CLERK,,169,3,118,H17,Senate,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98182,CREC-2024-12-31-pt1-PgE1336-3,"Mr. Speaker, I rise to honor Mr. Marvin Dennis...",H,True,2024-12-31,Mr. GRIFFITH,G000568,170,196,118,E1336,"HONORING THE LIFE OF MARVIN REASER, Jr.",
98183,CREC-2024-12-31-pt1-PgH7429-4-0,Pursuant to section 3(z) of House Resolution 5...,H,False,2024-12-31,The SPEAKER pro tempore,,170,196,118,H7429,THE JOURNAL,THE JOURNAL
98184,CREC-2024-12-31-pt1-PgH7429-5-0,The Chair will lead the House in the Pledge of...,H,False,2024-12-31,The SPEAKER pro tempore,,170,196,118,H7429,PLEDGE OF ALLEGIANCE,PLEDGE OF ALLEGIANCE
98185,CREC-2024-12-31-pt1-PgH7430-3-2,"Without objection, the resolution was agreed to.",H,False,2024-12-31,The SPEAKER pro tempore,,170,196,118,H7430,EXPRESSING THE PROFOUND REGRET AND SORROW OF T...,EXPRESSING THE PROFOUND REGRET AND SORROW OF T...
