# In [None]:
import pandas as pd 
from glob import glob
from tqdm import tqdm 
import re

# Dataset variant to process; it is substituted into every input and output path.
dataset = 'bound'

# Speaker maps: data that relates speaker information with speech ids.
print('Loading speaker maps...')

speaker_map_pattern = '../data/us_congressional_record/hein-{}/*SpeakerMap.txt'.format(dataset)
filelist = glob(speaker_map_pattern)

# Read every pipe-delimited speaker-map file, then stack them into one frame
# with a fresh 0..n-1 index.
df_list = []
for file in tqdm(filelist):
    df_list.append(pd.read_csv(file, sep='|'))
speaker_map = pd.concat(df_list, ignore_index=True)

"""
Speeches = the US Congressional speech records.
"""

print('Loading speeches...')

filelist = glob('../data/us_congressional_record/hein-{}/speeches*.txt'.format(dataset))

# The files are parsed by hand because pd.read_csv reports errors on them.
# Each file is read as bytes (so lines are split on b'\n' only) and every line
# is decoded explicitly. Calling str() on a bytes object returns its repr
# ("b'...\\n'") rather than its text: that leaves an escaped newline and a
# quote character at the end of each speech, and the prefix becomes b"..."
# whenever the line contains an apostrophe, so the speech id can no longer be
# matched against the speaker map.
# NOTE(review): the encoding is assumed to be UTF-8; bytes that cannot be
# decoded are replaced with U+FFFD instead of raising -- confirm against the data.
frames = []
for file in tqdm(filelist):
    with open(file, 'rb') as f:
        lines = f.readlines()

    rows = [
        # maxsplit=1 keeps any '|' inside the speech text as part of the speech.
        line.decode('utf-8', errors='replace').strip().split('|', 1)
        for line in lines[1:]  # the first line of each file is the header
    ]
    # Lines without a delimiter (e.g. blank lines) carry no speech; skip them
    # instead of failing on the missing second field.
    rows = [row for row in rows if len(row) == 2]

    frames.append(pd.DataFrame(rows, columns=['speech_id', 'speech']))

# Concatenate once, after the loop, so the accumulated frame is not recopied
# for every file. With no input files, fall back to an empty frame that has
# the expected columns.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['speech_id', 'speech'])

# Convert the speaker-map ids to strings so they compare equal to the string
# ids of the speech table when joining.
speaker_map['speech_id'] = speaker_map['speech_id'].map(str)

# Default (inner) join: only speeches present in both tables are kept.
df = speaker_map.merge(df, on='speech_id')
del speaker_map

# Keep only the rows whose party code is 'R' or 'D'.
party_mask = df['party'].isin(['R', 'D'])
df = df[party_mask]

# Positional rename of all eleven columns; the last one (the speech text)
# becomes 'doc'.
df.columns = [
    'speakerid', 'speech_id', 'lastname', 'firstname', 'chamber', 'state',
    'gender', 'party', 'district', 'nonvoting', 'doc',
]

print('Processing the text data...')

# text_processor is imported from the utils module after ../gtm/ has been
# added to the import path.
import sys
sys.path.append('../gtm/')
from utils import text_processor

# Keep verbs, nouns, proper nouns and adjectives.
# The coarse-grained (Universal POS) tag for proper nouns is 'PROPN'; the
# previous value 'PNOUN' is not part of that tag set, so it could never match.
# NOTE(review): assumes text_processor compares these values against spaCy's
# token.pos_ -- confirm in ../gtm/utils.py.
p = text_processor(
    'en_core_web_sm',
    pos_tags_to_keep=['VERB', 'NOUN', 'PROPN', 'ADJ'],
)

# Clean every speech in batches of 10; the cleaned documents are also written
# to a temporary CSV at output_path.
df['doc_clean'] = p.process_docs(
    df['doc'],
    batch_size=10,
    output_path='../data/us_congressional_record/temp_us_congress_{}_clean.csv'.format(dataset),
)

# Reload the cleaned documents from the temporary CSV and attach them to the
# frame by position.
clean_path = '../data/us_congressional_record/temp_us_congress_{}_clean.csv'.format(dataset)
cleaned_docs = pd.read_csv(clean_path)['doc_clean'].to_list()
df['doc_clean'] = cleaned_docs

# Drop speeches of less than 20 preprocessed tokens. Values are passed through
# str() first, so a missing document counts as the single token 'nan'.
keep_positions = []
for position, doc in enumerate(cleaned_docs):
    if len(str(doc).split()) >= 20:
        keep_positions.append(position)

df = df.iloc[keep_positions].reset_index(drop=True)

df.to_csv('../data/us_congressional_record/us_congress_speeches_{}_processed.csv'.format(dataset))