In [None]:
import datetime
import glob
import os
import re
import zipfile
from string import Template

import pandas as pd
import requests
from autocorrect import Speller

In [None]:
def download_dataset(url, target_path="data", keep_download=True, overwrite_download=False):
    """Download a zip archive from a url and extract it into target_path.

    url: string, location of the archive; its last path component becomes the local file name
    target_path: string, directory where the archive is stored and extracted (created if missing)
    keep_download: boolean, keeps the archive after a fresh download has been extracted
    overwrite_download: boolean, downloads again even if the archive already exists locally;
        when False an existing archive is reused and only extracted

    Raises ValueError when url is empty or None, and requests.HTTPError when the
    server answers with an error status.
    """
    if url is None or url == "":
        raise ValueError('URL is empty')

    filename = os.path.basename(url)
    file_location = os.path.join(target_path, filename)

    os.makedirs(target_path, exist_ok=True)

    if os.path.exists(file_location) and not overwrite_download:
        print(f"Archive already exists at {file_location}. Use: 'overwrite_download=True' to "
              "overwrite download")
        extract_file(target_path, filename)
        return

    print(f"Downloading file from {url} to {file_location}.")
    # Stream into a temporary ".part" file and move it into place only after the
    # whole body has arrived. A failed or interrupted download must never leave a
    # broken archive at file_location, because the next run would reuse it.
    partial_location = file_location + ".part"
    try:
        with requests.get(url, allow_redirects=True, stream=True) as resp:
            # Fail on 4xx/5xx instead of saving the error page as a zip file
            resp.raise_for_status()
            with open(partial_location, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=1024 * 1024):  # chunk_size in bytes
                    if chunk:
                        f.write(chunk)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete file, then re-raise
        if os.path.exists(partial_location):
            os.remove(partial_location)
        raise
    os.replace(partial_location, file_location)

    print("Finished downloading.")
    extract_file(target_path, filename)

    if not keep_download:
        os.remove(file_location)


In [None]:
def extract_file(target_path, filename):
    """Extract a zip archive into target_path unless it was extracted before.

    The archive counts as already extracted when the archive location without
    its extension (e.g. "data/hein-daily" for "data/hein-daily.zip") exists.

    target_path: string, location where data will be extracted
    filename: string, name of the archive file inside target_path
    """
    file_location = os.path.join(target_path, filename)
    extracted_location = os.path.splitext(file_location)[0]

    if os.path.exists(extracted_location):
        print(f"Extracted folder already exists at {extracted_location}")
        return

    print("Extracting the file now ...")
    # The context manager closes the archive even if extraction fails part-way
    with zipfile.ZipFile(file_location, 'r') as zipf:
        zipf.extractall(target_path)
    print(f'Successfully extracted file {file_location}')

In [None]:
def _read_speeches(speech_file):
    """Read a pipe-separated speech file and return (speech_ids, speeches).

    The first line is treated as a header. A line whose first field is numeric
    starts a new speech; any other line is appended to the previous speech.
    """
    speech_ids = []
    speeches = []
    with open(speech_file, 'r', encoding='unicode_escape', errors='backslashreplace') as file:
        # Skip the header
        file.readline()
        for raw_line in file:
            line = raw_line.rstrip()
            # A blank line is not the end of the file: skip it and keep reading
            if not line:
                continue

            # Remove redundant characters
            line = re.sub(r"[^A-Za-z0-9 '\"|$.,-]+", ' ', line)

            line_parts = line.split('|')
            if line_parts[0].isdigit():
                speech_ids.append(int(line_parts[0]))
                speeches.append(' '.join(line_parts[1:]))
            elif speeches:
                # Continuation of the previous speech. The space keeps the words on
                # both sides of the line break apart; repeated spaces are collapsed
                # later during cleaning.
                speeches[-1] += ' ' + line
            # A continuation line before the first record has no speech to join
            # and is dropped.
    return speech_ids, speeches


def _clean_speech(speech):
    """Normalise punctuation, spacing and capitalisation of a single speech."""
    # Replace full stops with commas where applicable. Matches cannot overlap, so
    # the substitution is repeated until the text stops changing.
    output = re.sub(r'(\b[A-Za-z]+) *\. *([a-z]+)', lambda m: f'{m.group(1)}, {m.group(2)}', speech)
    while speech != output:
        speech = output
        output = re.sub(r'(\b[a-z]+) *\. *([a-z]+)', lambda m: f'{m.group(1)}, {m.group(2)}', speech)

    # Remove frequent phrases
    speech = re.sub(r"(([,.]+ *)|^)Madam Clerk[.,]*", '', speech, flags=re.IGNORECASE)
    speech = re.sub(r"(([,.]+ *)|^)Mr[.,]* Clerk[.,]*", '', speech, flags=re.IGNORECASE)
    speech = re.sub(r"(([,.]+ *)|^)Mr[.,]* President[.,]*", '', speech, flags=re.IGNORECASE)
    speech = re.sub(r"(([,.]+ *)|^)Mr[.,]* Speaker[.,]*", '', speech, flags=re.IGNORECASE)
    speech = re.sub(r"(([,.]+ *)|^)Madam Speaker[.,]*", '', speech, flags=re.IGNORECASE)

    # Remove adjacent spaces
    speech = re.sub(r' +', ' ', speech).strip()

    # Separate sentences with a single space and capitalise the word that follows
    output = re.sub(r'(\b[A-Za-z]+) *([.?!]) *([A-Za-z]+)',
                    lambda m: f'{m.group(1)}{m.group(2)} {m.group(3).capitalize()}', speech)
    while speech != output:
        speech = output
        output = re.sub(r'(\b[A-Za-z]+) *([.?!]) *([A-Za-z]+)',
                        lambda m: f'{m.group(1)}{m.group(2)} {m.group(3).capitalize()}', speech)

    # Capitalise the very first word
    return re.sub(r'^[a-z]+', lambda m: m.group().capitalize(), speech)


def process_dataset(dataset_name, data_directory="data", output_directory="processed", deduplicate=False,
                    min_speech_length=20, min_session=106):
    """Clean the speeches of an extracted dataset and write one CSV file per session.

    For every file named speeches_<session>.txt in data_directory/dataset_name the
    speeches are cleaned, spell-corrected and joined on speech_id with
    <session>_SpeakerMap.txt and descr_<session>.txt. The result is written to
    data_directory/output_directory/<dataset_name>-<session>.csv.

    dataset_name: string, name of the folder that holds the extracted dataset
    data_directory: string, directory that contains the dataset folder
    output_directory: string, sub-directory of data_directory for the CSV files
    deduplicate: boolean, drops every speech whose text occurs more than once
    min_speech_length: int, minimum number of whitespace-separated words in a speech
    min_session: int, sessions with a lower number are skipped (the default of 106
        is the cut-off the original code describes as "sessions before 2000")
    """
    # Create an output directory
    output_path = os.path.join(data_directory, output_directory)
    os.makedirs(output_path, exist_ok=True)

    # Speech files; the session number is captured from the file name itself so
    # that digits elsewhere in the path cannot be mistaken for it
    speech_file_pattern = re.compile(r'speeches_([0-9]+)\.txt')

    # Get all txt files in the dataset directory, in a deterministic order
    dataset_directory = os.path.join(data_directory, dataset_name)
    files = sorted(glob.glob(os.path.join(dataset_directory, '*.txt')))

    # Built on first use so that nothing is loaded when no file needs processing
    speller = None

    # Process all speech files
    for speech_file in files:
        name_match = speech_file_pattern.fullmatch(os.path.basename(speech_file))
        if name_match is None:
            continue

        print(f"Processing file {speech_file}...")

        # Get a session number
        session = name_match.group(1)

        # Filter out all sessions below the cut-off (compared as numbers, not text)
        if int(session) < min_session:
            continue

        # Get all speeches from the speech file
        speech_ids, speeches = _read_speeches(speech_file)

        if speller is None:
            speller = Speller(only_replacements=True, fast=True)
        speeches = [speller(_clean_speech(text)) for text in speeches]

        speech = pd.DataFrame({"speech_id": speech_ids, "speech": speeches})

        # Get a corresponding speaker map
        speaker_map = pd.read_csv(os.path.join(dataset_directory, f'{session}_SpeakerMap.txt'), sep='|')

        # Get a corresponding description. The date column is read as text and
        # converted explicitly because read_csv's date_parser argument is
        # deprecated since pandas 2.0.
        description = pd.read_csv(os.path.join(dataset_directory, f'descr_{session}.txt'), sep='|',
                                  dtype={'date': str})
        description['date'] = pd.to_datetime(description['date'], format='%Y%m%d')

        joined = pd.merge(speech, speaker_map, on='speech_id')
        joined = pd.merge(joined, description[['speech_id', 'date']], on='speech_id')

        dataset = joined[['speech', 'date', 'party', 'lastname', 'firstname', 'state', 'speakerid']].reset_index(
            drop=True)

        # Drop the first three characters of the speaker id
        dataset['speakerid'] = dataset['speakerid'].apply(lambda x: str(x)[3:])

        dataset = dataset[dataset['party'].isin(['R', 'D'])]

        # astype(bool) keeps the mask boolean when the frame is empty; a non-boolean
        # empty mask would be interpreted as a column selection
        long_enough = dataset['speech'].apply(lambda x: len(x.split()) >= min_speech_length).astype(bool)
        dataset = dataset[long_enough]

        if deduplicate:
            dataset = dataset.drop_duplicates(subset=['speech'], keep=False)

        output_file = f'{dataset_name}-{session}.csv'
        dataset.to_csv(os.path.join(output_path, output_file), index=False)

        print(f"Processed file {output_file} was saved to {output_path}")


In [None]:
# Download locations of the two zip archives this notebook can work with.
# The archive name without ".zip" ("hein-bound" / "hein-daily") is the
# dataset_name that process_dataset expects after extraction.
HEIN_BOUND_URL = "https://stacks.stanford.edu/file/druid:md374tz9962/hein-bound.zip"
HEIN_DAILY_URL = "https://stacks.stanford.edu/file/druid:md374tz9962/hein-daily.zip"

In [None]:
# Download the hein-daily archive into the default "data" directory and extract it.
# The hein-bound archive is fetched the same way; the call is left disabled:
#download_dataset(HEIN_BOUND_URL)
download_dataset(HEIN_DAILY_URL)

In [None]:
# Process the extracted hein-daily dataset: drop speeches whose text occurs more
# than once and keep only speeches with at least 20 words.
process_dataset('hein-daily', deduplicate=True, min_speech_length=20)
# The hein-bound dataset can be processed with the default settings; left disabled:
#process_dataset('hein-bound')