In [9]:
import os
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import pickle

Download Evan Odell's Hansard Speeches Data

https://evanodell.com/projects/datasets/hansard-data/

In [10]:
# File names that are appended to the Zenodo record's base URL
repo_file_names = ['hansard-speeches-v310.csv.zip', 'parliamentary_posts.json']
repo_url = 'https://zenodo.org/record/4843485/files/'

In [11]:
# One-off download of the dataset files into the current working directory.
# Deliberately left commented out so that re-running the notebook does not
# fetch the files again; uncomment and run once only (requires the `wget`
# command-line tool). The load cell below expects the first file to be present.
# for fn in repo_file_names:
#     os.system(f'wget {repo_url + fn}')

In [12]:
# Read the zipped speeches CSV into a dataframe: every column is loaded as a
# string, and the `date` column is then parsed into datetimes
df = pd.read_csv(
    filepath_or_buffer=repo_file_names[0],
    parse_dates=['date'],
    dtype=str,
)

In [13]:
# Bounds of the period of interest (both ends inclusive)
date_from = pd.to_datetime('2010-05-25')  # start of coalition government 2010-15
date_to = pd.to_datetime('2019-11-06')  # end of Conservative government preceding 2019 GE

# Keep a copy of the unfiltered data, then narrow `df` to the date window
df_all = df.copy()
df = df.loc[df['date'].between(date_from, date_to)]

In [14]:
# Keep only actual speeches (not procedural/division Hansard entries).
df = df[df.speech_class == 'Speech']

# Remove columns that are not needed.
drop_cols = ['year', 'id', 'speakername', 'speech_class', 'hansard_membership_id', 'colnum', 'time', 'person_id', 'speakerid', 'url', 'oral_heading']
# Return a new frame instead of dropping in place: at this point `df` is a
# filtered selection of a larger frame, and mutating such a selection in place
# can trigger pandas' SettingWithCopyWarning.
df = df.drop(columns=drop_cols)

In [15]:
# Replace literal escape sequences left in the text -- a backslash followed by
# "n" or "t" -- with a space. (Real newline and tab characters count as
# whitespace and are handled by the collapse step below.)
df = df.replace(r'\\[nt]', ' ', regex=True)

# Collapse every run of whitespace into a single space. The pattern has to be
# a raw string: '\s' inside a plain string literal is an invalid escape
# sequence and produces a warning on recent Python versions.
df = df.replace(r'\s+', ' ', regex=True)

In [16]:
# Persist the cleaned dataframe to disk as a pickle file
with open('hansard-speeches-post2010.pkl', mode='wb') as pkl_file:
    pickle.dump(df, pkl_file)