In [1]:
import os
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import pickle

# Download & Load Data

Evan Odell Harvard Speeches Data

https://evanodell.com/projects/datasets/hansard-data/

In [2]:
repo_url = 'https://zenodo.org/record/4843485/files/'
repo_file_names = [
        'hansard-speeches-v310.csv.zip',
        'parliamentary_posts.json'
]

In [3]:
# # run once only
# for fn in repo_file_names:
#     os.system(f'wget {repo_url + fn}')

In [4]:
# load compressed csv to dataframe
df = pd.read_csv(repo_file_names[0],
                 dtype=str,
                 parse_dates=['date'])

# Filter and Clean

In [5]:
# filter date range
date_from = pd.to_datetime('2010-05-25') # start of coalition government 2010-15
date_to = pd.to_datetime('2019-11-06') # end of Conservative government preceding 2019 GE

df_all = df.copy()
df = df[(date_from <= df.date) & (df.date <= date_to)]

In [6]:
# filter speech class - keep only actual speeches (not procedural/division Hansard entries)
df = df[df.speech_class=='Speech']

# remove unneeded columns
drop_cols = ['year', 'id', 'speakername', 'speech_class', 'hansard_membership_id', 'colnum', 'time', 'person_id', 'speakerid', 'url', 'oral_heading']
df.drop(labels=drop_cols, axis=1, inplace=True)

In [7]:
df[df.mnis_id.isna()].display_as.unique()

array(['Several Hon. Members', 'The Queen',
       'My Lords and Members of the House of Commons',
       'Members of the House of Commons', 'An Hon. Member'], dtype=object)

In [8]:
# remove rows with nan mnis_id
# each MP has an MNIS ID - all entries without one are spoken by non-individual MPs (e.g. 'Several Hon. Members', 'The Queen')
df.dropna(axis=0, subset=['mnis_id'], inplace=True)

In [9]:
# replace newline and tab characters with space
df = df.replace(r'\\n',' ', regex=True)
df = df.replace(r'\\t',' ', regex=True)

# replace multiple space characters with single space
df = df.replace('\s+',' ', regex=True)

In [10]:
# "hon." confuses sentencizer - leads to false positive sentence endings
df['speech'] = df['speech'].str.replace('hon.', 'hon')

  df['speech'] = df['speech'].str.replace('hon.', 'hon')


In [11]:
# text between square brackets is not spoken
df['speech'] = df['speech'].str.replace(r"\[.*\]","")

  df['speech'] = df['speech'].str.replace(r"\[.*\]","")


In [12]:
with open('hansard-speeches-post2010.pkl', 'wb') as f:
    pickle.dump(df, f)