In [1]:
import os
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import pickle

# Download & Load Data

Evan Odell Harvard Speeches Data

https://evanodell.com/projects/datasets/hansard-data/

In [2]:
repo_url = 'https://zenodo.org/record/4843485/files/'
repo_file_names = [
        'hansard-speeches-v310.csv.zip',
        'parliamentary_posts.json'
]

In [3]:
# run once only
for fn in repo_file_names:
    os.system(f'wget {repo_url + fn}')

In [3]:
# load compressed csv to dataframe
df = pd.read_csv(repo_file_names[0],
                 dtype=str,
                 parse_dates=['date'])

# Filter and Clean

In [4]:
# filter date range
date_from = pd.to_datetime('2010-05-25') # start of coalition government 2010-15
date_to = pd.to_datetime('2019-11-06') # end of Conservative government preceding 2019 GE

df_all = df.copy()
df = df[(date_from <= df.date) & (df.date <= date_to)]

In [5]:
# filter speech class - keep only actual speeches (not procedural/division Hansard entries)
df = df[df.speech_class=='Speech']

# remove unneeded columns
drop_cols = ['year', 'id', 'speakername', 'speech_class', 'hansard_membership_id', 'colnum', 'time', 'person_id', 'speakerid', 'url', 'oral_heading']
df.drop(labels=drop_cols, axis=1, inplace=True)

In [6]:
df[df.mnis_id.isna()].display_as.unique()

array(['Several Hon. Members', 'The Queen',
       'My Lords and Members of the House of Commons',
       'Members of the House of Commons', 'An Hon. Member'], dtype=object)

In [7]:
# remove rows with nan mnis_id
# each MP has an MNIS ID - all entries without one are spoken by non-individual MPs (e.g. 'Several Hon. Members', 'The Queen')
df.dropna(axis=0, subset=['mnis_id'], inplace=True)

In [8]:
# remove speeches from the Speaker - these are non-topical
df = df[df.party != 'Speaker']

In [9]:
# replace newline and tab characters with space
df = df.replace(r'\\n',' ', regex=True)
df = df.replace(r'\\t',' ', regex=True)

# replace full stop with full stop and space
# fixes transcription errors: some sentences don't have spaces after full stop at end
df = df.replace('.','. ', regex=False)

# replace multiple space characters with single space
# fix double spaces created by previous steps
# and pre-existing double spaces
df = df.replace('\s+',' ', regex=True)

In [10]:
# "hon." confuses sentencizer - leads to false positive sentence endings
df['speech'] = df['speech'].str.replace('hon.', 'hon')

# remove all text between square brackets (including brackets)
# text between square brackets is not spoken
df['speech'] = df['speech'].str.replace(r"\[.*\]","")

  df['speech'] = df['speech'].str.replace('hon.', 'hon')
  df['speech'] = df['speech'].str.replace(r"\[.*\]","")


In [11]:
df

Unnamed: 0,speech,display_as,party,constituency,mnis_id,date,major_heading,minor_heading
1958082,"I beg to move,That an humble Address be presen...",Peter Lilley,Conservative,Hitchin and Harpenden,68,2010-05-25,DEPUTY SPEAKERS,Debate on the Address — [1st Day]
1958083,I am delighted to follow the right hon Member ...,Donald Foster,Liberal Democrat,Bath,214,2010-05-25,DEPUTY SPEAKERS,Debate on the Address — [1st Day]
1958084,I am sure that the whole House will join me in...,Harriet Harman,Labour,Camberwell and Peckham,150,2010-05-25,DEPUTY SPEAKERS,Debate on the Address — [1st Day]
1958085,"Before I go on to the tributes, I welcome the ...",David Cameron,Conservative,Witney,1467,2010-05-25,DEPUTY SPEAKERS,Debate on the Address — [1st Day]
1958086,On the issue of fairness and responsibility an...,David Blunkett,Labour,"Sheffield, Brightside and Hillsborough",395,2010-05-25,DEPUTY SPEAKERS,Debate on the Address — [1st Day]
...,...,...,...,...,...,...,...,...
2615300,I congratulate the hon Gentleman on securing t...,James Cartlidge,Conservative,South Suffolk,4519,2019-11-05,Special Educational Needs and Disabilities (S...,
2615301,I thank the hon Gentleman and agree that under...,Sandy Martin,Labour,Ipswich,4678,2019-11-05,Special Educational Needs and Disabilities (S...,
2615302,"Congratulations on your new position, Mr Speak...",Michelle Donelan,Conservative,Chippenham,4530,2019-11-05,Special Educational Needs and Disabilities (S...,
2615303,My hon Friend is right to be open and clear ab...,James Cartlidge,Conservative,South Suffolk,4519,2019-11-05,Special Educational Needs and Disabilities (S...,


In [12]:
MP_info_cols = ['display_as', 'party', 'constituency', 'mnis_id']

MP_info = df[MP_info_cols].drop_duplicates(subset='mnis_id').set_index('mnis_id')

In [14]:
df = df.drop(columns = ['display_as', 'party', 'constituency'])

In [16]:
with open('speeches.pkl', 'wb') as f:
    pickle.dump(df, f)
    
with open('MP_info.pkl', 'wb') as f:
    pickle.dump(MP_info, f)