In [17]:
import numpy as np
import pandas as pd
import pickle

### load lemmatized speeches

In [18]:
# Read the pickled dataframe of lemmatized speeches.
# Unpickling runs arbitrary code from the file, so only load pickles from a trusted source.
lemmatized_path = 'hansard-speeches-post2010-lemmatized.pkl'
with open(lemmatized_path, mode='rb') as pkl_file:
    df = pickle.load(pkl_file)

### replace missing (NaN) heading entries with ''

In [19]:
# Fill missing values in each heading column with an empty string.
for heading_col in ('major_heading', 'minor_heading'):
    df[heading_col] = df[heading_col].fillna('')

### combine major and minor headings

In [20]:
# Trim surrounding whitespace from both heading columns.
for heading_col in ('major_heading', 'minor_heading'):
    df[heading_col] = df[heading_col].str.strip()

# "<major> <minor>", stripped again so that an empty major or minor heading
# does not leave a leading/trailing space.
df['heading'] = (df['major_heading'] + ' ' + df['minor_heading']).str.strip()

### aggregate entries

Rows with identical `date`, `heading`, and `display_as` values are combined into a single entry: speeches are joined with spaces, and lemma lists are concatenated.

In [21]:
# One output row per (date, heading, display_as) combination.
group_keys = ['date', 'heading', 'display_as']
aggregations = {
    # speech fragments of a group are joined with single spaces
    'speech': lambda parts: " ".join(parts),
    # 'sum' adds the group's lemma lists together, i.e. concatenates them
    'lemmas': 'sum',
}
df_agg = df.groupby(group_keys).agg(aggregations)

In [22]:
# Persist the aggregated dataframe to disk.
agg_path = 'hansard-speeches-post2010-lemmatized-agg.pkl'
with open(agg_path, mode='wb') as pkl_file:
    pickle.dump(df_agg, pkl_file)

### make n_grams

In [23]:
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

# Highest n for which a phrase model is trained (one model per n from 2 upwards).
max_n_gram = 6

# Settings shared by every pass.
phrase_kwargs = dict(
    min_count=300,   # words and phrases must occur at least 300 times
    # NOTE(review): the earlier comment said manual testing found 25 to be a
    # conservative balance, but the value used here is 30 - confirm which is intended.
    threshold=30,    # minimum Phrases score for two tokens to be joined
    connector_words=ENGLISH_CONNECTOR_WORDS,
)

# Trained models keyed by n.
ngram_models = {}

# Start from the aggregated lemma lists; each pass replaces `texts` with the
# output of the model trained on the previous pass.
# NOTE(review): every pass joins adjacent tokens, including tokens already joined
# by an earlier pass, so phrases longer than n words look possible - confirm.
texts = df_agg['lemmas'].values

for n in np.arange(2, max_n_gram + 1):
    phrase_model = Phrases(texts, **phrase_kwargs)
    ngram_models[n] = phrase_model
    texts = [phrase_model[doc] for doc in texts]
    print(f'done {n}-gram model')

done 2-gram model
done 3-gram model
done 4-gram model
done 5-gram model
done 6-gram model


In [24]:
# Swap the raw lemma lists for their n-gram-joined versions.
df_agg = df_agg.drop(columns=['lemmas']).assign(lemmas_ngrams=texts)

In [25]:
# phrases = ngram_models[6].export_phrases()
# sorted_phrases_asc = dict(sorted(phrases.items(), key=lambda item: item[1]))
# sorted_phrases_desc = dict(sorted(phrases.items(), key=lambda item: item[1], reverse=True))

## remove lemmas present in >20% of speeches

In [51]:
from sklearn.feature_extraction.text import CountVectorizer


def dummy(x):
    """Return the input unchanged.

    The texts are already lists of lemmas, so this is passed to CountVectorizer
    as both preprocessor and tokenizer to bypass its own string handling.
    """
    return x


# The vectorizer is fitted only to find out which terms exceed max_df.
cv = CountVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,  # not used when a tokenizer is supplied
    max_df=0.20,         # ignore terms found in more than 20% of entries
)

texts = df_agg.lemmas_ngrams.values
cv.fit(texts)
print("fit cv")

# stop_words_ holds the terms the fitted vectorizer ignored; with only max_df
# set, these are the terms that occur in too many entries. Empty and
# single-space tokens are removed as well.
stopwords = list(cv.stop_words_) + ['', ' ']

# Membership is tested once per token across the whole corpus, so filter
# against a set (constant-time lookup) rather than scanning the list each time.
# `stopwords` itself stays a list for display.
stopword_set = set(stopwords)
texts = [[w for w in text if w not in stopword_set] for text in texts]
print('found stopwords and removed from texts (lists of lemmas)')

fit cv
found stopwords and removed from texts (lists of lemmas)


In [52]:
# display the terms removed from the texts: the vectorizer's stop words plus '' and ' '
stopwords

['right',
 'say',
 'point',
 'come',
 'country',
 'friend',
 'hon',
 'people',
 'give',
 'support',
 'agree',
 'government',
 'time',
 'need',
 'take',
 'way',
 'issue',
 'secretary_state',
 'good',
 'work',
 'member',
 'year',
 'minister',
 'know',
 'want',
 'house',
 '',
 ' ']

* In Parliament, the convention is not to refer to other MPs by name or with second-person pronouns. This leads to stopwords like 'right', 'hon', 'member', 'friend' (e.g. "does my right hon. friend agree that..." vs "Theresa May, do you agree that...") and 'minister' (e.g. "the minister for vaccines").
* Others relate to typical non-topical political rhetoric, e.g. 'government', 'people', 'need', 'work'.
* Others relate to parliamentary procedure, e.g. 'time', 'house', 'year'.

In [53]:
# separate dataframe for stopword removal - quicker testing due to time of n-grams creation
df_agg_sw = df_agg.copy()
df_agg_sw['lemmas_ngrams'] = texts

### remove speeches with fewer than 40 lemmas

In [55]:
# no. entries in df
total_l = len(df_agg_sw)

# minimum no. lemmas for passed speeches
t = 40

# filter
# again, separate dataframe for quicker testing at memory expense
df_agg_t = df_agg_sw[df_agg_sw.lemmas_ngrams.apply(len) >= t]

# proportion entries retained
len(df_agg_t) / total_l

0.3924887659315881

In [56]:
# inspect the filtered dataframe (entries with at least t lemmas)
df_agg_t

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,speech,lemmas_ngrams
date,heading,display_as,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-05-25,DEPUTY SPEAKERS Debate on the Address — [1st Day],Alan Beith,I usually enjoy and can be quite entertained b...,"[usually, enjoy, entertain, speech, manchester..."
2010-05-25,DEPUTY SPEAKERS Debate on the Address — [1st Day],Andrew Miller,"On a point of order, Mr. Speaker.It might help...","[order, speaker, help, particularly, seek, cat..."
2010-05-25,DEPUTY SPEAKERS Debate on the Address — [1st Day],Andrew Selous,I want to pick the right hon. Gentleman up on ...,"[pick, gentleman, different, talk, health, spe..."
2010-05-25,DEPUTY SPEAKERS Debate on the Address — [1st Day],Anne Begg,I pay tribute to the new hon. Member for Watfo...,"[pay_tribute, new, watford, richard, harringto..."
2010-05-25,DEPUTY SPEAKERS Debate on the Address — [1st Day],Charles Walker,"Mr Deputy Speaker, thank you for calling me to...","[mr_deputy_speaker, thank, call, speak, day, q..."
...,...,...,...,...
2019-11-05,Valedictory Debate,Seema Kennedy,"On a point of order, Madam Deputy Speaker. As ...","[order, madam_deputy_speaker, order, seek, gui..."
2019-11-05,Valedictory Debate,Stephen Pound,"May I begin by apologising, Madam Deputy Speak...","[begin, apologise, madam_deputy_speaker, north..."
2019-11-05,Valedictory Debate,Stephen Twigg,It is a pleasure to follow the hon. Member for...,"[pleasure_follow, north, devon, peter, heaton,..."
2019-11-05,Valedictory Debate,Teresa Pearce,"I would like to thank my fantastic family, my ...","[like, thank, fantastic, family, staff, amazin..."


In [57]:
# Persist the filtered, n-gram-joined dataframe to disk.
ngrams_path = 'hansard-speeches-post2010-lemmatized-agg-ngrams.pkl'
with open(ngrams_path, mode='wb') as pkl_file:
    pickle.dump(df_agg_t, pkl_file)