In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report, plot_roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
import re
import time
from time import sleep
from random import randint

In [2]:
dates = pd.read_csv('all_scraped_debate_names.csv')

In [3]:
dates.reset_index(drop = True, inplace = True)

In [4]:
dates = dates[['debates', 'sizes']]

In [5]:
dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17448 entries, 0 to 17447
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   debates  17448 non-null  object 
 1   sizes    17448 non-null  float64
dtypes: float64(1), object(1)
memory usage: 272.8+ KB


In [6]:
dates.head()

Unnamed: 0,debates,sizes
0,debates1919-02-05a.xml,227.0
1,debates1919-02-06a.xml,37.0
2,debates1919-02-10a.xml,99.0
3,debates1919-02-11a.xml,36.0
4,debates1919-02-13a.xml,24.0


In [7]:
dates.debates = dates.debates.str.replace('debates', '')

In [8]:
dates['debates'] = dates.debates.str.replace('.xml', '')

In [9]:
dates

Unnamed: 0,debates,sizes
0,1919-02-05a,227.0
1,1919-02-06a,37.0
2,1919-02-10a,99.0
3,1919-02-11a,36.0
4,1919-02-13a,24.0
...,...,...
17443,2020-07-21c,137.0
17444,2020-07-21d,82.0
17445,2020-07-22a,80.0
17446,2020-07-22b,616.0


In [10]:
yr_2020 = dates[(dates.debates.str.contains('2020')) & (dates.sizes >200)]
yr_2020.shape

(94, 2)

In [24]:
yr_2019 = dates[(dates.debates.str.contains('2019')) & (dates.sizes >200)]
yr_2019.shape

(140, 2)

In [25]:
yr_2018 = dates[(dates.debates.str.contains('2018')) & (dates.sizes >200)]
yr_2018.shape

(154, 2)

In [26]:
yr_2017 = dates[(dates.debates.str.contains('2017')) & (dates.sizes >200)]
yr_2017.shape

(129, 2)

In [27]:
yr_2016 = dates[(dates.debates.str.contains('2016')) & (dates.sizes >200)]
yr_2016.shape

(165, 2)

In [37]:

t = time.process_time()
speeches2020df = pd.DataFrame()
headings2020df = pd.DataFrame()
requestsno = 0


for date in yr_2020.debates:

    URL2 = f'https://www.theyworkforyou.com/pwdata/scrapedxml/debates/debates{date}.xml'

    r2 = requests.get(URL2)

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    requestsno += 1
    #do some stuff
    elapsed_time = time.process_time() - t
    print('Request:{}; Frequency: {} requests/s'.format(requestsno, requestsno/elapsed_time))



    soup2 = BeautifulSoup(r2.text, 'html.parser')


    headings = []
    heading_id = []
    head_debate = []

    # this adds all of the major headings to a list along with their place in the schedule
    for heading in soup2.find_all('major-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
    # this adds all of the minor headings to a list along with their place in the schedule
    for heading in soup2.find_all('minor-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
        try:
            head_debate.append(date)
        except:
            head_debate.append(np.nan)
    print('Request:{}; Time Elapsed: {} s'.format(requestsno, elapsed_time))


    # this splits the headings into the topic and changes the id to numeric
    heading_time = []
    for hi in heading_id:
        heading_time.append(hi.split(f'{date}.')[1])
    topics = pd.DataFrame(list(zip(headings, heading_time, head_debate)), columns = ['heading', 'id', 'debate_date'])
    topics.id = pd.to_numeric(topics.id)

    chronological_topics = topics.sort_values(by = 'id', ascending = True, ignore_index = True)



    #Scraping the speaker names, speeches and details

    speakers = []
    speeches = []
    speech_id = []
    person_id = []
    time_of_speech = []
    debate_id = []


    for speech in soup2.find_all('speech'):
        try:
            speakers.append(speech['speakername'])
        except:
            speakers.append(np.nan)
        try:
            speeches.append(speech.text.strip())
        except:
            speeches.append(np.nan)
        try:
            speech_id.append(speech['id'])
        except:
            speech_id.append(np.nan)
        try:
            person_id.append(speech['person_id'])
        except:
            person_id.append(np.nan)
        try:
            time_of_speech.append(speech['time'])
        except:
            time_of_speech.append(np.nan)
        try:
            debate_id.append(date)
        except:
            debate_id.append(np.nan)



    df = pd.DataFrame(list(zip(speakers, speeches, speech_id, person_id, time_of_speech, debate_id)), 
                      columns = ['speakers', 'speeches', 'speech_id', 'person_id', 'time_of_speech', 'debate_id'])

    #Pulling out the paragraph id of the speech, where it fits in the order of the day
    df.speech_id = df.speech_id.str.replace(f'uk.org.publicwhip/debate/{date}.', '')
    df.speech_id =  pd.to_numeric(df.speech_id)



    #bins = chronological_topics.id.values
    #bins = bins.tolist() + [100000]
    #labels = chronological_topics.heading.values

    #df['topic'] = pd.cut(df.speech_id, bins = bins, labels = labels, duplicates = 'drop')

    # Adding this debates speech data to the final dataframe
    speeches2020df = speeches2020df.append(df, ignore_index = True)

    #Adding this debates topics to the topics dataframe
    headings2020df = headings2020df.append(chronological_topics, ignore_index = True)

















Request:1; Frequency: 40.50879040751991 requests/s
Request:1; Time Elapsed: 0.024685999999999098 s
Request:2; Frequency: 5.488504327685696 requests/s
Request:2; Time Elapsed: 0.3643979999999978 s
Request:3; Frequency: 1.0642713466225355 requests/s
Request:3; Time Elapsed: 2.8188299999999984 s
Request:4; Frequency: 1.0254501341545141 requests/s
Request:4; Time Elapsed: 3.900725999999999 s
Request:5; Frequency: 1.255821675331418 requests/s
Request:5; Time Elapsed: 3.981456999999999 s
Request:6; Frequency: 1.0099194286279845 requests/s
Request:6; Time Elapsed: 5.941067999999998 s
Request:7; Frequency: 0.871121007045751 requests/s
Request:7; Time Elapsed: 8.035623000000001 s
Request:8; Frequency: 0.9644083878937332 requests/s
Request:8; Time Elapsed: 8.295241 s
Request:9; Frequency: 1.0416659432875395 requests/s
Request:9; Time Elapsed: 8.640006 s
Request:10; Frequency: 1.1012625203912525 requests/s
Request:10; Time Elapsed: 9.080486999999998 s
Request:11; Frequency: 0.9447934836047411 req

Request:88; Frequency: 0.8635366648492818 requests/s
Request:88; Time Elapsed: 101.90650099999999 s
Request:89; Frequency: 0.8723140592503543 requests/s
Request:89; Time Elapsed: 102.02747400000001 s
Request:90; Frequency: 0.8639751728094343 requests/s
Request:90; Time Elapsed: 104.16966 s
Request:91; Frequency: 0.8728737969329132 requests/s
Request:91; Time Elapsed: 104.25333 s
Request:92; Frequency: 0.8775592262098629 requests/s
Request:92; Time Elapsed: 104.836229 s
Request:93; Frequency: 0.8613012752676736 requests/s
Request:93; Time Elapsed: 107.976155 s
Request:94; Frequency: 0.867688962417631 requests/s
Request:94; Time Elapsed: 108.333751 s


In [38]:
speeches2020df.head()

Unnamed: 0,speakers,speeches,speech_id,person_id,time_of_speech,debate_id
0,Brandon Lewis,Let me finish the point. It takes five to 10 m...,321.6,uk.org.publicwhip/person/24879,19:15:00,2020-01-07a
1,Brandon Lewis,I will be brief; I just want to respond to a c...,336.0,uk.org.publicwhip/person/24879,20:15:00,2020-01-07a
2,Lindsay Hoyle,"On behalf of the whole House, I wish to expres...",235.1,uk.org.publicwhip/person/10295,,2020-01-07b
3,,The Chancellor of the Exchequer was asked—,235.4,,,2020-01-07b
4,Mark Logan,Whether he plans to increase the level of per ...,235.6,uk.org.publicwhip/person/25886,,2020-01-07b


In order to assign the correct topic (heading) to each speech I needed to use them as bin edges and match them to the speech id for each speech. There were non unique heading ids to I gave them a unique id

In [70]:
for i in headings2020df.index.tolist():


In [71]:
for i in range(len(headings2020df.id)):
    headings2020df.id += i/10000000
headings2020df

Unnamed: 0,heading,id,debate_date
0,Speaker’s Statement,235.034903,2020-01-07b
1,Treasury,235.334903,2020-01-07b
2,Per Pupil Funding,235.534903,2020-01-07b
3,Carbon Emissions,236.734903,2020-01-07b
4,Productivity,239.234903,2020-01-07b
...,...,...,...
831,"HoUSING, CoMMUNITIES AND LoCAL GoVERNMENT CoMM...",1964.434903,2020-07-20c
832,PRoCEDURE CoMMITTEE,1964.634903,2020-07-20c
833,WoMEN AND EQUALITIES CoMMITTEE,1964.834903,2020-07-20c
834,Endangered Species: Developing Countries,1965.034903,2020-07-20c


In [39]:

t = time.process_time()
speeches2019df = pd.DataFrame()
headings2019df = pd.DataFrame()
requestsno = 0


for date in yr_2019.debates:

    URL2 = f'https://www.theyworkforyou.com/pwdata/scrapedxml/debates/debates{date}.xml'

    r2 = requests.get(URL2)

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    requestsno += 1
    #do some stuff
    elapsed_time = time.process_time() - t
    print('Request:{}; Frequency: {} requests/s'.format(requestsno, requestsno/elapsed_time))



    soup2 = BeautifulSoup(r2.text, 'html.parser')


    headings = []
    heading_id = []
    head_debate = []

    # this adds all of the major headings to a list along with their place in the schedule
    for heading in soup2.find_all('major-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
    # this adds all of the minor headings to a list along with their place in the schedule
    for heading in soup2.find_all('minor-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
        try:
            head_debate.append(date)
        except:
            head_debate.append(np.nan)
    print('Request:{}; Time Elapsed: {} s'.format(requestsno, elapsed_time))


    # this splits the headings into the topic and changes the id to numeric
    heading_time = []
    for hi in heading_id:
        heading_time.append(hi.split(f'{date}.')[1])
    topics = pd.DataFrame(list(zip(headings, heading_time, head_debate)), columns = ['heading', 'id', 'debate_date'])
    topics.id = pd.to_numeric(topics.id)

    chronological_topics = topics.sort_values(by = 'id', ascending = True, ignore_index = True)



    #Scraping the speaker names, speeches and details

    speakers = []
    speeches = []
    speech_id = []
    person_id = []
    time_of_speech = []
    debate_id = []


    for speech in soup2.find_all('speech'):
        try:
            speakers.append(speech['speakername'])
        except:
            speakers.append(np.nan)
        try:
            speeches.append(speech.text.strip())
        except:
            speeches.append(np.nan)
        try:
            speech_id.append(speech['id'])
        except:
            speech_id.append(np.nan)
        try:
            person_id.append(speech['person_id'])
        except:
            person_id.append(np.nan)
        try:
            time_of_speech.append(speech['time'])
        except:
            time_of_speech.append(np.nan)
        try:
            debate_id.append(date)
        except:
            debate_id.append(np.nan)



    df = pd.DataFrame(list(zip(speakers, speeches, speech_id, person_id, time_of_speech, debate_id)), 
                      columns = ['speakers', 'speeches', 'speech_id', 'person_id', 'time_of_speech', 'debate_id'])

    #Pulling out the paragraph id of the speech, where it fits in the order of the day
    df.speech_id = df.speech_id.str.replace(f'uk.org.publicwhip/debate/{date}.', '')
    df.speech_id =  pd.to_numeric(df.speech_id)



    #bins = chronological_topics.id.values
    #bins = bins.tolist() + [100000]
    #labels = chronological_topics.heading.values

    #df['topic'] = pd.cut(df.speech_id, bins = bins, labels = labels, duplicates = 'drop')

    # Adding this debates speech data to the final dataframe
    speeches2019df = speeches2019df.append(df, ignore_index = True)

    #Adding this debates topics to the topics dataframe
    headings2019df = headings2019df.append(chronological_topics, ignore_index = True)

















Request:1; Frequency: 37.86014462572475 requests/s
Request:1; Time Elapsed: 0.026413000000019338 s
Request:2; Frequency: 17.305379377176855 requests/s
Request:2; Time Elapsed: 0.11557100000001697 s
Request:3; Frequency: 0.7842187991362064 requests/s
Request:3; Time Elapsed: 3.8254630000000134 s
Request:4; Frequency: 0.835986196195926 requests/s
Request:4; Time Elapsed: 4.784768000000014 s
Request:5; Frequency: 0.946470111325705 requests/s
Request:5; Time Elapsed: 5.282787000000013 s
Request:6; Frequency: 0.727335763038856 requests/s
Request:6; Time Elapsed: 8.249285000000015 s
Request:7; Frequency: 0.6535094061942796 requests/s
Request:7; Time Elapsed: 10.711399 s
Request:8; Frequency: 0.6657118694179538 requests/s
Request:8; Time Elapsed: 12.017211000000003 s
Request:9; Frequency: 0.6507337167263578 requests/s
Request:9; Time Elapsed: 13.830542000000008 s
Request:10; Frequency: 0.7063546133255916 requests/s
Request:10; Time Elapsed: 14.157195000000002 s
Request:11; Frequency: 0.742469

Request:86; Frequency: 0.8202637186007101 requests/s
Request:86; Time Elapsed: 104.84433000000001 s
Request:87; Frequency: 0.8267176441671564 requests/s
Request:87; Time Elapsed: 105.235446 s
Request:88; Frequency: 0.8204685343371068 requests/s
Request:88; Time Elapsed: 107.25578900000002 s
Request:89; Frequency: 0.815639201687541 requests/s
Request:89; Time Elapsed: 109.116874 s
Request:90; Frequency: 0.8242302520544649 requests/s
Request:90; Time Elapsed: 109.192789 s
Request:91; Frequency: 0.8272288916617319 requests/s
Request:91; Time Elapsed: 110.005829 s
Request:92; Frequency: 0.8226043300801089 requests/s
Request:92; Time Elapsed: 111.83991700000001 s
Request:93; Frequency: 0.8280993374074455 requests/s
Request:93; Time Elapsed: 112.30536700000002 s
Request:94; Frequency: 0.8259141567142765 requests/s
Request:94; Time Elapsed: 113.81328100000002 s
Request:95; Frequency: 0.8210129820646861 requests/s
Request:95; Time Elapsed: 115.71071600000002 s
Request:96; Frequency: 0.82770741

In [40]:
speeches2019df.head()

Unnamed: 0,speakers,speeches,speech_id,person_id,time_of_speech,debate_id
0,Alok Sharma,My hon. Friend has highlighted an important po...,8.5,uk.org.publicwhip/person/24902,,2019-01-07b
1,,The Secretary of State was asked—,147.2,,,2019-01-08b
2,Kevin Foster,What steps his Department has taken to support...,147.4,uk.org.publicwhip/person/25338,,2019-01-08b
3,Richard Harrington,"Happy new year to you, Mr Speaker, and to ever...",147.5,uk.org.publicwhip/person/24954,,2019-01-08b
4,Kevin Foster,"As my hon. Friend mentions, Torbay’s £8 millio...",147.6,uk.org.publicwhip/person/25338,,2019-01-08b


In [41]:

t = time.process_time()
speeches2018df = pd.DataFrame()
headings2018df = pd.DataFrame()
requestsno = 0


for date in yr_2018.debates:

    URL2 = f'https://www.theyworkforyou.com/pwdata/scrapedxml/debates/debates{date}.xml'

    r2 = requests.get(URL2)

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    requestsno += 1
    #do some stuff
    elapsed_time = time.process_time() - t
    print('Request:{}; Frequency: {} requests/s'.format(requestsno, requestsno/elapsed_time))



    soup2 = BeautifulSoup(r2.text, 'html.parser')


    headings = []
    heading_id = []
    head_debate = []

    # this adds all of the major headings to a list along with their place in the schedule
    for heading in soup2.find_all('major-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
    # this adds all of the minor headings to a list along with their place in the schedule
    for heading in soup2.find_all('minor-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
        try:
            head_debate.append(date)
        except:
            head_debate.append(np.nan)
    print('Request:{}; Time Elapsed: {} s'.format(requestsno, elapsed_time))


    # this splits the headings into the topic and changes the id to numeric
    heading_time = []
    for hi in heading_id:
        heading_time.append(hi.split(f'{date}.')[1])
    topics = pd.DataFrame(list(zip(headings, heading_time, head_debate)), columns = ['heading', 'id', 'debate_date'])
    topics.id = pd.to_numeric(topics.id)

    chronological_topics = topics.sort_values(by = 'id', ascending = True, ignore_index = True)



    #Scraping the speaker names, speeches and details

    speakers = []
    speeches = []
    speech_id = []
    person_id = []
    time_of_speech = []
    debate_id = []


    for speech in soup2.find_all('speech'):
        try:
            speakers.append(speech['speakername'])
        except:
            speakers.append(np.nan)
        try:
            speeches.append(speech.text.strip())
        except:
            speeches.append(np.nan)
        try:
            speech_id.append(speech['id'])
        except:
            speech_id.append(np.nan)
        try:
            person_id.append(speech['person_id'])
        except:
            person_id.append(np.nan)
        try:
            time_of_speech.append(speech['time'])
        except:
            time_of_speech.append(np.nan)
        try:
            debate_id.append(date)
        except:
            debate_id.append(np.nan)



    df = pd.DataFrame(list(zip(speakers, speeches, speech_id, person_id, time_of_speech, debate_id)), 
                      columns = ['speakers', 'speeches', 'speech_id', 'person_id', 'time_of_speech', 'debate_id'])

    #Pulling out the paragraph id of the speech, where it fits in the order of the day
    df.speech_id = df.speech_id.str.replace(f'uk.org.publicwhip/debate/{date}.', '')
    df.speech_id =  pd.to_numeric(df.speech_id)



    #bins = chronological_topics.id.values
    #bins = bins.tolist() + [100000]
    #labels = chronological_topics.heading.values

    #df['topic'] = pd.cut(df.speech_id, bins = bins, labels = labels, duplicates = 'drop')

    # Adding this debates speech data to the final dataframe
    speeches2018df = speeches2018df.append(df, ignore_index = True)

    #Adding this debates topics to the topics dataframe
    headings2018df = headings2018df.append(chronological_topics, ignore_index = True)

















Request:1; Frequency: 33.76325207640845 requests/s
Request:1; Time Elapsed: 0.029618000000027678 s
Request:2; Frequency: 19.236873238621673 requests/s
Request:2; Time Elapsed: 0.10396700000001147 s
Request:3; Frequency: 15.8430063688863 requests/s
Request:3; Time Elapsed: 0.189358000000027 s
Request:4; Frequency: 1.788875164241103 requests/s
Request:4; Time Elapsed: 2.2360419999999976 s
Request:5; Frequency: 1.5519380913271974 requests/s
Request:5; Time Elapsed: 3.221778000000029 s
Request:6; Frequency: 1.2169876877355637 requests/s
Request:6; Time Elapsed: 4.930205999999998 s
Request:7; Frequency: 0.8725021354489719 requests/s
Request:7; Time Elapsed: 8.022903000000042 s
Request:8; Frequency: 0.7998204403111489 requests/s
Request:8; Time Elapsed: 10.002245000000016 s
Request:9; Frequency: 0.7770924206377091 requests/s
Request:9; Time Elapsed: 11.581634000000008 s
Request:10; Frequency: 0.7276966965990125 requests/s
Request:10; Time Elapsed: 13.741989000000046 s
Request:11; Frequency: 

Request:85; Frequency: 0.7631742803012891 requests/s
Request:85; Time Elapsed: 111.37691900000004 s
Request:86; Frequency: 0.761725355760063 requests/s
Request:86; Time Elapsed: 112.90158500000001 s
Request:87; Frequency: 0.7550114929237977 requests/s
Request:87; Time Elapsed: 115.23003400000005 s
Request:88; Frequency: 0.7613103861233046 requests/s
Request:88; Time Elapsed: 115.59017400000005 s
Request:89; Frequency: 0.7694153291584279 requests/s
Request:89; Time Elapsed: 115.67224700000003 s
Request:90; Frequency: 0.7706219790173504 requests/s
Request:90; Time Elapsed: 116.78877900000003 s
Request:91; Frequency: 0.7659968251704332 requests/s
Request:91; Time Elapsed: 118.79944800000004 s
Request:92; Frequency: 0.7736077916633182 requests/s
Request:92; Time Elapsed: 118.92331100000001 s
Request:93; Frequency: 0.764496286097593 requests/s
Request:93; Time Elapsed: 121.64872700000001 s
Request:94; Frequency: 0.7701189079160055 requests/s
Request:94; Time Elapsed: 122.05907300000001 s
Re

In [42]:
speeches2018df.head()

Unnamed: 0,speakers,speeches,speech_id,person_id,time_of_speech,debate_id
0,Huw Merriman,One in five GP surgeries around the country ar...,37.8,uk.org.publicwhip/person/25426,15:41:00,2018-01-08b
1,Ruth Smeeth,I am sure my hon. Friend will join me in thank...,337.6,uk.org.publicwhip/person/25435,13:08:00,2018-01-10a
2,,The Minister for the Cabinet Office and the Ch...,303.2,,,2018-01-10b
3,Liz McInnes,What steps he is taking to ensure that local a...,303.4,uk.org.publicwhip/person/25230,,2018-01-10b
4,David Lidington,"Before I answer the hon. Lady’s question, Mr S...",303.5,uk.org.publicwhip/person/10361,,2018-01-10b


In [43]:

t = time.process_time()
speeches2017df = pd.DataFrame()
headings2017df = pd.DataFrame()
requestsno = 0


for date in yr_2017.debates:

    URL2 = f'https://www.theyworkforyou.com/pwdata/scrapedxml/debates/debates{date}.xml'

    r2 = requests.get(URL2)

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    requestsno += 1
    #do some stuff
    elapsed_time = time.process_time() - t
    print('Request:{}; Frequency: {} requests/s'.format(requestsno, requestsno/elapsed_time))



    soup2 = BeautifulSoup(r2.text, 'html.parser')


    headings = []
    heading_id = []
    head_debate = []

    # this adds all of the major headings to a list along with their place in the schedule
    for heading in soup2.find_all('major-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
    # this adds all of the minor headings to a list along with their place in the schedule
    for heading in soup2.find_all('minor-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
        try:
            head_debate.append(date)
        except:
            head_debate.append(np.nan)
    print('Request:{}; Time Elapsed: {} s'.format(requestsno, elapsed_time))


    # this splits the headings into the topic and changes the id to numeric
    heading_time = []
    for hi in heading_id:
        heading_time.append(hi.split(f'{date}.')[1])
    topics = pd.DataFrame(list(zip(headings, heading_time, head_debate)), columns = ['heading', 'id', 'debate_date'])
    topics.id = pd.to_numeric(topics.id)

    chronological_topics = topics.sort_values(by = 'id', ascending = True, ignore_index = True)



    #Scraping the speaker names, speeches and details

    speakers = []
    speeches = []
    speech_id = []
    person_id = []
    time_of_speech = []
    debate_id = []


    for speech in soup2.find_all('speech'):
        try:
            speakers.append(speech['speakername'])
        except:
            speakers.append(np.nan)
        try:
            speeches.append(speech.text.strip())
        except:
            speeches.append(np.nan)
        try:
            speech_id.append(speech['id'])
        except:
            speech_id.append(np.nan)
        try:
            person_id.append(speech['person_id'])
        except:
            person_id.append(np.nan)
        try:
            time_of_speech.append(speech['time'])
        except:
            time_of_speech.append(np.nan)
        try:
            debate_id.append(date)
        except:
            debate_id.append(np.nan)



    df = pd.DataFrame(list(zip(speakers, speeches, speech_id, person_id, time_of_speech, debate_id)), 
                      columns = ['speakers', 'speeches', 'speech_id', 'person_id', 'time_of_speech', 'debate_id'])

    #Pulling out the paragraph id of the speech, where it fits in the order of the day
    df.speech_id = df.speech_id.str.replace(f'uk.org.publicwhip/debate/{date}.', '')
    df.speech_id =  pd.to_numeric(df.speech_id)



    #bins = chronological_topics.id.values
    #bins = bins.tolist() + [100000]
    #labels = chronological_topics.heading.values

    #df['topic'] = pd.cut(df.speech_id, bins = bins, labels = labels, duplicates = 'drop')

    # Adding this debates speech data to the final dataframe
    speeches2017df = speeches2017df.append(df, ignore_index = True)

    #Adding this debates topics to the topics dataframe
    headings2017df = headings2017df.append(chronological_topics, ignore_index = True)

















Request:1; Frequency: 23.528304550398293 requests/s
Request:1; Time Elapsed: 0.0425019999999563 s
Request:2; Frequency: 0.8674606823445864 requests/s
Request:2; Time Elapsed: 2.3055799999999635 s
Request:3; Frequency: 1.0214821092516124 requests/s
Request:3; Time Elapsed: 2.9369089999999574 s
Request:4; Frequency: 1.3236411913167985 requests/s
Request:4; Time Elapsed: 3.021966999999961 s
Request:5; Frequency: 1.0023814578676062 requests/s
Request:5; Time Elapsed: 4.988120999999978 s
Request:6; Frequency: 1.1582278187790482 requests/s
Request:6; Time Elapsed: 5.1803279999999745 s
Request:7; Frequency: 1.331695728072309 requests/s
Request:7; Time Elapsed: 5.256455999999957 s
Request:8; Frequency: 1.1313309812019514 requests/s
Request:8; Time Elapsed: 7.071316999999965 s
Request:9; Frequency: 1.210612389622423 requests/s
Request:9; Time Elapsed: 7.434253999999953 s
Request:10; Frequency: 1.0604288565172977 requests/s
Request:10; Time Elapsed: 9.430146999999977 s
Request:11; Frequency: 1.1

Request:84; Time Elapsed: 101.44810299999995 s
Request:85; Frequency: 0.8111626516437561 requests/s
Request:85; Time Elapsed: 104.78786200000002 s
Request:86; Frequency: 0.8108953100775976 requests/s
Request:86; Time Elapsed: 106.05561399999999 s
Request:87; Frequency: 0.8054824505649966 requests/s
Request:87; Time Elapsed: 108.00980199999992 s
Request:88; Frequency: 0.8077397327468265 requests/s
Request:88; Time Elapsed: 108.94598399999995 s
Request:89; Frequency: 0.8102478988951921 requests/s
Request:89; Time Elapsed: 109.84292599999992 s
Request:90; Frequency: 0.8186575698650332 requests/s
Request:90; Time Elapsed: 109.93607499999996 s
Request:91; Frequency: 0.825316292492747 requests/s
Request:91; Time Elapsed: 110.26075800000001 s
Request:92; Frequency: 0.8283868210247856 requests/s
Request:92; Time Elapsed: 111.05922699999996 s
Request:93; Frequency: 0.8203939853017865 requests/s
Request:93; Time Elapsed: 113.36016799999993 s
Request:94; Frequency: 0.815103846527947 requests/s
Re

In [44]:
speeches2017df.head()

Unnamed: 0,speakers,speeches,speech_id,person_id,time_of_speech,debate_id
0,,The Secretary of State was asked—,1.2,,,2017-01-09b
1,John Bercow,I call Mr Gerald Jones. Where is the fella? He...,1.3,uk.org.publicwhip/person/10040,,2017-01-09b
2,Peter Dowd,What recent assessment he has made of trends i...,1.5,uk.org.publicwhip/person/25309,,2017-01-09b
3,Damian Green,The Government support those who aspire to be ...,1.6,uk.org.publicwhip/person/10241,,2017-01-09b
4,Peter Dowd,A Citizens Advice report in August 2015 said t...,1.7,uk.org.publicwhip/person/25309,,2017-01-09b


In [45]:

t = time.process_time()
speeches2016df = pd.DataFrame()
headings2016df = pd.DataFrame()
requestsno = 0


for date in yr_2016.debates:

    URL2 = f'https://www.theyworkforyou.com/pwdata/scrapedxml/debates/debates{date}.xml'

    r2 = requests.get(URL2)

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    requestsno += 1
    #do some stuff
    elapsed_time = time.process_time() - t
    print('Request:{}; Frequency: {} requests/s'.format(requestsno, requestsno/elapsed_time))



    soup2 = BeautifulSoup(r2.text, 'html.parser')


    headings = []
    heading_id = []
    head_debate = []

    # this adds all of the major headings to a list along with their place in the schedule
    for heading in soup2.find_all('major-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
    # this adds all of the minor headings to a list along with their place in the schedule
    for heading in soup2.find_all('minor-heading'):
        try:
            headings.append(heading.text.strip())
        except:
            headings.append(np.nan)
        try:
            heading_id.append(heading['id'])
        except:
            heading_id.append(np.nan)
        try:
            head_debate.append(date)
        except:
            head_debate.append(np.nan)
    print('Request:{}; Time Elapsed: {} s'.format(requestsno, elapsed_time))


    # this splits the headings into the topic and changes the id to numeric
    heading_time = []
    for hi in heading_id:
        heading_time.append(hi.split(f'{date}.')[1])
    topics = pd.DataFrame(list(zip(headings, heading_time, head_debate)), columns = ['heading', 'id', 'debate_date'])
    topics.id = pd.to_numeric(topics.id)

    chronological_topics = topics.sort_values(by = 'id', ascending = True, ignore_index = True)



    #Scraping the speaker names, speeches and details

    speakers = []
    speeches = []
    speech_id = []
    person_id = []
    time_of_speech = []
    debate_id = []


    for speech in soup2.find_all('speech'):
        try:
            speakers.append(speech['speakername'])
        except:
            speakers.append(np.nan)
        try:
            speeches.append(speech.text.strip())
        except:
            speeches.append(np.nan)
        try:
            speech_id.append(speech['id'])
        except:
            speech_id.append(np.nan)
        try:
            person_id.append(speech['person_id'])
        except:
            person_id.append(np.nan)
        try:
            time_of_speech.append(speech['time'])
        except:
            time_of_speech.append(np.nan)
        try:
            debate_id.append(date)
        except:
            debate_id.append(np.nan)



    df = pd.DataFrame(list(zip(speakers, speeches, speech_id, person_id, time_of_speech, debate_id)), 
                      columns = ['speakers', 'speeches', 'speech_id', 'person_id', 'time_of_speech', 'debate_id'])

    #Pulling out the paragraph id of the speech, where it fits in the order of the day
    df.speech_id = df.speech_id.str.replace(f'uk.org.publicwhip/debate/{date}.', '')
    df.speech_id =  pd.to_numeric(df.speech_id)



    #bins = chronological_topics.id.values
    #bins = bins.tolist() + [100000]
    #labels = chronological_topics.heading.values

    #df['topic'] = pd.cut(df.speech_id, bins = bins, labels = labels, duplicates = 'drop')

    # Adding this debates speech data to the final dataframe
    speeches2016df = speeches2016df.append(df, ignore_index = True)

    #Adding this debates topics to the topics dataframe
    headings2016df = headings2016df.append(chronological_topics, ignore_index = True)

















Request:1; Frequency: 31.30870381958648 requests/s
Request:1; Time Elapsed: 0.03194000000007691 s
Request:2; Frequency: 18.364123847644112 requests/s
Request:2; Time Elapsed: 0.1089080000000422 s
Request:3; Frequency: 13.792088857828231 requests/s
Request:3; Time Elapsed: 0.217516000000046 s
Request:4; Frequency: 10.504174096180932 requests/s
Request:4; Time Elapsed: 0.3808010000000195 s
Request:5; Frequency: 10.641694157708436 requests/s
Request:5; Time Elapsed: 0.46985000000006494 s
Request:6; Frequency: 7.972521376322227 requests/s
Request:6; Time Elapsed: 0.7525850000000673 s
Request:7; Frequency: 8.627638516735628 requests/s
Request:7; Time Elapsed: 0.8113460000000714 s
Request:8; Frequency: 8.92652908653703 requests/s
Request:8; Time Elapsed: 0.896205000000009 s
Request:9; Frequency: 7.802523336046361 requests/s
Request:9; Time Elapsed: 1.1534730000000764 s
Request:10; Frequency: 8.075959242248823 requests/s
Request:10; Time Elapsed: 1.2382430000000113 s
Request:11; Frequency: 7.

Request:84; Time Elapsed: 50.640047000000095 s
Request:85; Frequency: 1.6020276166190341 requests/s
Request:85; Time Elapsed: 53.057762000000025 s
Request:86; Frequency: 1.6181685252523839 requests/s
Request:86; Time Elapsed: 53.14650400000005 s
Request:87; Frequency: 1.5918634188505514 requests/s
Request:87; Time Elapsed: 54.65293000000008 s
Request:88; Frequency: 1.5232049631002733 requests/s
Request:88; Time Elapsed: 57.772921 s
Request:89; Frequency: 1.4919854064381146 requests/s
Request:89; Time Elapsed: 59.65205800000001 s
Request:90; Frequency: 1.4570000669734364 requests/s
Request:90; Time Elapsed: 61.770759 s
Request:91; Frequency: 1.469007841144138 requests/s
Request:91; Time Elapsed: 61.94657200000006 s
Request:92; Frequency: 1.4158860132962157 requests/s
Request:92; Time Elapsed: 64.97698200000002 s
Request:93; Frequency: 1.422813536048257 requests/s
Request:93; Time Elapsed: 65.36344900000006 s
Request:94; Frequency: 1.3928827249053648 requests/s
Request:94; Time Elapsed: 

In [46]:
speeches2016df.head()

Unnamed: 0,speakers,speeches,speech_id,person_id,time_of_speech,debate_id
0,Mark Durkan,In ascribing a key role in the Syrian process ...,109.2,uk.org.publicwhip/person/11589,019:30:00,2016-01-05a
1,Tobias Ellwood,Before I reply to the right hon. Gentleman’s i...,96.0,uk.org.publicwhip/person/11437,019:30:00,2016-01-05b
2,,The Secretary of State was asked—,413.2,,09:30:00,2016-01-07b
3,Marion Fellows,What steps she is taking to reduce the level o...,413.4,uk.org.publicwhip/person/25277,09:30:00,2016-01-07b
4,Amber Rudd,"A reformed domestic supplier obligation—ECO, o...",413.5,uk.org.publicwhip/person/24795,09:30:00,2016-01-07b


In [47]:
speeches2020df.to_csv('speeches2020.csv')
headings2020df.to_csv('headings2020.csv')
speeches2019df.to_csv('speeches2019.csv')
headings2019df.to_csv('headings2019.csv')
speeches2018df.to_csv('speeches2018.csv')
headings2018df.to_csv('headings2018.csv')
speeches2017df.to_csv('speeches2017.csv')
headings2017df.to_csv('headings2017.csv')
speeches2016df.to_csv('speeches2016.csv')
headings2016df.to_csv('headings2016.csv')

In [94]:
speeches2016df.shape

(37954, 6)

In [95]:
headings2020df = pd.read_csv('headings2020.csv')

In [96]:
headings2020df

Unnamed: 0.1,Unnamed: 0,heading,id,debate_date
0,0,Speaker’s Statement,235.0,2020-01-07b
1,1,Treasury,235.3,2020-01-07b
2,2,Per Pupil Funding,235.5,2020-01-07b
3,3,Carbon Emissions,236.7,2020-01-07b
4,4,Productivity,239.2,2020-01-07b
...,...,...,...,...
831,831,"HoUSING, CoMMUNITIES AND LoCAL GoVERNMENT CoMM...",1964.4,2020-07-20c
832,832,PRoCEDURE CoMMITTEE,1964.6,2020-07-20c
833,833,WoMEN AND EQUALITIES CoMMITTEE,1964.8,2020-07-20c
834,834,Endangered Species: Developing Countries,1965.0,2020-07-20c


In [97]:
last4years = pd.DataFrame()

In [98]:
last4years = last4years.append(speeches2020df, ignore_index = True)
last4years = last4years.append(speeches2019df, ignore_index = True)
last4years = last4years.append(speeches2018df, ignore_index = True)
last4years = last4years.append(speeches2017df, ignore_index = True)
last4years = last4years.append(speeches2016df, ignore_index = True)

In [99]:
all_headings = pd.DataFrame()

In [100]:
all_headings = all_headings.append(headings2020df, ignore_index = True)
all_headings = all_headings.append(headings2019df, ignore_index = True)
all_headings = all_headings.append(headings2018df, ignore_index = True)
all_headings = all_headings.append(headings2017df, ignore_index = True)
all_headings = all_headings.append(headings2016df, ignore_index = True)

In [102]:
all_headings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6227 entries, 0 to 6226
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   836 non-null    float64
 1   heading      6227 non-null   object 
 2   id           6227 non-null   float64
 3   debate_date  6227 non-null   object 
dtypes: float64(2), object(2)
memory usage: 194.7+ KB


In [103]:
last4years.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150712 entries, 0 to 150711
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   speakers        146059 non-null  object 
 1   speeches        150712 non-null  object 
 2   speech_id       150712 non-null  float64
 3   person_id       144505 non-null  object 
 4   time_of_speech  150712 non-null  object 
 5   debate_id       150712 non-null  object 
dtypes: float64(1), object(5)
memory usage: 6.9+ MB


In [104]:
last4years.to_csv('last4years.csv')
all_headings.to_csv('all_headings.csv')

In [105]:
mps = pd.read_csv('mps.csv')

In [107]:
mps.head()

Unnamed: 0,Person ID,First name,Last name,Party,Constituency,URI
0,10001,Diane,Abbott,Labour,Hackney North and Stoke Newington,https://www.theyworkforyou.com/mp/10001/diane_...
1,25034,Debbie,Abrahams,Labour,Oldham East and Saddleworth,https://www.theyworkforyou.com/mp/25034/debbie...
2,24878,Nigel,Adams,Conservative,Selby and Ainsty,https://www.theyworkforyou.com/mp/24878/nigel_...
3,25661,Bim,Afolami,Conservative,Hitchin and Harpenden,https://www.theyworkforyou.com/mp/25661/bim_af...
4,11929,Adam,Afriyie,Conservative,Windsor,https://www.theyworkforyou.com/mp/11929/adam_a...


In [111]:
last4years[last4years.speakers == 'Diane Abbott']

Unnamed: 0,speakers,speeches,speech_id,person_id,time_of_speech,debate_id
1695,Diane Abbott,(Urgent Question): To ask the Secretary of Sta...,302.2,uk.org.publicwhip/person/10001,12:31:00,2020-01-22d
1697,Diane Abbott,The Minister will be aware that guidance issue...,302.4,uk.org.publicwhip/person/10001,12:31:00,2020-01-22d
3244,Diane Abbott,The Home Secretary will be aware of the condit...,561.8,uk.org.publicwhip/person/10001,,2020-02-10b
3358,Diane Abbott,Does the Home Secretary appreciate the widespr...,577.2,uk.org.publicwhip/person/10001,,2020-02-10b
3393,Diane Abbott,The public will note the very dismissive attit...,586.2,uk.org.publicwhip/person/10001,15:33:00,2020-02-10b
...,...,...,...,...,...,...
144172,Diane Abbott,"It may be a criminal offence, but the entire H...",159.1,uk.org.publicwhip/person/10001,14:30:00,2016-11-15b
144174,Diane Abbott,I am grateful to the hon. and learned Lady for...,160.1,uk.org.publicwhip/person/10001,14:30:00,2016-11-15b
144892,Diane Abbott,This inquiry is on its fourth chair. Every tim...,607.1,uk.org.publicwhip/person/10001,15:37:00,2016-11-21b
145708,Diane Abbott,(Urgent Question): To ask the Secretary of Sta...,1163.0,uk.org.publicwhip/person/10001,11:00:00,2016-11-25a
