In [1]:
from wpqs import get_wpqs_by_date

import requests
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil.relativedelta import relativedelta
from pathlib import Path

In [3]:
tmp = '/Users/ben/Documents/blog/pqs/tmp'

# Declare some datetime variables
# We use these to generate the complete archive since 2014-05, when the digital record begins. 
today = datetime.today()
today_str = today.strftime("%Y-%m-%d")
tomorrow = today + relativedelta(days=1)
tomorrow_str = tomorrow.strftime("%Y-%m-%d")
start_date = '2014-05-01'
next_month = today + relativedelta(months=1)
next_month_str = next_month.strftime("%Y-%m-%d")

In [29]:
def download_ua_pqs(tmp = '/Users/ben/Documents/blog/pqs/tmp'):

    # Declare some datetime variables
    # We use these to generate the complete archive since 2014-05, when the digital record begins. 
    today = datetime.today()
    today_str = today.strftime("%Y-%m-%d")
    tomorrow = today + relativedelta(days=1)
    tomorrow_str = tomorrow.strftime("%Y-%m-%d")
    start_date = '2014-05-01'
    next_month = today + relativedelta(months=1)
    next_month_str = next_month.strftime("%Y-%m-%d")


    if Path(tmp+'/ua_pqs.csv', index_col=0).is_file():
        pqs = pd.read_csv(Path(tmp+'/ua_pqs.csv'))
        no_pqs = pqs.shape[0]
        pqs['dateTabled'] = pd.to_datetime(pqs.dateTabled)
        start_date = pqs.dateTabled.max()#.strftime("%Y-%m-%d")
        # start_date = pd.to_datetime('2022-01-15')
        print("File ua_pqs.csv found with {n} WPQs in, last updated {d}. Proceeding to update.".format(n=no_pqs, d=start_date.strftime("%Y-%m-%d")))

        date_list = pd.date_range(start_date, today, freq='D').strftime('%Y-%m-%d').tolist()
        
        date_list.append(tomorrow_str)

        master_wpqs = []

        for i, e in enumerate(tqdm(date_list, leave=False)):
            try:
                lst_wpqs = get_wpqs_by_date(tabledWhenFrom=date_list[i], tabledWhenTo=date_list[i+1])
                for x in lst_wpqs:
                    master_wpqs.append(x)
            except IndexError:
                pass
        n_pqs = pd.DataFrame(master_wpqs)
        try:
            n_pqs.drop(columns=['attachments', 'groupedQuestions', 'groupedQuestionsDates'], inplace=True)
            n_pqs.drop(columns=[
                'isWithdrawn',
                'isNamedDay',
                'answerIsHolding',
                'answerIsCorrection',
                'answeringMemberId',
                'answeringMember',
                'correctingMemberId',
                'correctingMember',
                'dateAnswered',
                'answerText',
                'originalAnswerText',
                'comparableAnswerText',
                'dateAnswerCorrected',
                'dateHoldingAnswer',
                'attachmentCount',
            ], inplace=True)
        except KeyError:
            pass
        no_n_pqs = len(master_wpqs)
        old_length = pqs.shape[0]
        new_pqs = pd.concat([pqs, n_pqs])
        
        new_pqs = new_pqs.drop_duplicates()
        new_length = new_pqs.shape[0]
        pqs_added = new_length - old_length
        print("Downloaded {n} WPQs, which have been add to the archive.".format(n=pqs_added))
        new_pqs.to_csv(Path(tmp+'/ua_pqs.csv'), index=False, index_label=False)
        print("All done, be on your merry way.")
        return new_pqs


    # Now handle situations where there's no file. 
    else:
        # If there's no file already downloaded, then we'll download everything since the beginning of time (May 2014, according to Parliament's API)
        print("No file found, proceeding with download of full archive. Sit tight, this can take about 20 minutes!")
        min_date_list = pd.date_range(start_date, today, freq='MS').strftime('%Y-%m-%d').tolist()
        max_date_list = pd.date_range(start_date, next_month, freq='M').strftime('%Y-%m-%d').tolist()
        # max_date_list.append(next_month_str)

        # List for all the PQS to go in:
        master_wpqs = []

        # Iterate through idx, get unanswered wpqs from each month and append to the master_wpqs list
        for i, e in enumerate(tqdm(max_date_list, leave=False)):
            lst_wpqs = get_wpqs_by_date(tabledWhenFrom=min_date_list[i], tabledWhenTo=max_date_list[i])
            for x in lst_wpqs:
                master_wpqs.append(x) 

        # Convert to DataFrame
        pqs = pd.DataFrame(master_wpqs)
        pqs.drop(columns=['attachments', 'groupedQuestions', 'groupedQuestionsDates'], inplace=True)
        pqs.drop(columns=[
                'isWithdrawn',
                'isNamedDay',
                'answerIsHolding',
                'answerIsCorrection',
                'answeringMemberId',
                'answeringMember',
                'correctingMemberId',
                'correctingMember',
                'dateAnswered',
                'answerText',
                'originalAnswerText',
                'comparableAnswerText',
                'dateAnswerCorrected',
                'dateHoldingAnswer',
                'attachmentCount',
            ], inplace=True)
        # pqs.drop_duplicates(inplace=True)
        pqs['dateTabled'] = pd.to_datetime(pqs.dateTabled)
        pqs.to_csv(Path(tmp+'/ua_pqs.csv'), index=False, index_label=False)
        print('Full archive downloaded up to {d}. To get WPQs tabled since that date, call this function once more.'.format(d=pqs.dateTabled.max().strftime('%Y-%m-%d')))
        return pqs

In [30]:
download_ua_pqs()

  0%|          | 0/5 [00:00<?, ?it/s]

File ua_pqs.csv found with 384900 WPQs in, last updated 2022-02-25. Proceeding to update.


                                             

Downloaded 0 WPQs, which have been add to the archive.
All done, be on your merry way.


Unnamed: 0,id,askingMemberId,askingMember,house,memberHasInterest,dateTabled,dateForAnswer,uin,questionText,answeringBodyId,answeringBodyName,heading
0,58429,437,,Commons,False,2014-06-05 00:00:00,2014-06-09T00:00:00,199151,"To ask Mr Chancellor of the Exchequer, how muc...",14,HM Treasury,Working Tax Credit: Bolton
1,57913,4031,,Commons,False,2014-06-04 00:00:00,2014-06-09T00:00:00,198956,"To ask Mr Chancellor of the Exchequer, what th...",14,HM Treasury,Welfare Tax Credits
2,61038,4031,,Commons,False,2014-06-10 00:00:00,2014-06-16T00:00:00,199850,"To ask Mr Chancellor of the Exchequer, what th...",14,HM Treasury,Welfare Tax Credits
3,64002,163,,Commons,False,2014-06-25 00:00:00,2014-06-30T00:00:00,202509,"To ask Mr Chancellor of the Exchequer, how man...",14,HM Treasury,Welfare Tax Credits: Self-employed
4,64577,385,,Commons,False,2014-06-27 00:00:00,2014-07-01T00:00:00,202934,"To ask Mr Chancellor of the Exchequer, how man...",14,HM Treasury,Welfare Tax Credits: York
...,...,...,...,...,...,...,...,...,...,...,...,...
202,1435654,178,,Commons,False,2022-02-25T00:00:00,2022-03-01T00:00:00,129734,"To ask the Minister for the Cabinet Office, wh...",53,Cabinet Office,
203,1435652,178,,Commons,False,2022-02-25T00:00:00,2022-03-01T00:00:00,129733,"To ask the Minister for the Cabinet Office, if...",53,Cabinet Office,
204,1435651,178,,Commons,False,2022-02-25T00:00:00,2022-03-01T00:00:00,129732,"To ask the Minister for the Cabinet Office, wh...",53,Cabinet Office,
205,1435650,178,,Commons,False,2022-02-25T00:00:00,2022-03-01T00:00:00,129731,"To ask the Minister for the Cabinet Office, wh...",53,Cabinet Office,
