In [22]:
# import relevant libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

We want to compute the arks that are needed to enrich our logs. This request is quite slow so we do it incrementally. We load the arks already requested, remove them from the list of arks to request and then perform the request.

In [None]:
# load arks that were already requested to remove them from the list of unique arks to request
arks_already_requested = pd.read_csv("arks_final/arks_non_empty.csv")

In [None]:
# load arks that were not requested
arks_not_computed = pd.read_csv("data_temp_month/unique_arks/unique_arks_counts0.csv")

In [None]:
# remove those already requested from those not computed
arks_to_request = arks_not_computed[~arks_not_computed['Ark'].isin(arks_already_requested['Ark'])]

We can adapt this by repeating this step for other dataframes of arks that were already requested.

Some arks may not be valid, we look at the head of the arks we want to request and if needed start at a certain index.

In [None]:
arks_to_request.head(5)

In [None]:
filtered_arks = arks_to_request[1:]

The arks enable us to request the metadata of the document they are related to to Gallica. From that, we will extract the theme of the document and its Dewey class, if there is one. Only printed documents and prints have Dewey classes.

In [23]:
# function for OAI request to Gallica
def OAI(id):

    OAI_BASEURL = 'https://gallica.bnf.fr/services/OAIRecord?ark='

    url = "".join([OAI_BASEURL, id])

    s = requests.get(url, stream=True)
    soup = BeautifulSoup(s.content,"lxml-xml")
    return soup

# function to extract type and theme
def extract_metadata(ark):
    if ark == '-':
        return ark, None, None
    theme = ''
    typedoc = ''
    try:
        oai_result = OAI(ark)
        if oai_result is not None:
            # extracting theme
            sdewey_tag = oai_result.find("sdewey")
            if sdewey_tag:
                theme = sdewey_tag.text
        
            # extracting typedoc
            typedoc_tag = oai_result.find("typedoc")
            if typedoc_tag:
                typedoc = typedoc_tag.text
        
    except Exception as e:
        print("Error occurred:", e)

    return ark, theme, typedoc

As the process takes a long time, we show a progress bar and save the result incrementally.

In [36]:
# function to request the arks metadata, with a progress bar and saving batches to a folder
def apply_with_progress(df, func, result_df, batch_size=30000, output_prefix='arks_temp_month/arks_batch'):
    result = []
    with tqdm(total=len(df)) as pbar:
        for index, row in df.iterrows():
            curr_result = func(row)
            result.append(curr_result)
            pbar.update(1)
            
            # save to CSV file every batch_size ARKs
            if len(result) % batch_size == 0:
                batch_df = pd.DataFrame(result, columns=['Ark', 'Theme', 'Type'])
                batch_df.to_csv(f'{output_prefix}_{len(result)}.csv', index=False)
    
    return pd.DataFrame(result, columns=['Ark', 'Theme', 'Type'])

In [None]:
arks_month = pd.DataFrame()
clean_arks_month = apply_with_progress(filtered_arks, lambda row: extract_metadata(row['Ark']), arks_month)

  0%|                              | 5837/2620883 [1:10:58<401:59:50,  1.81it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                              | 5841/2620883 [1:10:58<186:26:11,  3.90it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                              | 5843/2620883 [1:10:58<141:38:35,  5.13it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                              | 5845/2620883 [1:10:58<111:40:47,  6.50it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                              | 5849/2620883 [1:10:59<105:30:14,  6.89it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                               | 5851/2620883 [1:10:59<89:04:22,  8.16it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                               | 5855/2620883 [1:10:59<69:10:48, 10.50it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                               | 5857/2620883 [1:10:59<63:39:05, 11.41it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                               | 5861/2620883 [1:11:00<57:29:31, 12.63it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                               | 5863/2620883 [1:11:00<56:06:55, 12.94it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                               | 5867/2620883 [1:11:00<53:36:36, 13.55it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|                               | 5869/2620883 [1:11:00<55:42:13, 13.04it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                            | 21129/2620883 [4:31:54<336:09:50,  2.15it/s]

Error occurred: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  1%|▏                            | 21132/2620883 [4:31:55<305:13:40,  2.37it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                            | 21136/2620883 [4:31:55<141:33:25,  5.10it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21140/2620883 [4:31:56<84:51:58,  8.51it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21145/2620883 [4:31:56<55:07:22, 13.10it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21151/2620883 [4:31:56<41:42:44, 17.31it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21154/2620883 [4:31:56<38:35:05, 18.72it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21160/2620883 [4:31:57<35:04:37, 20.59it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21166/2620883 [4:31:57<33:34:39, 21.51it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21169/2620883 [4:31:57<33:03:58, 21.84it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▏                             | 21175/2620883 [4:31:57<32:31:53, 22.20it/s]

Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|▎                            | 27879/2620883 [5:50:53<355:04:07,  2.03it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

  2%|▌                            | 47200/2620883 [9:08:49<264:25:00,  2.70it/s]

Error occurred: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))


  2%|▌                            | 50697/2620883 [9:39:27<238:46:37,  2.99it/s]

In [None]:
# save the requested arks
clean_arks_month.to_csv("arks_temp_month/arks_requested.csv")

We now (hopefully) have all the arks and their associated type and theme. The next step will be to create user sessions and enrich them using these types and themes.