In [1]:
# import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import os
import glob
import re
import requests
from datetime import datetime
from urllib.parse import urlparse, parse_qs, unquote
from collections import deque
from bs4 import BeautifulSoup
from tqdm import tqdm
import ast
import dask.dataframe as dd

In [None]:
# if code was already run, to save time
# Reloads the CSV files written by the cells further down (clean logs, unique arks,
# non-empty / empty ark metadata, enriched logs) instead of recomputing them.
# NOTE(review): the cells below operate on `logs_df`, `arks_df`, `all_arks`, ... and do
# not read these `*_df` names, so loading them does not by itself replace those cells.
clean_logs_df = pd.read_csv("data_month/clean_logs.csv")
unique_arks_df = pd.read_csv("data_month/unique_arks_df.csv")
arks_non_empty_df = pd.read_csv("arks_final_month/arks_non_empty.csv")
arks_empty_df = pd.read_csv("arks_final_month/arks_empty.csv")
enriched_logs_df = pd.read_csv("data_month/enriched_logs.csv")

## 1 : Extracting the logs

In [2]:
# get logs for month of February
folder_path = r'../../../lhstdata1/students/Gallica_logs/1LogGallicaFevrier2016'
all_files = glob.glob(folder_path + "/*.log.gz")
# sort numerically on the digits contained in each path; the pattern is a raw string so
# that `\D` reaches the regex engine untouched ('\D' in a plain string is an invalid
# escape sequence and triggers a warning on recent Python versions)
all_files.sort(key=lambda f: int(re.sub(r'\D', '', f)))

In [3]:
len(all_files)

2201

In [None]:
# one Dask DataFrame per log file; they are materialised later with .compute()
dfs = []

for log_path in tqdm(all_files, desc="Processing all files"):
    try:
        # tab-separated, no header row
        lazy_frame = dd.read_csv(log_path, encoding='UTF-8', sep='\t', header=None, blocksize=None)
    except Exception as e:
        # report the file that could not be loaded and carry on with the others
        print(f"Error loading file {log_path}: {e}")
    else:
        dfs.append(lazy_frame)

Processing all files:  27%|████▉             | 603/2201 [00:32<01:30, 17.75it/s]

In [None]:
# materialise the first 1100 Dask frames
computed_dfs = []
computed_dfs.extend(
    lazy_frame.compute()
    for lazy_frame in tqdm(dfs[:1100], desc="Computing DataFrames")
)

In [None]:
# materialise the remaining Dask frames (position 1100 onwards)
computed_dfs.extend(
    lazy_frame.compute()
    for lazy_frame in tqdm(dfs[1100:], desc="Computing DataFrames")
)

In [None]:
# stack the per-file frames into one DataFrame; ignore_index gives a fresh 0..n-1 row index
combined_df = pd.concat(computed_dfs, ignore_index=True)

In [None]:
combined_df.shape

In [None]:
# checkpoint the raw concatenated logs; the row index is written too (index=False is not passed)
combined_df.to_csv("data_temp_month/raw_logs_month.csv")

## 2 : Extracting features from logs 

Structure of a line : '##' then IP address, '##' then country (or null), '##' then city (or null), '##--' then date


nothing, then the HTTP request in quotes (including the protocol version),
followed by the response status code (200 = OK), then the response size, then the referrer website in quotes (or '-' if unknown)

In [None]:
# split the lines to recover meaningful information - ip address, country, city, date and request
# column 0 of the raw frame holds the whole log line; it is cut on '##' and expand=True
# spreads the pieces over integer-labelled columns (0, 1, 2, ...)
lines_df = combined_df[0].str.split('##', expand=True)

In [None]:
# rename the columns with informative names
# column 0 (the piece before the first '##') is not in the mapping and keeps its integer label
lines_df = lines_df.rename(columns = {1:"IPaddress", 2:"Country", 3:"City", 4:"Full_request"})

In [None]:
lines_df.head()

In [None]:
# extract dates: take the text before the first ']' and, within it, the piece that
# follows the first '['. Vectorised .str accessors replace a Python call per row and
# give NaN (instead of raising) for lines that are missing or lack the brackets.
temp = pd.DataFrame()
temp['Date'] = lines_df['Full_request'].str.split(']').str[0].str.split('[').str[1]

In [None]:
# convert to datetime format for sorting, to find the earliest and the latest log
parsed_dates = pd.to_datetime(temp['Date'], format='%d/%b/%Y:%H:%M:%S %z')
# sort by ascending order
temp_date = pd.DataFrame({'Date': parsed_dates}).sort_values(by='Date', ascending=True)

In [None]:
# the frame is sorted in ascending order, so the first and last rows hold the two extremes
first_log_date = temp_date['Date'].iloc[0]
last_log_date = temp_date['Date'].iloc[-1]
print("Date of first log:", first_log_date)
print("Date of last log:", last_log_date)

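We have logs starting on 31/01/2016 at 13h. The end date and the length of the period covered were left blank here (**TODO**: fill them in from the output of the cell above).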

In [None]:
# extract request: keep the first two space-separated tokens of the first double-quoted
# field. Vectorised .str accessors replace a Python call per row and give NaN (instead
# of raising) for lines that are missing or carry no quoted field.
temp['Request'] = (
    lines_df['Full_request']
    .str.split('"').str[1]
    .str.split(' ').str[:2]
    .str.join(' ')
)

In [None]:
# extract referrer: the second double-quoted field, i.e. piece number 3 once the line is
# cut on '"'. Vectorised .str accessors replace a Python call per row and give NaN
# (instead of raising) when that field is absent.
temp['Referrer'] = lines_df['Full_request'].str.split('"').str[3]

In [None]:
# function to extract ark from the request
def extract_ark(request):
    """Return the ark identifier contained in `request`, or '-' when there is none.

    The identifier is the run of characters that follows '/12148/' (12148 is
    specific to gallica) and stops at the next '/' or '.'.
    """
    found = re.search(r'/12148/([^/.]+)', request)
    return found.group(1) if found else '-'

In [None]:
# extract ark from every request string
temp['Ark'] = temp['Request'].map(extract_ark)

In [None]:
# some of the arks are not cut at the right place at their end
# function to clean arks
def clean_ark(ark):
    """Trim an extracted ark down to the identifier itself.

    Args:
        ark: raw ark string as extracted from a request, or '-' for "no ark".

    Returns:
        The ark cut at the first '?', ',' or ';', with every trailing '%20' and
        trailing non-alphanumeric character removed. The '-' placeholder is returned
        unchanged, and '-' is also returned when nothing is left after cleaning, so
        that "no ark" always has the same representation.
    """
    # keep the "no ark" placeholder intact: the trailing-character rule below would
    # otherwise reduce it to an empty string
    if ark == '-':
        return ark
    # drop query parameters and anything after a ',' or ';'
    ark = re.split(r'[?,;]', ark, maxsplit=1)[0]
    # strip the whole trailing run of '%20' / non-alphanumeric characters; this runs
    # after the cut above so that what the cut exposes is cleaned as well
    ark = re.sub(r'(?:%20|[^a-zA-Z0-9])+$', '', ark)
    return ark or '-'

In [None]:
# run every extracted ark through the cleaning function
temp.loc[:, 'Ark'] = temp['Ark'].map(clean_ark)

In [None]:
# put the split columns and the extracted fields side by side; the full request column
# is left out now that its content has been extracted
logs_df = pd.concat([lines_df.drop(columns=['Full_request']), temp], axis=1)

In [None]:
# function to extract search terms from the request

def extract_search_terms(request):
    """Return the double-quoted phrases found in the 'query' parameter of `request`.

    The request is parsed as a URL, the first value of its 'query' parameter is
    taken (empty string when absent) and every non-empty text enclosed in double
    quotes is returned, in order of appearance. The result is an empty list when
    there is no such parameter or no quoted phrase.
    """
    params = parse_qs(urlparse(request).query)
    raw_query = params.get('query', [''])[0]
    # the value is URL-decoded once more on top of what parse_qs returned
    decoded_query = unquote(raw_query)
    return re.findall(r'"([^"]+)"', decoded_query)

In [None]:
# extract the search terms of every request (one list per log line)
logs_df['search_terms'] = logs_df['Request'].map(extract_search_terms)

In [None]:
logs_df.head(10)

## 3 : Enriching log data

We want to enrich the data by adding additional information about the requested document (if there was one) : Dewey class, type of document, visibility. Among other things, this will help create a diversity metric for the sessions later on.

In [None]:
# get list of unique arks
# replace unknown arks by None: '-' is the "no ark" placeholder set at extraction time,
# and '' covers arks that the cleaning step reduced to an empty string
logs_df.loc[logs_df['Ark'].isin(['-', '']), 'Ark'] = None
# get non null arks and keep one row per distinct ark
arks_df = logs_df[logs_df['Ark'].notnull()].drop_duplicates(subset=['Ark'])

Some of the cleaned ark values are not actual arks. We filter them once more, keeping only the values that start with a 'b'.

In [None]:
# keep only the candidates whose ark starts with 'b' (missing values count as False)
b_mask = arks_df['Ark'].str.startswith('b', na=False)
arks_df = arks_df.loc[b_mask]

In [None]:
# create visibility metric for each ark
# calculate the frequency of each ark (rows without an ark are not counted)
arks_counts = logs_df['Ark'].value_counts()

# visibility = number of log lines requesting the ark / total number of log lines;
# rows without an ark find no entry in the counts and get 0. Mapping through the counts
# Series is a single vectorised lookup instead of one Python call per log line.
logs_df['visibility'] = logs_df['Ark'].map(arks_counts).fillna(0) / len(logs_df)

In [None]:
# save the clean logs
# the row index is written as well (index=False is not passed)
logs_df.to_csv("data_month/clean_logs.csv")

In [None]:
# load arks that were already requested to remove them from the list of unique arks to request
arks_already_requested = pd.read_csv("arks_final/processed_arks.csv")
already_requested_mask = arks_df['Ark'].isin(arks_already_requested['Ark'])
arks_to_request = arks_df.loc[~already_requested_mask]

In [None]:
# save unique arks
# only the 'Ark' column of the arks still to be requested is written, together with its row index
arks_to_request['Ark'].to_csv("data_month/unique_arks_df.csv")

The arks enable us to request the metadata of each document from Gallica. From that, we will extract the type of the document and its theme (its Dewey class), if there is one. Only printed documents and prints have Dewey classes.

In [None]:
# OAI request to Gallica
def OAI(id, timeout=60):
    """Fetch the Gallica OAI record of an ark and return it parsed as XML.

    Args:
        id: ark identifier, appended as-is to the OAIRecord service URL.
        timeout: number of seconds to wait for the server before giving up.

    Returns:
        The BeautifulSoup document built from the response body with the
        "lxml-xml" parser.

    Raises:
        requests.exceptions.RequestException: when the request fails or times out.
    """
    OAI_BASEURL = 'https://gallica.bnf.fr/services/OAIRecord?ark='

    url = "".join([OAI_BASEURL, id])

    # The whole body is read right away, so streaming brings nothing. The context
    # manager releases the connection once the body is parsed, and the timeout keeps
    # one stalled request from blocking a run that issues thousands of them.
    with requests.get(url, timeout=timeout) as response:
        return BeautifulSoup(response.content, "lxml-xml")

# function to extract type and theme
def extract_metadata(ark):
    """Look up the theme and the document type of an ark through the OAI service.

    Args:
        ark: ark identifier, or '-' for "no ark".

    Returns:
        A tuple (ark, theme, typedoc). `theme` is the text of the record's
        "sdewey" tag and `typedoc` the text of its "typedoc" tag. Either one is
        None when the tag is absent or empty, when the request failed, or when
        `ark` is '-'. Missing values are None rather than '' so that the
        isna / notnull checks applied to the results see them as missing.
    """
    if ark == '-':
        return ark, None, None
    theme = None
    typedoc = None
    try:
        oai_result = OAI(ark)
        if oai_result is not None:
            # extracting theme
            sdewey_tag = oai_result.find("sdewey")
            if sdewey_tag is not None:
                theme = sdewey_tag.text or None

            # extracting typedoc
            typedoc_tag = oai_result.find("typedoc")
            if typedoc_tag is not None:
                typedoc = typedoc_tag.text or None

    except Exception as e:
        # best effort: report the failure and return the ark with missing metadata
        print("Error occurred:", e)

    return ark, theme, typedoc

In [None]:
# function to request the arks metadata, with a progress bar and saving batches to a folder
def apply_with_progress(df, func, result_df, batch_size=20000, output_prefix='arks_temp_month/arks_batch'):
    """Apply `func` to every row of `df`, with a progress bar and periodic CSV checkpoints.

    Args:
        df: DataFrame whose rows are passed to `func` one at a time.
        func: callable taking a row and returning an (Ark, Theme, Type) triple.
        result_df: not used; kept so that existing calls keep working.
        batch_size: a checkpoint is written each time the number of collected
            results is a multiple of this value.
        output_prefix: path prefix of the checkpoint files, which are named
            '<output_prefix>_<number of results>.csv'.

    Returns:
        A DataFrame with columns ['Ark', 'Theme', 'Type'], one row per row of `df`.
    """
    columns = ['Ark', 'Theme', 'Type']

    # create the checkpoint folder up front: otherwise the first save, which only
    # happens after `batch_size` calls to `func`, fails and the work done so far is lost
    output_dir = os.path.dirname(output_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    result = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        result.append(func(row))

        # save to CSV file every batch_size ARKs; each file holds all results so far
        if len(result) % batch_size == 0:
            batch_df = pd.DataFrame(result, columns=columns)
            batch_df.to_csv(f'{output_prefix}_{len(result)}.csv', index=False)

    return pd.DataFrame(result, columns=columns)

In [None]:
# takes VERY long, do not run if arks' types and dewey classes are already computed
# `result_df_arks` is only handed to apply_with_progress, which does not read it
result_df_arks = pd.DataFrame()
# one metadata lookup per ark that still has to be requested
all_arks = apply_with_progress(arks_to_request, lambda row: extract_metadata(row['Ark']), result_df_arks)

# Save the remaining results to CSV
all_arks.to_csv('arks_final_month/all_arks_metadata.csv', index=False)

In [None]:
all_arks

In [None]:
# separate empty and non empty arks
# the mask is built from `all_arks` itself: it is the frame being filtered and the one
# that holds the 'Type' column (`arks_df` has no such column). An empty string counts
# as missing, just like NaN / None.
type_is_missing = all_arks['Type'].isna() | (all_arks['Type'] == '')
non_empty_arks_df = all_arks[~type_is_missing]
# find empty typedoc and create new dataframe to request these arks
empty_arks_df = all_arks[type_is_missing]

Some arks have the structure btv1b90039### and when requested, yield "Erreur d'utilisation.500". We assume these refer to documents that are not available anymore on gallica.

In [None]:
# number of arks without a document type, and their share among all requested arks
nb_empty_arks = len(empty_arks_df)
empty_arks_percentage = nb_empty_arks/len(all_arks)*100
print("There are",nb_empty_arks , "empty arks, which is", empty_arks_percentage, "% of the arks.")

In [None]:
# save empty and non empty arks for now
# both files also contain the row index (index=False is not passed)
non_empty_arks_df.to_csv("arks_final_month/arks_non_empty.csv")
empty_arks_df.to_csv("arks_final_month/arks_empty.csv")

## 4 : Concatenating obtained data

In [None]:
# keep the first row of every ark so that each ark appears once, then save the final arks
is_repeated_ark = all_arks.duplicated(subset=['Ark'])
arks_final_df = all_arks.loc[~is_repeated_ark]
arks_final_df.to_csv("data_month/arks_final.csv")

In [None]:
# merge DataFrames based on the 'Ark' column, keeping all rows from 'logs_df'
# log lines whose ark has no match in 'arks_final_df' get NaN in 'Theme' and 'Type'
logs_arks_df = pd.merge(logs_df, arks_final_df, on='Ark', how='left')

In [None]:
logs_arks_df.columns

In [None]:
# drop useless columns
# the leftover split column is labelled 0 (an integer) when the frame was built in this
# session; '0' and 'Unnamed: 0' presumably only show up when the logs were reloaded
# from CSV. Drop whichever of them is present instead of raising KeyError on the others.
logs_arks_df = logs_arks_df.drop(columns=[0, '0', 'Unnamed: 0'], errors='ignore')

In [None]:
# check length is the same as the beggining
len(logs_arks_df)

#### Cleaning Nans in 'Theme' and 'Type' fields

The Theme field can be NaN either because the document has no Dewey class, or because the document metadata was not retrieved. To differentiate these cases, we set Theme = 'No_dewey_class' when the document has no Dewey class, and we set both Theme and Type to 'Unknown' when the document metadata was not available.
The Type and Theme can also be NaN if the log has no ark (for example, when an action is done). In this case, we give them the value 'No_ark'.

In [None]:
# rows whose document type is known but whose theme is missing
type_is_known = logs_arks_df['Type'].notna()
theme_is_missing = logs_arks_df['Theme'].isna()
known_type_mask = type_is_known & theme_is_missing
# label the theme of those rows 'No_dewey_class', indicating a document with no Dewey class
logs_arks_df.loc[known_type_mask, 'Theme'] = 'No_dewey_class'

In [None]:
# rows where both 'Type' and 'Theme' are missing
unknown_ark_mask = logs_arks_df[['Type', 'Theme']].isna().all(axis=1)
# label both fields 'Unknown', indicating the metadata was not available
logs_arks_df.loc[unknown_ark_mask, ['Theme', 'Type']] = 'Unknown', 'Unknown'

In [None]:
# rows that carry no ark at all
no_ark_type_mask = logs_arks_df['Ark'].isnull()
# label 'Theme' and 'Type' as 'No_ark' for those rows
logs_arks_df.loc[no_ark_type_mask, ['Theme', 'Type']] = 'No_ark', 'No_ark'

In [None]:
logs_arks_df

## 5 : Final step : saving the enriched logs

In [None]:
# write the enriched logs to disk; the row index is written as well (index=False is not passed)
logs_arks_df.to_csv("data_month/enriched_logs.csv")

Our data is now pre-processed and enriched. The next step will be to create user sessions from it, and then classify part of these sessions as Rabbit Holes.