In [1]:
# import relevant libraries
import numpy as np
import pandas as pd
import gzip
import os
import glob
import re
import requests
from datetime import datetime
from urllib.parse import urlparse, parse_qs, unquote
from tqdm import tqdm
import ast

## 1 : Extracting the logs

In [2]:
# get logs for month of February
folder_path = r'../../../lhstdata1/students/Gallica_logs/1LogGallicaFevrier2016'
all_files = glob.glob(folder_path + "/*.log.gz")
all_files.sort(key=lambda f: int(re.sub('\D', '', f)))

In [84]:
# take only part of the data, to make the size manageable
files = all_files[0:100]
# empty dataframe to store concatenation
dfs = []
for f in files:
    try:
        # read csv file into df
        df = pd.read_csv(f, encoding='UTF-8', sep ='\t', engine='python', header=None)
        # append to list of dfs
        dfs.append(df)
    except Exception as e:
        print(f"Error loading file {f}: {e}")

# from the list, concatenate the dataframes into a single one
combined_df = pd.concat(dfs, ignore_index=True)

In [85]:
combined_df.shape

(14739905, 1)

## 2 : Extracting features from logs 

Structure of a line : '##' then IP address, '##' then country (or null), '##' then city (or null), '##--' then date, then HTTP request in quotes, with the protocol number, followed by number of response (200 = OK), then size, then referrer website (or '-' if unknown)

In [86]:
# drop potential duplicate lines
combined_df = combined_df.drop_duplicates()

In [87]:
# get new shape
combined_df.shape

(14739810, 1)

In [88]:
# split the lines to recover meaningful information - ip address, country, city, date and request
logs_df = combined_df[0].str.split('##', expand=True)

In [89]:
# rename the columns with informative names
logs_df = logs_df.rename(columns = {1:"IPaddress", 2:"Country", 3:"City", 4:"Full_request"})

In [90]:
logs_df.head()

Unnamed: 0,0,IPaddress,Country,City,Full_request
0,,3c2af9233e11938ca3f73eb650d4af40,France,Nice,"- - [27/Feb/2016:18:01:04 +0100] ""GET /ark:/12..."
1,,c6bd521083f4402a71e65c33baa00f3e,Germany,Munich,"- - [27/Feb/2016:18:01:05 +0100] ""GET /ark:/12..."
2,,c6bd521083f4402a71e65c33baa00f3e,Germany,Munich,"- - [27/Feb/2016:18:01:05 +0100] ""GET /assets/..."
3,,c6bd521083f4402a71e65c33baa00f3e,Germany,Munich,"- - [27/Feb/2016:18:01:05 +0100] ""GET /assets/..."
4,,c6bd521083f4402a71e65c33baa00f3e,Germany,Munich,"- - [27/Feb/2016:18:01:05 +0100] ""GET /assets/..."


In [91]:
# drop columns that we don't use - Country, City
logs_df = logs_df.drop(['Country', 'City'], axis=1)

In [92]:
# extract dates and turn into Datetime
logs_df['Date'] = pd.to_datetime(logs_df.apply(lambda x: x['Full_request'].split("]")[0].split("[")[1], axis=1), format='%d/%b/%Y:%H:%M:%S %z')

In [95]:
# extract Request and Referrer fields
logs_df = logs_df.assign(
    Request=lambda x: x['Full_request'].str.split("\"").str[1].str.split().str[:2].str.join(' '),
    Referrer=lambda x: x['Full_request'].str.split("\"").str[3]
)

In [96]:
# drop full request column 
logs_df = logs_df.drop(['Full_request'], axis=1)

In [97]:
# function to extract ark from the request
def extract_ark(request):
    # check if '12148' is in the request (since it is specific to gallica ARKs)
    if '12148' not in request:
        return '-'
    
    ark = '-'
    
    # regular expression pattern, 12148 is specific to gallica
    pattern = r'/12148/([^/.]+)'
    
    # use regex to find ark in request string
    match = re.search(pattern, request)
    if match:
        ark = match.group(1)
    
    return ark

In [98]:
# extract ark
logs_df['Ark'] = logs_df.apply(lambda x: extract_ark(x['Request']), axis=1)

In [99]:
logs_df.head(10)

Unnamed: 0,0,IPaddress,Date,Request,Referrer,Ark
0,,3c2af9233e11938ca3f73eb650d4af40,2016-02-27 18:01:04+01:00,GET /ark:/12148/bpt6k57843235,http://www.google.fr/url?sa=t&rct=j&q=&esrc=s&...,bpt6k57843235
1,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /ark:/12148/btv1b8490545v,http://images.google.de/imgres?imgurl=http://g...,btv1b8490545v
2,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/vendor/bootstra...,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-
3,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/main.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-
4,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/persoScrollBar.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-
5,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/panes.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-
6,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/visualiseur.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-
7,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/thetiere/fonts/...,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-
8,,43fdf9ff858eb2aa045661bf488124df,2016-02-27 18:01:05+01:00,GET /ark:/12148/btv1b69277000/f1.highres,https://www.google.fr/,btv1b69277000
9,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:06+01:00,GET /assets/static/stylesheets/fonts/pictos.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-


In [100]:
# helper functions
def contains_number_or_special_chars(word):
    special_chars = '/()'
    return any(char.isdigit() for char in word) or any(char in special_chars for char in word)

def remove_term(term):
    words = term.split()
    return all(contains_number_or_special_chars(word) for word in words)

In [101]:
# function to keep only clean search terms
def clean_search_terms(terms):
    return [term for term in terms if not remove_term(term)]

In [102]:
# function to extract search terms from the request

def extract_search_terms(request):
    # check if 'search' is in the request
    if 'search' not in request:
        return []
    
    # parse URL
    parsed_url = urlparse(request)
    # extract query parameters
    query_params = parse_qs(parsed_url.query)
    # extract search query from query parameters
    search_query = query_params.get('query', [''])[0]
    # URL-decode the search query
    search_query = unquote(search_query)
    # extract search terms using regular expression
    search_terms = re.findall(r'"([^"]+)"', search_query)
    
    return clean_search_terms(search_terms)

In [103]:
# extract and clean search terms
logs_df['search_terms'] = logs_df.apply(lambda x: extract_search_terms(x['Request']), axis=1)

In [104]:
logs_df[:20]

Unnamed: 0,0,IPaddress,Date,Request,Referrer,Ark,search_terms
0,,3c2af9233e11938ca3f73eb650d4af40,2016-02-27 18:01:04+01:00,GET /ark:/12148/bpt6k57843235,http://www.google.fr/url?sa=t&rct=j&q=&esrc=s&...,bpt6k57843235,[]
1,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /ark:/12148/btv1b8490545v,http://images.google.de/imgres?imgurl=http://g...,btv1b8490545v,[]
2,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/vendor/bootstra...,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-,[]
3,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/main.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-,[]
4,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/persoScrollBar.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-,[]
5,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/panes.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-,[]
6,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/visualiseur.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-,[]
7,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:05+01:00,GET /assets/static/stylesheets/thetiere/fonts/...,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-,[]
8,,43fdf9ff858eb2aa045661bf488124df,2016-02-27 18:01:05+01:00,GET /ark:/12148/btv1b69277000/f1.highres,https://www.google.fr/,btv1b69277000,[]
9,,c6bd521083f4402a71e65c33baa00f3e,2016-02-27 18:01:06+01:00,GET /assets/static/stylesheets/fonts/pictos.css,http://gallica.bnf.fr/ark:/12148/btv1b8490545v,-,[]


## 3 : Enriching log data

We want to enrich the data by adding additional information about the requested document (if there was one) : Dewey class, type of document, visibility. Among other things, this will help create a diversity metric for the sessions later on. To examine the visibility of a document, we want to know : how many times was it seen over all sessions ? For that we create a new dataframe with unique arks associated with the count of the number of times they were seen in this dataframe. Creating a new dataframe and storing it means we can later concatenate them if there are many. We also assume that one person increments the visibility of an ark once. So even if there are ten logs with the same ark, if it is one IP address, we count it as 1 count.

In [118]:
# replace unknown arks by None
logs_df.loc[logs_df['Ark'] == '-', 'Ark'] = None
# keep rows with not None arks
filtered_arks = logs_df.dropna(subset=['Ark'])
# drop duplicates based on both ark and ip address
unique_ip_arks = filtered_arks.drop_duplicates(subset=['Ark', 'IPaddress'])

In [119]:
len(unique_ip_arks)

1614176

In [120]:
# group by ark and count occurrences
ark_counts = unique_ip_arks.groupby('Ark').size().reset_index(name='Count')

In [121]:
ark_counts.shape

(641529, 2)

In [122]:
# save unique arks with their count
ark_counts.to_csv("data_temp_month/unique_arks/unique_arks_counts0.csv", index=False)

In [109]:
# save the clean logs
logs_df.to_csv("data_temp_month/clean_logs0.csv", index=False)

We save the clean logs and the arks and their counts. The next step will be request them, in the next notebook. 