In [60]:
# import relevant libraries
import numpy as np
import pandas as pd
from collections import deque
from datetime import datetime
import os
from urllib.parse import urlparse, parse_qs, unquote
from tqdm import tqdm
import ast

In [61]:
tqdm.pandas()

## 1: Getting logs and visibility dictionnary

We will want to add the visibility metric to the sessions. For that we start by concatenating all the arks_counts dataframes and summing the counts by ark. We then turn it into a dictionnary for quick lookup later.

In [3]:
# get ark counts in one dataframe

arks_dir = 'data_temp_month/unique_arks'
arks_files = [f for f in os.listdir(arks_dir) if f.endswith('.csv')]
# init empty df
combined_arks_counts = pd.DataFrame()

# read and concatenate all unique_arks_counts files
for file in arks_files:
    file_path = os.path.join(arks_dir, file)
    df = pd.read_csv(file_path)
    combined_arks_counts = pd.concat([combined_arks_counts, df])

In [4]:
combined_arks_counts.shape

(10944349, 2)

In [5]:
# group by ark and sum the counts
final_arks_counts = combined_arks_counts.groupby('Ark', as_index=False).sum()

In [47]:
final_arks_counts[270:290]

NameError: name 'final_arks_counts' is not defined

In [7]:
# save the combined DataFrame to a new CSV file
final_arks_counts.to_csv('arks_temp_month/final_arks_counts.csv', index=False)

In [8]:
# create dictionnary for quick lookup later
ark_count_dict = final_arks_counts.set_index('Ark')['Count'].to_dict()

In [9]:
# function to get the visibility of a certain ark
def get_visibility(ark_list):
    return [ark_count_dict.get(ark, 0) for ark in ark_list]

In [24]:
# extracting logs to sessionize
logs_df = pd.read_csv("data_temp_month/clean_logs0.csv")

In [46]:
# if everything was already computed
arks_count_dict = pd.read_csv("arks_temp_month/final_arks_counts.csv")

## 2: Sessionization

Create and enrich user sessions. The goal will then be to classify these sessions and find the ones relating to a "Rabbit Hole".

In [25]:
# check all relevant columns are there
print(logs_df.columns)

Index(['0', 'IPaddress', 'Date', 'Request', 'Referrer', 'Ark', 'search_terms'], dtype='object')


From these logs, we want to create user sessions, so we begin by grouping the rows by IP address and aggregating the features.

In [26]:
# group by ip address 
sessions_df = logs_df.groupby('IPaddress').agg({'Ark':list, 'Date':list, 'search_terms':list, 'Referrer':list})

In [27]:
sessions_df

Unnamed: 0_level_0,Ark,Date,search_terms,Referrer
IPaddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000bc3556ab2bdab322bc1f8a50bb,"[bpt6k61150980, nan, nan, nan, nan, nan, nan, ...","[2016-01-31 23:47:53+01:00, 2016-01-31 23:47:5...","[[], [], [], [], [], [], [], [], [], [], [], [...",[http://genealogielibre.jimdo.com/sites-g%C3%A...
1003da47f36bd25814a5e9a122c2a06,[btv1b7200184d],[2016-02-01 03:31:43+01:00],[[]],[-]
10056df3c43ae8f0b5ac0391ad402c6f,[bpt6k5426257v],[2016-02-01 18:38:04+01:00],[[]],[http://lechatsurmonepaule.over-blog.fr/2014/1...
100754cd2af3e3088b8a599e3e8390f8,[bpt6k5451318c],[2016-01-31 21:15:45+01:00],[[]],[http://data.bnf.fr/12215353/jozef_maria_hoene...
1008188a244d4b5cf136c99254e76e92,"[btv1b7720797v, btv1b7744855n, btv1b77207988, ...","[2016-01-31 20:13:36+01:00, 2016-01-31 20:13:3...","[[], [], [], []]",[http://blog.bnf.fr/gallica/index.php/2010/03/...
...,...,...,...,...
fff57715ba27f512437b38d7f39786ca,"[bpt6k58588187, bpt6k58588187, bpt6k58588187, ...","[2016-01-31 23:29:45+01:00, 2016-01-31 23:30:1...","[[], [], [], [], [], [], [], [], [], [], [], []]",[http://gallica.bnf.fr/ark:/12148/bpt6k5858818...
fffa275839e09e5c9f0dc11d2359a413,"[btv1b6933134r, btv1b6923678c]","[2016-01-31 19:34:32+01:00, 2016-01-31 19:34:3...","[[], []]","[http://vangoghiamo.altervista.org/?m=201503, ..."
fffa9caf308707ae0e768f9bf6d9087f,"[bpt6k1652379, nan, nan, nan, nan, nan, nan, n...","[2016-02-01 17:42:51+01:00, 2016-02-01 17:42:5...","[[], [], [], [], [], [], [], [], [], [], [], [...","[https://www.google.es/, http://gallica.bnf.fr..."
fffd4f62954e7e5c4660f8c44917cd82,[bpt6k6533536h],[2016-01-31 23:46:22+01:00],[[]],[-]


We want to compute the time between two connections following one another.

In [28]:
# function to compute minutes between two dates
def minutes_between(d1, d2):
    # Parse the dates using the appropriate format
    d1 = datetime.strptime(d1, "%Y-%m-%d %H:%M:%S")
    d2 = datetime.strptime(d2, "%Y-%m-%d %H:%M:%S")
    
    # Calculate the difference in minutes
    return abs((d2 - d1).total_seconds() // 60)

In [29]:
time_beginning = "01/Jan/0001:01:01:01 +0100"
time_end = "01/Jan/3000:01:01:01 +0100"
sessions_df['date_1'] = sessions_df.apply(lambda x: [time_beginning]+x['Date'], axis = 1)
sessions_df['date_2'] = sessions_df.apply(lambda x: x['Date']+[time_end],axis=1)

In [30]:
# function to calculate the difference between two zipped lists
def calculate_difference_zipped_list(lst):
    new_lst = []
    for e in lst:
        if (e[0]==time_beginning):
            new_lst.append(999)
        elif (e[1]==time_end):
            new_lst.append(999)
        else:
            new_lst.append(minutes_between(e[0][:-6], e[1][:-6]))
    return new_lst
        
# 999 means this is the first connection    

In [31]:
# this contains the ip addresses and the zipped version of date_1, date_2
IP_and_sessions = sessions_df.apply(lambda x: deque(calculate_difference_zipped_list(list(zip(x['date_1'],x['date_2'])))),axis=1)

In [32]:
# IP and the difference in time between each connection and the last
IP_and_sessions

IPaddress
1000bc3556ab2bdab322bc1f8a50bb      [999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1003da47f36bd25814a5e9a122c2a06                                            [999, 999]
10056df3c43ae8f0b5ac0391ad402c6f                                           [999, 999]
100754cd2af3e3088b8a599e3e8390f8                                           [999, 999]
1008188a244d4b5cf136c99254e76e92                            [999, 0.0, 0.0, 0.0, 999]
                                                          ...                        
fff57715ba27f512437b38d7f39786ca    [999, 0.0, 0.0, 1.0, 3.0, 7.0, 4.0, 0.0, 0.0, ...
fffa275839e09e5c9f0dc11d2359a413                                      [999, 0.0, 999]
fffa9caf308707ae0e768f9bf6d9087f    [999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
fffd4f62954e7e5c4660f8c44917cd82                                           [999, 999]
fffee26dc6c2bc43358fd6c7c720116d    [999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Length: 34788, dtype: object

In [33]:
# create previous connexion date feature, to have the last known session date for each IP address
logs_df['previous_connexion_date'] = logs_df.apply(lambda x: IP_and_sessions[x['IPaddress']].popleft(),axis=1)

In [34]:
logs_df.head(4)

Unnamed: 0,0,IPaddress,Date,Request,Referrer,Ark,search_terms,previous_connexion_date
0,,de3a61378c09ea5f7f9816c0ccb13e34,2016-01-31 20:00:16+01:00,GET /ark:/12148/cb328602375/date1896.item.langFR,-,cb328602375,[],999.0
1,,fb61fa29fdfbf9226171dbf3d9e79be,2016-01-31 20:00:16+01:00,GET /ark:/12148/bpt6k6275141/f1.item.zoom,-,bpt6k6275141,[],999.0
2,,fb61fa29fdfbf9226171dbf3d9e79be,2016-01-31 20:00:17+01:00,GET /assets/static/stylesheets/main.css,http://gallica.bnf.fr/ark:/12148/bpt6k6275141/...,,[],0.0
3,,fb61fa29fdfbf9226171dbf3d9e79be,2016-01-31 20:00:17+01:00,GET /assets/static/stylesheets/vendor/bootstra...,http://gallica.bnf.fr/ark:/12148/bpt6k6275141/...,,[],0.0


We generate session IDs based on period, a new session ID is generated if the period between the last connexion and this one is > 60 minutes.
Trying different values than 60 yields little change. 

In [35]:
session_id = 0

def create_session(period, max_time):
    global session_id
    if(period > max_time):
        session_id += 1
    return session_id

In [36]:
# sort by ip address and date
logs_df = logs_df.sort_values(by=['IPaddress', 'Date'])
# create session ids, a new session is created if the previous connexion was more than 60 minutes ago
logs_df['session_id'] = logs_df.apply(lambda x: create_session(x['previous_connexion_date'], 60),axis=1)

In [37]:
logs_df

Unnamed: 0,0,IPaddress,Date,Request,Referrer,Ark,search_terms,previous_connexion_date,session_id
2578480,,1000bc3556ab2bdab322bc1f8a50bb,2016-01-31 23:47:53+01:00,GET /ark:/12148/bpt6k61150980,http://genealogielibre.jimdo.com/sites-g%C3%A9...,bpt6k61150980,[],999.0,1
2578481,,1000bc3556ab2bdab322bc1f8a50bb,2016-01-31 23:47:53+01:00,GET /assets/static/stylesheets/vendor/bootstra...,http://gallica.bnf.fr/ark:/12148/bpt6k61150980,,[],0.0,1
2578482,,1000bc3556ab2bdab322bc1f8a50bb,2016-01-31 23:47:53+01:00,GET /assets/static/stylesheets/main.css,http://gallica.bnf.fr/ark:/12148/bpt6k61150980,,[],0.0,1
2578483,,1000bc3556ab2bdab322bc1f8a50bb,2016-01-31 23:47:53+01:00,GET /assets/static/stylesheets/panes.css,http://gallica.bnf.fr/ark:/12148/bpt6k61150980,,[],0.0,1
2578484,,1000bc3556ab2bdab322bc1f8a50bb,2016-01-31 23:47:53+01:00,GET /assets/static/stylesheets/tablet/panes.css,http://gallica.bnf.fr/ark:/12148/bpt6k61150980,,[],0.0,1
...,...,...,...,...,...,...,...,...,...
8284346,,fffee26dc6c2bc43358fd6c7c720116d,2016-02-01 17:32:01+01:00,GET /assets/static/images/entete/pinterest.png,http://gallica.bnf.fr/ark:/12148/bpt6k96364b,,[],0.0,43691
8284348,,fffee26dc6c2bc43358fd6c7c720116d,2016-02-01 17:32:01+01:00,GET /assets/static/images/favicon_gallica.ico,http://gallica.bnf.fr/ark:/12148/bpt6k96364b,,[],0.0,43691
8284352,,fffee26dc6c2bc43358fd6c7c720116d,2016-02-01 17:32:02+01:00,GET /ark:/12148/bpt6k96364b/f6.highres,http://gallica.bnf.fr/ark:/12148/bpt6k96364b,bpt6k96364b,[],0.0,43691
8284353,,fffee26dc6c2bc43358fd6c7c720116d,2016-02-01 17:32:02+01:00,GET /ark:/12148/bpt6k96364b/f5.highres,http://gallica.bnf.fr/ark:/12148/bpt6k96364b,bpt6k96364b,[],0.0,43691


In [38]:
#create sessions by grouping by session ID and collecting all arks and their metadata
sessions = logs_df.groupby('session_id').agg({'Ark':list,'Date':list, 'Referrer':list, 'search_terms':list})

In [39]:
# removing subsequent ARKs and removing empty lists
def remove_consecutive_duplicates(l):
    return [v for i, v in enumerate(l) if (i == 0 or v != l[i-1]) and v!=[]]

    
sessions['Ark'] = sessions.apply(lambda x: remove_consecutive_duplicates(x['Ark']), axis = 1)

In [40]:
# keep only first and last dates
sessions['Date'] = sessions['Date'].apply(lambda x: [x[0],x[-1]])

In [41]:
# keep only first clean referrer
def get_first_referrer(referrers):
    if isinstance(referrers, list) and referrers:  # Check if referrers is a non-empty list
        first_referrer = referrers[0]
        if isinstance(first_referrer, str):  # Check if the first referrer is a string
            parsed_url = urlparse(first_referrer)
            return f"{parsed_url.scheme}://{parsed_url.netloc}"
    return None

In [42]:
# Apply the function to the 'Referrer' column to create 'first_referrer'
sessions['first_referrer'] = sessions['Referrer'].apply(get_first_referrer)

In [43]:
sessions

Unnamed: 0_level_0,Ark,Date,Referrer,search_terms,first_referrer
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"[bpt6k61150980, nan, nan, nan, nan, nan, nan, ...","[2016-01-31 23:47:53+01:00, 2016-01-31 23:53:4...",[http://genealogielibre.jimdo.com/sites-g%C3%A...,"[[], [], [], [], [], [], [], [], [], [], [], [...",http://genealogielibre.jimdo.com
2,[btv1b7200184d],"[2016-02-01 03:31:43+01:00, 2016-02-01 03:31:4...",[-],[[]],://
3,[bpt6k5426257v],"[2016-02-01 18:38:04+01:00, 2016-02-01 18:38:0...",[http://lechatsurmonepaule.over-blog.fr/2014/1...,[[]],http://lechatsurmonepaule.over-blog.fr
4,[bpt6k5451318c],"[2016-01-31 21:15:45+01:00, 2016-01-31 21:15:4...",[http://data.bnf.fr/12215353/jozef_maria_hoene...,[[]],http://data.bnf.fr
5,"[btv1b7720797v, btv1b7744855n, btv1b77207988, ...","[2016-01-31 20:13:36+01:00, 2016-01-31 20:13:3...",[http://blog.bnf.fr/gallica/index.php/2010/03/...,"[[], [], [], []]",http://blog.bnf.fr
...,...,...,...,...,...
43687,[bpt6k58588187],"[2016-01-31 23:29:45+01:00, 2016-01-31 23:56:2...",[http://gallica.bnf.fr/ark:/12148/bpt6k5858818...,"[[], [], [], [], [], [], [], [], [], [], [], []]",http://gallica.bnf.fr
43688,"[btv1b6933134r, btv1b6923678c]","[2016-01-31 19:34:32+01:00, 2016-01-31 19:34:3...","[http://vangoghiamo.altervista.org/?m=201503, ...","[[], []]",http://vangoghiamo.altervista.org
43689,"[bpt6k1652379, nan, nan, nan, nan, nan, nan, n...","[2016-02-01 17:42:51+01:00, 2016-02-01 18:10:1...","[https://www.google.es/, http://gallica.bnf.fr...","[[], [], [], [], [], [], [], [], [], [], [], [...",https://www.google.es
43690,[bpt6k6533536h],"[2016-01-31 23:46:22+01:00, 2016-01-31 23:46:2...",[-],[[]],://


In [30]:
# function to find the length of a session
def length_session(d1, d2):
    # Remove the colon from the timezone offset
    d1 = d1[:-3] + d1[-2:]
    d2 = d2[:-3] + d2[-2:]
    
    # Parse the dates using the appropriate format
    d1 = datetime.strptime(d1, "%Y-%m-%d %H:%M:%S%z")
    d2 = datetime.strptime(d2, "%Y-%m-%d %H:%M:%S%z")
    
    # Calculate the difference in minutes
    return abs((d2 - d1).total_seconds() // 60)

In [31]:
# add sessions length in minutes feature
sessions['length_minutes'] = sessions['Date'].apply(lambda x: length_session(x[0], x[-1]))

In [32]:
# add a list of visibilities for each ark
sessions['visibility'] = sessions['Ark'].apply(get_visibility)

We create many visibility features : the min and mean visibility, the min/mean of the first 3 and last 3 arks, and the variation between these. The purpose of this is to see if the visbility decreases at the end of a session or increases.

In [33]:
sessions = sessions.assign(
    min_visibility=lambda x: x['visibility'].apply(lambda vis: min(filter(None, vis), default=0)),
    mean_visibility=lambda x: x['visibility'].apply(lambda vis: sum(vis) / len([v for v in vis if v != 0]) if any(vis) else 0),
    min_first_3=lambda x: x['visibility'].apply(lambda vis: min(filter(None, vis[:3]), default=0)),
    mean_first_3=lambda x: x['visibility'].apply(lambda vis: sum(vis[:3]) / len([v for v in vis[:3] if v != 0]) if any(vis[:3]) else 0),
    min_last_3=lambda x: x['visibility'].apply(lambda vis: min(filter(None, vis[-3:]), default=0)),
    mean_last_3=lambda x: x['visibility'].apply(lambda vis: sum(vis[-3:]) / len([v for v in vis[-3:] if v != 0]) if any(vis[-3:]) else 0),
    variation_min_vis=lambda x: x['min_last_3'] - x['min_first_3'],
    variation_mean_vis=lambda x: x['mean_last_3'] - x['mean_first_3']
)

In [34]:
sessions

Unnamed: 0_level_0,Ark,Date,Referrer,search_terms,first_referrer,length_minutes,visibility,min_visibility,mean_visibility,min_first_3,mean_first_3,min_last_3,mean_last_3,variation_min_vis,variation_mean_vis
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,"[nan, nan]","[2016-02-27 20:49:48+01:00, 2016-02-27 20:59:4...","[-, -]","[[], []]",://,10.0,"[0, 0]",0,0.00000,0,0.0,0,0.000000,0,0.000000
2,"[bpt6k5531462t, nan, nan, nan, nan, nan, nan, ...","[2016-02-29 05:02:52+01:00, 2016-02-29 05:03:2...","[https://www.google.es/, http://gallica.bnf.fr...","[[], [], [], [], [], [], [], [], [], [], [], [...",https://www.google.es,0.0,"[296, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",296,296.00000,296,296.0,296,296.000000,0,0.000000
3,"[btv1b86108062, nan, nan, nan, nan, nan, nan, ...","[2016-02-27 20:16:15+01:00, 2016-02-27 20:16:2...",[http://images.google.fr/imgres?imgurl=http%3A...,"[[], [], [], [], [], [], [], [], [], [], [], [...",http://images.google.fr,0.0,"[516, 0, 0, 0, 0, 0, 0, 516, 0, 0, 0, 0, 0, 0,...",516,516.00000,516,516.0,516,516.000000,0,0.000000
4,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[2016-02-27 18:35:21+01:00, 2016-02-27 18:49:3...",[http://www.google.fr/url?sa=t&rct=j&q=&esrc=s...,"[[], [], [], [], [], [], [], [], [], [], [], [...",http://www.google.fr,14.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,1697.61658,0,0.0,15,713.666667,15,713.666667
5,"[nan, nan, nan, nan, nan, nan, nan, nan]","[2016-02-28 03:53:19+01:00, 2016-02-28 03:53:2...","[-, -, -, -, -, -, -, -]","[['Auriol, George', 'Auriol, George', 'papiers...",://,0.0,"[0, 0, 0, 0, 0, 0, 0, 0]",0,0.00000,0,0.0,0,0.000000,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70371,"[bpt6k824366, nan, nan, nan, nan, nan, nan, na...","[2016-02-28 04:48:58+01:00, 2016-02-28 04:49:1...","[http://www.lexilogos.com/afrique_langues.htm,...","[[], [], [], [], [], [], [], [], [], [], [], [...",http://www.lexilogos.com,0.0,"[248, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",4,609.06250,248,248.0,95,498.500000,-153,250.500000
70372,"[btv1b55008737g, nan, nan, nan, nan, nan, nan,...","[2016-02-28 19:35:50+01:00, 2016-02-28 19:36:3...",[http://gallica.bnf.fr/html/und/cartes/les-glo...,"[[], [], [], [], [], [], [], [], [], [], [], [...",http://gallica.bnf.fr,0.0,"[3662, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3662,3662.00000,3662,3662.0,0,0.000000,-3662,-3662.000000
70373,"[bpt6k6528902c, nan, nan, nan, nan, nan, nan, ...","[2016-02-28 16:45:55+01:00, 2016-02-28 16:51:0...",[http://www.geneanet.org/archives/ouvrages/?ac...,"[[], [], [], [], [], [], [], [], [], [], [], [...",http://www.geneanet.org,5.0,"[35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",35,35.00000,35,35.0,35,35.000000,0,0.000000
70374,[nan],"[2016-02-29 03:04:07+01:00, 2016-02-29 03:04:0...",[-],[[]],://,0.0,[0],0,0.00000,0,0.0,0,0.000000,0,0.000000


In [35]:
# save temporary sessions
sessions.to_csv("data_temp_month/sessions/sessions0.csv")

## 3: Adding features to sessions to later help characterize rabbit holes
To help characterize rabbit holes we add features such as theme, type. We begin by concatenanting all the previously obtained sessions into one.

In [39]:
# concatenating the sessions
sessions_dir = 'data_temp_month/sessions'
sessions_files = [f for f in os.listdir(sessions_dir) if f.endswith('.csv')]
# init empty df
combined_sessions = pd.DataFrame()

# read and concatenate all sessions files
for file in tqdm(sessions_files, desc="Combining session files"):
    file_path = os.path.join(sessions_dir, file)
    df = pd.read_csv(file_path)
    combined_sessions = pd.concat([combined_sessions, df])

Combining session files: 100%|██████████████████| 22/22 [03:48<00:00, 10.37s/it]


In [40]:
combined_sessions.shape

(1417035, 16)

In [72]:
# function to make sure the arks is a list of strings
def convert_ark_string(ark_string):
    try:
        # Replace nan (unquoted) with 'nan' (quoted)
        ark_string = ark_string.replace('nan', '"nan"')
        # Safely evaluate the string to a Python list
        ark_list = ast.literal_eval(ark_string)
        # Replace 'nan' strings with np.nan
        return [np.nan if item == 'nan' else item for item in ark_list]
    except (ValueError, SyntaxError) as e:
        print(f"Error: {e}")
        # Return an empty list if there's an issue with conversion
        return ['error_parsing']

In [93]:
# apply the previous function
combined_sessions['Ark_list'] = combined_sessions['Ark'].apply(convert_ark_string)

In [94]:
combined_sessions.head(2)

Unnamed: 0,session_id,Ark,Date,Referrer,search_terms,first_referrer,length_minutes,visibility,min_visibility,mean_visibility,min_first_3,mean_first_3,min_last_3,mean_last_3,variation_min_vis,variation_mean_vis,Ark_list
0,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 21:40:58+01:00', '2016-02-14 21:5...","['-', 'http://gallica.bnf.fr/assets/static/sty...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,18.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,1640.487421,0,0.0,453,8244.333333,453,8244.333333,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,2,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 20:24:19+01:00', '2016-02-14 20:2...",['http://bibliotheque.clermont-universite.fr/r...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://bibliotheque.clermont-universite.fr,5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20,3358.471698,0,0.0,222,334.5,222,334.5,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


We keep only the sessions where at least one document was consulted.

In [95]:
# remove sessions with no ark
def invalid_arks(ark_list):
    all_nan = all(pd.isna(item) for item in ark_list)
    return all_nan

In [96]:
filtered_sessions = combined_sessions[~combined_sessions['Ark_list'].apply(invalid_arks)]

In [97]:
filtered_sessions.shape

(1181190, 17)

In [98]:
filtered_sessions.head(2)

Unnamed: 0,session_id,Ark,Date,Referrer,search_terms,first_referrer,length_minutes,visibility,min_visibility,mean_visibility,min_first_3,mean_first_3,min_last_3,mean_last_3,variation_min_vis,variation_mean_vis,Ark_list
0,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 21:40:58+01:00', '2016-02-14 21:5...","['-', 'http://gallica.bnf.fr/assets/static/sty...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,18.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,1640.487421,0,0.0,453,8244.333333,453,8244.333333,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,2,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 20:24:19+01:00', '2016-02-14 20:2...",['http://bibliotheque.clermont-universite.fr/r...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://bibliotheque.clermont-universite.fr,5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20,3358.471698,0,0.0,222,334.5,222,334.5,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


In [99]:
percentage_full = filtered_sessions.shape[0] / combined_sessions.shape[0] * 100
print(percentage_full, "% of sessions have arks.")

83.35644497136627 % of sessions have arks.


In [100]:
filtered_sessions.to_csv("data_month/sessions_with_arks.csv")

In [49]:
# if everything was already computed
filtered_sessions = pd.read_csv("data_month/sessions_with_arks.csv", index_col=0)

In [50]:
filtered_sessions

Unnamed: 0,session_id,Ark,Date,Referrer,search_terms,first_referrer,length_minutes,visibility,min_visibility,mean_visibility,min_first_3,mean_first_3,min_last_3,mean_last_3,variation_min_vis,variation_mean_vis,Ark_list
0,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 21:40:58+01:00', '2016-02-14 21:5...","['-', 'http://gallica.bnf.fr/assets/static/sty...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,18.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,1640.487421,0,0.000000,453,8244.333333,453,8244.333333,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,2,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 20:24:19+01:00', '2016-02-14 20:2...",['http://bibliotheque.clermont-universite.fr/r...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://bibliotheque.clermont-universite.fr,5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20,3358.471698,0,0.000000,222,334.500000,222,334.500000,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2,3,"['btv1b6951272q', 'btv1b6951275z', 'btv1b69512...","['2016-02-15 22:00:43+01:00', '2016-02-15 23:2...",['http://data.bnf.fr/documents-by-rdt/11900585...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://data.bnf.fr,83.0,"[54, 71, 84, 138, 59, 137, 54, 24, 54, 24, 11,...",2,159.437621,54,69.666667,9,17.333333,-45,-52.333333,"['btv1b6951272q', 'btv1b6951275z', 'btv1b69512..."
3,4,['bpt6k361547'],"['2016-02-15 21:29:00+01:00', '2016-02-15 21:2...",['https://www.google.dz/'],['[]'],https://www.google.dz,0.0,[95],95,95.000000,95,95.000000,95,95.000000,0,0.000000,['bpt6k361547']
4,5,"['btv1b7741316w', 'btv1b7741310d', 'btv1b77413...","['2016-02-14 22:29:54+01:00', '2016-02-14 22:5...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,23.0,"[27, 53, 27, 53, 27, 53]",27,40.000000,27,35.666667,27,44.333333,0,8.666667,"['btv1b7741316w', 'btv1b7741310d', 'btv1b77413..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70958,70959,"['bpt6k5471361x', 'btv1b8432934d', 'btv1b84539...","['2016-02-11 18:53:43+01:00', '2016-02-11 19:1...",['http://catalogue.bnf.fr/rechercher.do?motRec...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://catalogue.bnf.fr,20.0,"[11, 1290, 712, 9209, 712, 6, 7, 11, 1290, 712...",3,1723.000000,11,671.000000,7,21.333333,-4,-649.666667,"['bpt6k5471361x', 'btv1b8432934d', 'btv1b84539..."
70959,70960,"['bpt6k278426', 'bpt6k27843j', 'bpt6k278457', ...","['2016-02-11 19:41:48+01:00', '2016-02-11 20:1...",['http://catalogue.bnf.fr/ark:/12148/cb3030161...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://catalogue.bnf.fr,36.0,"[23, 36, 18, 17, 28, 41, 78, 69, 17, 23, 0, 0,...",3,1375.890000,18,25.666667,23,50.500000,5,24.833333,"['bpt6k278426', 'bpt6k27843j', 'bpt6k278457', ..."
70960,70961,['bpt6k278426'],"['2016-02-11 21:02:25+01:00', '2016-02-11 21:0...",['http://catalogue.bnf.fr/changerPage.do?motRe...,"['[]', '[]', '[]']",http://catalogue.bnf.fr,1.0,[23],23,23.000000,23,23.000000,23,23.000000,0,0.000000,['bpt6k278426']
70961,70962,['bpt6k297194d'],"['2016-02-11 21:02:54+01:00', '2016-02-11 21:0...",['http://gallica.bnf.fr/ark:/12148/bpt6k297194...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://gallica.bnf.fr,2.0,[26],26,26.000000,26,26.000000,26,26.000000,0,0.000000,['bpt6k297194d']


In [51]:
# get arks that were requested
arks_computed = pd.read_csv("arks_final_month/arks_theme_typev2.csv", index_col=0)

In [52]:
arks_computed

Unnamed: 0_level_0,Theme,Type
Ark,Unnamed: 1_level_1,Unnamed: 2_level_1
btv1b8459670z,,carte
btv1b59624343,,carte
btv1b53036648q,,carte
btv1b530395328,,carte
btv1b5962468w,,carte
...,...,...
'bpt6k5745834w',33.0,fascicule
'bpt6k5822225m',33.0,fascicule
'btv1b90694071',,partition
'bpt6k502837q',7.0,fascicule


In [53]:
# create a dictionnary for quick lookup later
arks_grouped = arks_computed.groupby('Ark').agg({'Theme':'first', 'Type':'first'}).reset_index()
arks_dict = arks_grouped.set_index('Ark').to_dict(orient='index')

In [54]:
arks_dict

{"'": {'Theme': 49.0, 'Type': 'monographie'},
 "'&lt;span%20class=&quot;highlight&quot;&gt;bpt6k1013503&lt;'": {'Theme': 49.0,
  'Type': 'monographie'},
 "'&lt;span%20class=&quot;highlight&quot;&gt;bpt6k127569m&lt;'": {'Theme': 49.0,
  'Type': 'monographie'},
 "'&lt;span%20class=&quot;highlight&quot;&gt;bpt6k2051596&lt;'": {'Theme': 49.0,
  'Type': 'monographie'},
 "'&lt;span%20class=&quot;highlight&quot;&gt;btv1b2200191h&lt;'": {'Theme': 49.0,
  'Type': 'monographie'},
 "'&lt;span%20class=&quot;highlight&quot;&gt;btv1b7200095v&lt;'": {'Theme': 49.0,
  'Type': 'monographie'},
 "'&lt;span%20class=&quot;highlight&quot;&gt;btv1b9005203s&lt;'": {'Theme': 49.0,
  'Type': 'monographie'},
 "'(null)'": {'Theme': 87.0, 'Type': 'monographie'},
 "'---'": {'Theme': 49.0, 'Type': 'monographie'},
 "'0'": {'Theme': nan, 'Type': 'image'},
 "'06'": {'Theme': nan, 'Type': 'image'},
 "'11'": {'Theme': nan, 'Type': 'monographie'},
 "'29'": {'Theme': nan, 'Type': 'periodique'},
 "':5054'": {'Theme': nan, '

In [57]:
# create a df to save it
arks_df = pd.DataFrame.from_dict(arks_dict, orient='index').reset_index()
arks_df.columns = ['Ark', 'Theme', 'Type']

# save arks_df to a CSV file
arks_df.to_csv('arks_final_month/arks_dict.csv', index=False)

In [67]:
# function to create a list of themes and a list of types from a list of arks
def map_themes_and_types(ark_list, arks_dict):
    themes = []
    types = []
    for ark in ark_list:
        if pd.isna(ark) or ark == 'nan':
            themes.append('no_ark')
            types.append('no_ark')
        else:
            theme_type = arks_dict.get(ark, {'Theme': 'no_data', 'Type': 'no_data'})
            theme = theme_type['Theme'] if pd.notna(theme_type['Theme']) else 'no_dewey_class'
            themes.append(theme)
            types.append(theme_type['Type'])
    return themes, types

In [73]:
filtered_sessions['Ark_list'] = filtered_sessions['Ark'].apply(convert_ark_string)

In [76]:
# apply the function to create a Series of tuples
themes_types_series = filtered_sessions['Ark_list'].progress_apply(
    lambda ark_list: map_themes_and_types(ark_list, arks_dict)
)

# create a DataFrame from the Series
themes_types_df = pd.DataFrame(themes_types_series.tolist(), columns=['themes', 'types'])

# assign the DataFrame columns to the original DataFrame
filtered_sessions.loc[:, ['themes', 'types']] = themes_types_df

100%|███████████████████████████████| 1181190/1181190 [03:31<00:00, 5585.09it/s]


In [77]:
filtered_sessions.head(10)

Unnamed: 0,session_id,Ark,Date,Referrer,search_terms,first_referrer,length_minutes,visibility,min_visibility,mean_visibility,min_first_3,mean_first_3,min_last_3,mean_last_3,variation_min_vis,variation_mean_vis,Ark_list,themes,types
0,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 21:40:58+01:00', '2016-02-14 21:5...","['-', 'http://gallica.bnf.fr/assets/static/sty...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,18.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,1640.487421,0,0.0,453,8244.333333,453,8244.333333,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar...","[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar..."
1,2,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 20:24:19+01:00', '2016-02-14 20:2...",['http://bibliotheque.clermont-universite.fr/r...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://bibliotheque.clermont-universite.fr,5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20,3358.471698,0,0.0,222,334.5,222,334.5,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar...","[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar..."
2,3,"['btv1b6951272q', 'btv1b6951275z', 'btv1b69512...","['2016-02-15 22:00:43+01:00', '2016-02-15 23:2...",['http://data.bnf.fr/documents-by-rdt/11900585...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://data.bnf.fr,83.0,"[54, 71, 84, 138, 59, 137, 54, 24, 54, 24, 11,...",2,159.437621,54,69.666667,9,17.333333,-45,-52.333333,"[btv1b6951272q, btv1b6951275z, btv1b69512645, ...","[no_dewey_class, no_dewey_class, no_dewey_clas...","[image, image, image, image, monographie, imag..."
3,4,['bpt6k361547'],"['2016-02-15 21:29:00+01:00', '2016-02-15 21:2...",['https://www.google.dz/'],['[]'],https://www.google.dz,0.0,[95],95,95.0,95,95.0,95,95.0,0,0.0,[bpt6k361547],[944.0],[monographie]
4,5,"['btv1b7741316w', 'btv1b7741310d', 'btv1b77413...","['2016-02-14 22:29:54+01:00', '2016-02-14 22:5...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,23.0,"[27, 53, 27, 53, 27, 53]",27,40.0,27,35.666667,27,44.333333,0,8.666667,"[btv1b7741316w, btv1b7741310d, btv1b7741316w, ...","[no_dewey_class, no_dewey_class, no_dewey_clas...","[image, image, image, image, image, image]"
5,6,"['bpt6k3947158', nan, nan, nan, nan, nan, nan,...","['2016-02-14 23:15:45+01:00', '2016-02-14 23:1...",['http://data.bnf.fr/14754920/marcelle_chadal/...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://data.bnf.fr,4.0,"[21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",21,21.0,21,21.0,21,21.0,0,0.0,"[bpt6k3947158, nan, nan, nan, nan, nan, nan, n...","[no_dewey_class, no_ark, no_ark, no_ark, no_ar...","[partition, no_ark, no_ark, no_ark, no_ark, no..."
6,7,"[nan, nan, nan, nan, nan, nan, 'bpt6k411269d',...","['2016-02-14 20:57:01+01:00', '2016-02-14 20:5...",['http://gallica.bnf.fr/services/engine/search...,"[""['MMlle. Arriaza']"", ""['M.Mlle. Arriaza']"", ...",http://gallica.bnf.fr,1.0,"[0, 0, 0, 0, 0, 0, 187, 3, 17, 50, 187, 0, 0, ...",1,151.416667,0,0.0,71,180.5,71,180.5,"[nan, nan, nan, nan, nan, nan, bpt6k411269d, b...","[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar...","[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar..."
7,8,['btv1b9024887s'],"['2016-02-15 00:24:50+01:00', '2016-02-15 00:2...",['-'],['[]'],://,0.0,[19],19,19.0,19,19.0,19,19.0,0,0.0,[btv1b9024887s],[no_data],[no_data]
10,11,['btv1b7740417b'],"['2016-02-14 22:59:32+01:00', '2016-02-14 22:5...",['http://www.skyscrapercity.com/showthread.php...,['[]'],http://www.skyscrapercity.com,0.0,[23],23,23.0,23,23.0,23,23.0,0,0.0,[btv1b7740417b],"[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar...","[no_ark, no_ark, no_ark, no_ark, no_ark, no_ar..."
11,12,['btv1b2300096g'],"['2016-02-15 23:08:20+01:00', '2016-02-15 23:0...",['-'],['[]'],://,0.0,[883],883,883.0,883,883.0,883,883.0,0,0.0,[btv1b2300096g],[no_dewey_class],[objet]


In [78]:
# create additional features
filtered_sessions = filtered_sessions.assign(
    very_long=lambda x: x['length_minutes'] >= 120,
    nb_docs = lambda x: x['Ark_list'].apply(lambda y: len(set(i for i in y if i not in (np.nan, 'nan')))),
    over_10_docs=lambda x: x['nb_docs'] >= 10
)

In [79]:
top_10_threshold = filtered_sessions['length_minutes'].quantile(0.9)
top_5_threshold = filtered_sessions['length_minutes'].quantile(0.95)
# create a new column indicating whether each session length is in the top 10%
filtered_sessions.loc[:, 'top_10%_length'] = filtered_sessions['length_minutes'] >= top_10_threshold
# same for top 5%
filtered_sessions.loc[:, 'top_5%_length'] = filtered_sessions['length_minutes'] >= top_5_threshold

In [80]:
# function to count the non nan and valid themes and types
def count_unique_valid_entries(lst):
    # Convert the list to a Series, drop NaN values, and then get the unique values
    unique_values = pd.Series(lst).dropna().unique()
    # Exclude 'no_ark' and 'no_data' from the unique values
    unique_values = [value for value in unique_values if value not in ['no_ark', 'no_data']]
    # Return the length of the filtered unique values
    return len(unique_values)


In [81]:
# apply the function to create 2 new features
with tqdm(total=len(filtered_sessions)) as pbar:
    filtered_sessions['nb_types'] = filtered_sessions['types'].progress_apply(lambda x: count_unique_valid_entries(x))
    pbar.update()
    filtered_sessions['nb_themes'] = filtered_sessions['themes'].progress_apply(lambda x: count_unique_valid_entries(x))
    pbar.update()

  0%|                                               | 0/1181190 [00:00<?, ?it/s]
  0%|                                               | 0/1181190 [00:00<?, ?it/s][A
  0%|                                   | 229/1181190 [00:00<08:36, 2288.61it/s][A
  0%|                                   | 537/1181190 [00:00<07:09, 2750.80it/s][A
  0%|                                   | 843/1181190 [00:00<06:48, 2887.91it/s][A
  0%|                                  | 1150/1181190 [00:00<06:38, 2957.60it/s][A
  0%|                                  | 1484/1181190 [00:00<06:21, 3093.41it/s][A
  0%|                                  | 1857/1181190 [00:00<05:56, 3307.40it/s][A
  0%|                                  | 2291/1181190 [00:00<05:23, 3643.14it/s][A
  0%|                                  | 2725/1181190 [00:00<05:04, 3863.88it/s][A
  0%|                                  | 3160/1181190 [00:00<04:53, 4013.66it/s][A
  0%|                                  | 3595/1181190 [00:01<04:46, 4116.48it/s

In [82]:
# add diversity features
filtered_sessions = filtered_sessions.assign(
    diversified=((filtered_sessions['nb_themes'] >= 2) | (filtered_sessions['nb_types'] >= 2)),
    diversified_restrictive=((filtered_sessions['nb_themes'] >= 2) & (filtered_sessions['nb_types'] >= 2)),
    diversified_restrictive_5=((filtered_sessions['nb_themes'] >= 5) & (filtered_sessions['nb_types'] >= 5)),
    diversified_5=((filtered_sessions['nb_themes'] >= 5) | (filtered_sessions['nb_types'] >= 5))
)

In [83]:
filtered_sessions

Unnamed: 0,session_id,Ark,Date,Referrer,search_terms,first_referrer,length_minutes,visibility,min_visibility,mean_visibility,...,nb_docs,over_10_docs,top_10%_length,top_5%_length,nb_types,nb_themes,diversified,diversified_restrictive,diversified_restrictive_5,diversified_5
0,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 21:40:58+01:00', '2016-02-14 21:5...","['-', 'http://gallica.bnf.fr/assets/static/sty...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,18.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,1640.487421,...,144,True,False,False,3,38,True,True,False,True
1,2,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","['2016-02-14 20:24:19+01:00', '2016-02-14 20:2...",['http://bibliotheque.clermont-universite.fr/r...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://bibliotheque.clermont-universite.fr,5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20,3358.471698,...,45,True,False,False,3,15,True,True,False,True
2,3,"['btv1b6951272q', 'btv1b6951275z', 'btv1b69512...","['2016-02-15 22:00:43+01:00', '2016-02-15 23:2...",['http://data.bnf.fr/documents-by-rdt/11900585...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://data.bnf.fr,83.0,"[54, 71, 84, 138, 59, 137, 54, 24, 54, 24, 11,...",2,159.437621,...,1059,True,True,True,5,19,True,True,True,True
3,4,['bpt6k361547'],"['2016-02-15 21:29:00+01:00', '2016-02-15 21:2...",['https://www.google.dz/'],['[]'],https://www.google.dz,0.0,[95],95,95.000000,...,1,False,False,False,1,1,False,False,False,False
4,5,"['btv1b7741316w', 'btv1b7741310d', 'btv1b77413...","['2016-02-14 22:29:54+01:00', '2016-02-14 22:5...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",://,23.0,"[27, 53, 27, 53, 27, 53]",27,40.000000,...,2,False,False,False,1,1,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70958,70959,"['bpt6k5471361x', 'btv1b8432934d', 'btv1b84539...","['2016-02-11 18:53:43+01:00', '2016-02-11 19:1...",['http://catalogue.bnf.fr/rechercher.do?motRec...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://catalogue.bnf.fr,20.0,"[11, 1290, 712, 9209, 712, 6, 7, 11, 1290, 712...",3,1723.000000,...,12,True,False,False,1,1,False,False,False,False
70959,70960,"['bpt6k278426', 'bpt6k27843j', 'bpt6k278457', ...","['2016-02-11 19:41:48+01:00', '2016-02-11 20:1...",['http://catalogue.bnf.fr/ark:/12148/cb3030161...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://catalogue.bnf.fr,36.0,"[23, 36, 18, 17, 28, 41, 78, 69, 17, 23, 0, 0,...",3,1375.890000,...,97,True,True,False,2,4,True,True,False,False
70960,70961,['bpt6k278426'],"['2016-02-11 21:02:25+01:00', '2016-02-11 21:0...",['http://catalogue.bnf.fr/changerPage.do?motRe...,"['[]', '[]', '[]']",http://catalogue.bnf.fr,1.0,[23],23,23.000000,...,1,False,False,False,1,1,False,False,False,False
70961,70962,['bpt6k297194d'],"['2016-02-11 21:02:54+01:00', '2016-02-11 21:0...",['http://gallica.bnf.fr/ark:/12148/bpt6k297194...,"['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]...",http://gallica.bnf.fr,2.0,[26],26,26.000000,...,1,False,False,False,1,1,False,False,False,False


In [84]:
# save enriched sessions
filtered_sessions.to_csv("data_month/enriched_sessions.csv")

## 4 : Finding missing arks
Some arks may still be missing. We find out which these are and save them to re-request later.

In [85]:
# keep only the rows where theme and type are both not nan
arks_dict_cleaned = {ark: info for ark, info in arks_dict.items() if pd.notna(info['Theme']) and pd.notna(info['Type'])}

In [86]:
len(arks_dict_cleaned)

1103387

In [87]:
# find the arks not in the dict but in the sessions
filtered_arks = filtered_sessions['Ark_list'].explode().dropna().loc[lambda x: x != 'nan']

# split the ARK lists and flatten them
ark_lists = filtered_arks.str.replace(r"\bnan\b", "", regex=True).str.split(",\s*")
flat_arks = [ark.strip() for sublist in ark_lists for ark in sublist]

# get unique ARKs
unique_arks = set(flat_arks)

# find ARKs not in arks_dict_clean
arks_not_in_dict = unique_arks.difference(arks_dict_cleaned)


In [88]:
len(arks_not_in_dict)

1543933

In [17]:
arks_without_brackets = [ark.strip("[]") for ark in arks_not_in_dict]

In [20]:
arks_without_brackets_df = pd.DataFrame(arks_without_brackets, columns=['Ark'])

In [21]:
# save them to re-request them
arks_without_brackets_df.to_csv("data_temp_month/arks_to_request_last.csv")

## 5 : Finding Rabbit Holes

To find sessions that qualify as rabbit holes, we filter the sessions. First we keep only the end of the long tail of session length, so the ones with top 10% time. Then from these the ones where more than 10 documents were visited. And lastly the ones that are diverse, where the documents visited range over 2 different types or 2 different Dewey classes. We also try with different diversity metrics to see how that influences the percentage of rabbit hole sessions.

In [89]:
# function to filter sessions according to different metrics to keep sessions that could qualify as rabbit holes

def filter_sessions(sessions, top_10_length_col, diversified_col, docs_col, diversity_explanation):
    # filter for top 10% length sessions
    sessions_top10_time = sessions[sessions[top_10_length_col] == True].copy()
    print(f"We keep {len(sessions_top10_time) / len(sessions) * 100:.2f}% of the sessions with the highest time")

    # from the top 10% length, keep only the diversified ones
    sessions_top10_time_diversified = sessions_top10_time[sessions_top10_time[diversified_col] == True].copy()
    print(f"We keep {len(sessions_top10_time_diversified) / len(sessions_top10_time) * 100:.2f}% of the long sessions, which are diversified")
    print(diversity_explanation)

    # from the diversified ones, keep only the ones with more than 10 documents
    sessions_rh_final = sessions_top10_time_diversified[sessions_top10_time_diversified[docs_col] == True].copy()
    print(f"We keep {len(sessions_rh_final) / len(sessions_top10_time_diversified) * 100:.2f}% of the diversified sessions, which have more than 10 documents")

    # calculate the percentage of all sessions that qualify as rabbit holes
    print(f"The sessions that could qualify as rabbit holes constitute {len(sessions_rh_final) / len(sessions) * 100:.2f}% of all the sessions")

    return sessions_rh_final

In [90]:
plain_diversity_explanation = "diversfied means 2 types of documents or more or 2 dewey classes or more"
session_rh_final = filter_sessions(filtered_sessions, 'top_10%_length', 'diversified', 'over_10_docs', plain_diversity_explanation)             

We keep 10.01% of the sessions with the highest time
We keep 28.78% of the long sessions, which are diversified
diversfied means 2 types of documents or more or 2 dewey classes or more
We keep 55.08% of the diversified sessions, which have more than 10 documents
The sessions that could qualify as rabbit holes constitute 1.59% of all the sessions


#### Tests to see if 2.89% is a reasonable percentage for the rabbit holes sessions

With a more restrictive diversity metric : 2 types AND 2 Dewey classes instead of 'or'

In [91]:
restrictive_diversity_explanation = "diversfied means 2 types of documents or more AND 2 dewey classes or more"
session_rh_test = filter_sessions(filtered_sessions, 'top_10%_length', 'diversified_restrictive', 'over_10_docs', restrictive_diversity_explanation)             

We keep 10.01% of the sessions with the highest time
We keep 21.10% of the long sessions, which are diversified
diversfied means 2 types of documents or more AND 2 dewey classes or more
We keep 54.93% of the diversified sessions, which have more than 10 documents
The sessions that could qualify as rabbit holes constitute 1.16% of all the sessions


Then even more restrictive : 5 types and 5 Dewey classes

In [92]:
restrictive5_diversity_explanation = "diversfied means 5 types of documents or more AND 5 dewey classes or more"
session_rh_test = filter_sessions(filtered_sessions, 'top_10%_length', 'diversified_restrictive_5', 'over_10_docs', restrictive5_diversity_explanation)             

We keep 10.01% of the sessions with the highest time
We keep 1.93% of the long sessions, which are diversified
diversfied means 5 types of documents or more AND 5 dewey classes or more
We keep 54.96% of the diversified sessions, which have more than 10 documents
The sessions that could qualify as rabbit holes constitute 0.11% of all the sessions


Final test with 5 types or 5 Dewey classes. We see that we get back to about 5 percent of the sessions.

In [93]:
diversity_5_explanation = "diversfied means 5 types of documents or more or 5 dewey classes or more"
session_rh_test = filter_sessions(filtered_sessions, 'top_10%_length', 'diversified_5', 'over_10_docs', diversity_5_explanation)             

We keep 10.01% of the sessions with the highest time
We keep 18.48% of the long sessions, which are diversified
diversfied means 5 types of documents or more or 5 dewey classes or more
We keep 55.01% of the diversified sessions, which have more than 10 documents
The sessions that could qualify as rabbit holes constitute 1.02% of all the sessions


We conclude that rabbit holes representing about 2-3% of the sessions is a reasonable number. We save these sessions for later use.

In [95]:
session_rh_final.to_csv("data_month/rh_sessions.csv")

The next step will be to compute statistics on both the normal and the rabbit holes sessions.