In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import os
import glob
import re

In [33]:
folder_path = r'../../../lhstdata1/students/Gallica_logs/1LogGallicaFevrier2016'
all_files = glob.glob(folder_path + "/*.log.gz")
print(len(all_files))

2201


In [34]:
all_files.sort(key=lambda f: int(re.sub('\D', '', f)))

In [35]:
some_files = all_files[300:1000]

In [36]:
len(some_files)

700

In [37]:
df = pd.read_csv(all_files[0], sep='\t', header=None)


In [38]:
len(df)

114722

In [41]:
df[0][0]

'##e7fdec50f50253f6796d61b5382155f8##null##null##- - [31/Jan/2016:18:59:19 +0100] "GET /ark:/12148/bpt6k70211m HTTP/1.0" 200 24552 "-" "-" 48652'

In [42]:
df[0][1]

'##5d1edaaaa9cf4772ce828127ce3e523a##United States##Chapel Hill##- - [31/Jan/2016:18:59:20 +0100] "GET /ark:/12148/bpt6k622723.planchecontact.r=etteilla.f1.langEN HTTP/1.1" 200 19186 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36" 1201281'

# Structure :
'##' then IP address

'##' then country (or null)

'##' then city (or null)

'##--' then date

nothing, then HTTP request in quotes, with the protocol number
followed by number of response (200 = OK), then size, then referrer website (or '-' if unknown)

In [43]:
# empty df to store concatenation
dfs = []

for f in some_files:
    try:
        # read csv file into df
        df = pd.read_csv(f, encoding='UTF-8', sep ='\t', engine='python', header=None)
        # append to list of dfs
        dfs.append(df)
    except Exception as e:
        print(f"Error loading file {f}: {e}")

#from the list, concatenate the dataframes into a single one
combined_df = pd.concat(dfs, ignore_index=True)

In [44]:
combined_df.shape

(98167550, 1)

In [45]:
#split the logs and put them in columns
lines_df = combined_df[0].str.split('##', expand=True)

In [46]:
lines_df.head()

Unnamed: 0,0,1,2,3,4
0,,6fb861a2dcda718f3e56fb984234fc32,Algeria,,"- - [04/Feb/2016:00:59:58 +0100] ""GET /service..."
1,,5a3ac648814b7afbc68c531b8402378b,Russia,,"- - [04/Feb/2016:00:59:59 +0100] ""GET /ark:/12..."
2,,e74fe2c139b2e2c27cab51b178497085,France,Fargues-Saint-Hilaire,"- - [04/Feb/2016:00:59:58 +0100] ""GET /proxy?..."
3,,e74fe2c139b2e2c27cab51b178497085,France,Fargues-Saint-Hilaire,"- - [04/Feb/2016:00:59:58 +0100] ""GET /proxy?..."
4,,5a3ac648814b7afbc68c531b8402378b,Russia,,"- - [04/Feb/2016:01:00:01 +0100] ""GET /ark:/12..."


In [49]:
# extracting date
temp = pd.DataFrame()
temp['Date'] = lines_df.apply(lambda x: x[4].split("]")[0].split("[")[1], axis=1)

In [51]:
# referrer
temp['Referrer'] = lines_df.apply(lambda x: x[4].split("\"")[3], axis=1)

In [52]:
temp

Unnamed: 0,Date,Referrer
0,04/Feb/2016:00:59:58 +0100,http://gallica.bnf.fr/ark:/12148/bpt6k83289g.r...
1,04/Feb/2016:00:59:59 +0100,-
2,04/Feb/2016:00:59:58 +0100,-
3,04/Feb/2016:00:59:58 +0100,-
4,04/Feb/2016:01:00:01 +0100,-
...,...,...
98167545,13/Feb/2016:05:34:53 +0100,-
98167546,13/Feb/2016:05:34:54 +0100,http://gallica.bnf.fr/services/engine/search/s...
98167547,13/Feb/2016:05:34:55 +0100,http://gallica.bnf.fr/services/engine/search/s...
98167548,13/Feb/2016:05:34:55 +0100,-


In [53]:
nb_unknown_referrer = temp['Referrer'].value_counts()['-']
percentage_unknown_referrer = nb_unknown_referrer/len(temp) * 100
print(percentage_unknown_referrer)

14.668873777536467


In [54]:
#extract request
temp['Request'] = lines_df.apply(lambda x: ' '.join(x[4].split("\"")[1].split(' ')[:2]), axis=1)

In [55]:
#extract ark
#fct to check if request contains ark
def extract_ark(request):
    # capture everything between 12148 and / or between 12148 and . using regex -  the assigning authority number for Gallica
    ark = '-'
    ark = re.findall('(?<=12148/).+?(?=/)|(?<=12148/).+?(?=\.)', request)
    return ark


temp['Ark'] = temp.apply(lambda x: extract_ark(x['Request']), axis=1)
    


In [56]:
lines_df = lines_df.rename(columns = {1:"IPaddress", 2:"Country", 3:"City", 4:"Full_request"})

In [57]:
final_df = pd.concat([lines_df, temp], axis=1)

In [58]:
final_df

Unnamed: 0,0,IPaddress,Country,City,Full_request,Date,Referrer,Request,Ark
0,,6fb861a2dcda718f3e56fb984234fc32,Algeria,,"- - [04/Feb/2016:00:59:58 +0100] ""GET /service...",04/Feb/2016:00:59:58 +0100,http://gallica.bnf.fr/ark:/12148/bpt6k83289g.r...,GET /services/engine/search/sru?operation=sear...,[]
1,,5a3ac648814b7afbc68c531b8402378b,Russia,,"- - [04/Feb/2016:00:59:59 +0100] ""GET /ark:/12...",04/Feb/2016:00:59:59 +0100,-,GET /ark:/12148/bpt6k116584w.thumbnail,[bpt6k116584w]
2,,e74fe2c139b2e2c27cab51b178497085,France,Fargues-Saint-Hilaire,"- - [04/Feb/2016:00:59:58 +0100] ""GET /proxy?...",04/Feb/2016:00:59:58 +0100,-,GET /proxy?method=R&ark=bpt6k1121835.f148&l=0&...,[]
3,,e74fe2c139b2e2c27cab51b178497085,France,Fargues-Saint-Hilaire,"- - [04/Feb/2016:00:59:58 +0100] ""GET /proxy?...",04/Feb/2016:00:59:58 +0100,-,GET /proxy?method=R&ark=bpt6k1121835.f147&l=-1...,[]
4,,5a3ac648814b7afbc68c531b8402378b,Russia,,"- - [04/Feb/2016:01:00:01 +0100] ""GET /ark:/12...",04/Feb/2016:01:00:01 +0100,-,GET /ark:/12148/bpt6k77157b.thumbnail,[bpt6k77157b]
...,...,...,...,...,...,...,...,...,...
98167545,,e74feca46ad927c9132f15deaa66402,United States,Chicago,"- - [13/Feb/2016:05:34:53 +0100] ""GET //iiif/a...",13/Feb/2016:05:34:53 +0100,-,GET //iiif/ark:/12148/bpt6k215245w/f1/full/174...,[bpt6k215245w]
98167546,,e26bd5ebbd2011fc113f40a1756fd599,United States,Boston,"- - [13/Feb/2016:05:34:54 +0100] ""GET /service...",13/Feb/2016:05:34:54 +0100,http://gallica.bnf.fr/services/engine/search/s...,GET /services/ajax/extract/ark:/12148/bpt6k651...,[bpt6k6510332p]
98167547,,e26bd5ebbd2011fc113f40a1756fd599,United States,Boston,"- - [13/Feb/2016:05:34:55 +0100] ""GET /service...",13/Feb/2016:05:34:55 +0100,http://gallica.bnf.fr/services/engine/search/s...,GET /services/ajax/extract/ark:/12148/bpt6k567...,[bpt6k5677637n]
98167548,,e74feca46ad927c9132f15deaa66402,United States,Chicago,"- - [13/Feb/2016:05:34:55 +0100] ""GET //iiif/a...",13/Feb/2016:05:34:55 +0100,-,GET //iiif/ark:/12148/bpt6k56745826/f373/full/...,[bpt6k56745826]


In [59]:
nb_unknown_countries = final_df['Country'].value_counts()['null']
percentage_unknown_countries = nb_unknown_countries/len(final_df) * 100
print(percentage_unknown_countries)

1.0163877982082674


In [60]:
nb_unknown_cities = final_df['City'].value_counts()['null']
percentage_unknown_cities = nb_unknown_cities/len(final_df) * 100
print(percentage_unknown_cities)

25.897728933848303


In [61]:
# create df with ip and the difference in time between each connexion and the last
sessions_df = final_df.groupby('IPaddress').agg({'Ark':list, 'Date':list})