Bot_6

# Chargement des données

In [None]:
import pandas as pd

from IPython.display import display
from urllib.parse import quote

# Per-document page offset, keyed by the `directory` identifier that entry2url
# appends to the Gallica ark URL; entry2url subtracts the value from a row's
# `page` to obtain the view number used in the URL.
doc2start = {"bpt6k63243601": 123, "bpt6k62931221": 151, "bpt6k6286466w": 189, "bpt6k6393838j": 219, "bpt6k6331310g": 216, "bpt6k6292987t": 353, "bpt6k62906378": 288, "bpt6k6391515w": 319, "bpt6k6315927h": 349, "bpt6k6319106t": 324, "bpt6k6315985z": 82, "bpt6k63959929": 82, "bpt6k63197984": 56, "bpt6k6389871r": 77, "bpt6k6319811j": 79, "bpt6k6282019m": 72, "bpt6k6314752k": 190, "bpt6k6305463c": 113, "bpt6k6318531z": 108, "bpt6k6324389h": 72, "bpt6k63243920": 80, "bpt6k6309075f": 96, "bpt6k6333200c": 132, "bpt6k63243905": 134, "bpt6k6333170p": 137, "bpt6k96727875": 135, "bpt6k9764746t": 99, "bpt6k97645375": 123, "bpt6k9672117f": 125, "bpt6k9763554c": 123, "bpt6k9763553z": 105, "bpt6k9677392n": 110, "bpt6k9692809v": 113, "bpt6k9762929c": 129, "bpt6k9672776c": 119, "bpt6k9764647w": 121, "bpt6k9669143t": 145, "bpt6k9677737t": 139, "bpt6k9668037f": 167, "bpt6k96839542": 171, "bpt6k96762564": 185, "bpt6k9685861g": 189, "bpt6k9763471j": 153, "bpt6k9762899p": 157, "bpt6k97630871": 11, "bpt6k9684454n": 235, "bpt6k9732740w": 239, "bpt6k9684013b": 189, "bpt6k9692626p": 305, "bpt6k9685098r": 281, "bpt6k9764402m": 329, "bpt6k97631451": 322, "bpt6k9776121t": 49, "bpt6k9775724t": 33, "bpt6k97774838": 327, "bpt6k9780089g": 339}

def entry2url(row, start_pages=None):
    """
    Build the Gallica URL that points at the page of an Annuaire csv row.

    Parameters
    ----------
    row : pandas.Series or dict
        Must provide 'directory' and 'page'. The optional 'name', 'job',
        'street' and 'number' entries are added to the URL as search terms
        when they are present, not missing (NaN/None) and not empty.
    start_pages : mapping, optional
        Maps a directory identifier to the offset subtracted from
        ``row['page']``. Defaults to the module-level ``doc2start``.

    Returns
    -------
    str
        ``https://gallica.bnf.fr/ark:/12148/<directory>/f<page - offset>``,
        followed by ``.item.r=<terms>.zoom`` when at least one search term
        is available.

    Raises
    ------
    KeyError
        If the row's directory has no entry in ``start_pages``.
    """
    if start_pages is None:
        start_pages = doc2start

    directory = row['directory']
    view = row['page'] - start_pages[directory]
    url = f"https://gallica.bnf.fr/ark:/12148/{directory}/f{view}"

    search_terms = []
    for field in ('name', 'job', 'street', 'number'):
        if field not in row:
            continue
        value = row[field]
        if pd.isna(value):
            continue
        # Values are not guaranteed to be strings (a numeric 'number' column
        # would make .replace fail); integral floats are shown without '.0'.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        text = str(value)
        if not text:
            # An empty term would only add a stray '%20' separator.
            continue
        # Dots are replaced by spaces before percent-encoding the term.
        search_terms.append(quote(text.replace('.', ' ')))

    if search_terms:
        joined_terms = '%20'.join(search_terms)
        url += f".item.r={joined_terms}.zoom"

    return url

def add_clickable_url(bottin_dataframe):
    """
    Return a Styler over a copy of the frame with an extra 'url' column.

    The 'url' column holds the result of entry2url for every row and is
    rendered as an HTML link labelled "gallica url". The frame passed in
    is left untouched.
    """
    def make_clickable(val):
        # Wrap the raw URL in an anchor tag for display.
        return '<a href="{}">gallica url</a>'.format(val)

    urls = bottin_dataframe.apply(entry2url, axis=1)
    with_urls = bottin_dataframe.assign(url=urls)
    return with_urls.style.format(make_clickable, subset=['url'])

In [None]:
# Load the Bottin entries from the CSV file (path relative to the working directory).
data_bottin = pd.read_csv('bottin_data_groupe_6.csv')

# Aperçu des données

In [None]:
# Basic information about the dataset

entries_number = len(data_bottin)
entries_per_year = data_bottin.groupby('year').size()
# nunique() ignores missing values; unique().size counted NaN as one more distinct value.
unique_names_number = data_bottin['name'].nunique()
unique_jobs_number = data_bottin['job'].nunique()
unique_streets_number = data_bottin['street'].nunique()

print(f"Il y a {entries_number} entrées, dont {unique_names_number} noms uniques, {unique_jobs_number} métiers uniques, {unique_streets_number} rues uniques")
print("\nLa distribution d'entrées par année est la suivante:")
# items() yields (year, count) pairs without going through a 2-D array,
# so the counts keep their integer type.
print("\n".join(f"\t{year}: {count}" for year, count in entries_per_year.items()))

In [None]:
# Bar chart of the number of entries per year

year_sizes = data_bottin.groupby('year').size()
year_sizes_ax = year_sizes.plot(kind='bar', title='Entries per year', figsize=(8,5))
year_sizes_ax.set_ylabel('Number of entries');

In [None]:
# Distribution of how often each name is repeated (log-log histogram)

name_occurrences = data_bottin['name'].value_counts()
name_occurrences_ax = name_occurrences.plot(
    kind='hist',
    loglog=True,
    bins=1000,
    title='Distribution of duplicate names',
    figsize=(8,5),
)
name_occurrences_ax.set_xlabel('Number of duplicates');

# Début du nettoyage

In [None]:
# Keep only the entries whose name is a single word, optionally followed by text in parentheses

# Raw string: '\s', '\w' and '\(' are regex escapes, not Python string escapes.
regex_one_word = r'^\s*\w+(?:\s?\(.*\)\s*)?\s*$'

# na=False makes a missing name count as "not a single word", so the mask is
# purely boolean and can be used with .loc and negated with ~.
predicate_one_word = data_bottin['name'].str.match(regex_one_word, na=False)

data_bottin_one_word = data_bottin.loc[predicate_one_word].copy()

In [None]:
# Clean the street number: keep only the leading number and an optional 'bis'.
# Raw string so '\d' is a regex escape; expand=False returns a Series for the
# single capture group, which is what a column assignment expects.
data_bottin['number_clean'] = data_bottin['number'].str.extract(r'(^\d+(?: ?bis)?).*', expand=False)

# Exploration des données

In [None]:
# Count every word that appears between parentheses in name
from collections import Counter

# Raw strings: '\(' and '\W' are regex escapes, not Python string escapes.
regex_parens = r'^.*?\((.*)\).*?$'

# Text captured between the first '(' and the last ')'; names without parentheses are dropped.
name_parens = data_bottin['name'].str.extract(regex_parens).dropna()[0]

# Split on non-word characters and discard the empty pieces left between separators.
name_parens_split = name_parens.str.split(r'\W')
name_parens_split = name_parens_split.apply(lambda words: [word for word in words if len(word) > 0])

word_counts = Counter(word for words in name_parens_split.values for word in words)

# Show the 5 most common words found between parentheses in name
word_counts.most_common(5)

In [None]:
# Count the words of the names that do not match predicate_one_word
# (i.e. names that are not "a single word + optional text in parentheses")
from collections import Counter

# Raw string: '\W' is a regex escape, not a Python string escape.
name_one_word_split = data_bottin.loc[~predicate_one_word]['name'].str.split(r'\W')
# Discard the empty pieces left between consecutive separators.
name_one_word_split = name_one_word_split.apply(lambda words: [word for word in words if len(word) > 0])

word_counts = Counter(word for words in name_one_word_split.values for word in words)

# Show the 5 most common words found in names made of more than one word
word_counts.most_common(5)

In [None]:
# Group identical (name, job, street, number) entries and tabulate how many
# groups exist for each group size
exact_group_sizes = data_bottin.groupby(['name', 'job', 'street', 'number']).size()
exact_group_sizes = exact_group_sizes.sort_values(ascending=False)
exact_duplicates_table = exact_group_sizes.value_counts().to_frame('Count')
exact_duplicates_table.rename_axis('Number of duplicates')

In [None]:
# Better results when grouping on number_clean
# NOTE(review): 'street_only' is not created in the visible cells; presumably
# it is a column of the CSV - confirm.
clean_group_sizes = data_bottin.groupby(['name', 'street_only', 'number_clean']).size()
clean_group_sizes = clean_group_sizes.sort_values(ascending=False)
clean_duplicates_table = clean_group_sizes.value_counts().to_frame('Count')
clean_duplicates_table.rename_axis('Number of duplicates')

Tests pour repérer les erreurs d'OCR :

In [None]:
# Size of every (name, job, street_only, number_clean) group, largest first
ocr_group_counts = data_bottin.groupby(['name', 'job', 'street_only', 'number_clean']).size()
ocr_group_counts = ocr_group_counts.to_frame('count')
ocr_group_counts.sort_values(by="count", ascending=False)

In [None]:
# Rows for both spellings of the same name ('Scott' / 'Seott') at number 7
has_pitt_name = data_bottin['name'].isin(['Pitt et Scott', 'Pitt et Seott'])
is_number_seven = data_bottin['number_clean'] == '7'
data_bottin[has_pitt_name & is_number_seven]

In [None]:
# Size of every (name, job) group among the names that are not a single word, largest first
multi_word_entries = data_bottin.loc[~predicate_one_word]
multi_word_counts = multi_word_entries.groupby(['name', 'job']).size().to_frame('count')
multi_word_counts.sort_values(by="count", ascending=False)

In [None]:
# One row per distinct job value, renumbered from 0; the whole frame is displayed
x = data_bottin[['job']].drop_duplicates().reset_index(drop=True)
x.head(len(x))

In [None]:
# Rows without any missing value (dropna drops a row as soon as one column is
# missing), restricted to the jobs that contain "success"
y = data_bottin.dropna()
job_mentions_success = y['job'].str.contains("success")
sample = y[job_mentions_success]

In [None]:
# Display the rows selected into `sample`
sample

In [None]:
# Show `sample` with a clickable Gallica link per row; missing values are
# replaced by empty strings before the URLs are built
add_clickable_url(sample.fillna(''))

Questions ouvertes :
- Que faire avec les entreprises ?
- Que faire des « NC » et des abréviations semblables ?
- Certaines entrées sont mal formées.

dans job:
successeur): 
success.). 

[year] / Paris. [name], [job], exerce son activité au [number_clean] [street_clean].

In [None]:
# Generate text for wikipast pages: start from an empty frame that the
# following cells fill with the 'criteria' and 'page_text' columns
df = pd.DataFrame()

In [None]:
# 'criteria' is the space-separated concatenation of name, job, number_clean
# and street_clean; astype(str) turns missing values into the text 'nan'
criteria_parts = [
    data_bottin[column].astype(str)
    for column in ('name', 'job', 'number_clean', 'street_clean')
]
criteria_text = criteria_parts[0]
for criteria_part in criteria_parts[1:]:
    criteria_text = criteria_text + " " + criteria_part
df['criteria'] = criteria_text

In [None]:
# Page text following the template
#   [year] / Paris. [name], [job], exerce son activité au [number_clean] [street_clean].
# The street number comes before the street name (same order as 'criteria')
# and the sentence ends with a period.
# NOTE(review): astype(str) renders missing values as the text 'nan'; decide
# how such entries should be written before publishing the pages.
df['page_text'] = (
    data_bottin['year'].astype(str)
    + " / Paris. "
    + data_bottin['name'].astype(str)
    + ", " + data_bottin['job'].astype(str)
    + ", exerce son activité au "
    + data_bottin['number_clean'].astype(str)
    + " " + data_bottin['street_clean'].astype(str)
    + "."
)

In [None]:
# Remove the column width limit so the generated text is shown in full
# (this changes the pandas display option for the rest of the session)
pd.set_option('display.max_colwidth', None)
# Show the first generated row
df.head(1)

In [None]:
# Scratch check: drop everything before the first word character of a string
import re
chaine = "*) el Cle"
# Raw string so '\w' is a regex escape; an open-ended slice replaces len(chaine).
chaine[re.search(r"\w", chaine).start():]


In [None]:
# Bot credentials: never store the password in the notebook.
# The password that used to be hard-coded here must be considered leaked and
# should be rotated.
import os
from getpass import getpass

# The account name can be overridden with WIKIPAST_BOT_USER.
user = os.environ.get('WIKIPAST_BOT_USER', 'Clement.lhoste@BottinBot6')
# Read the password from WIKIPAST_BOT_PASSWORD, or prompt for it when unset.
password = os.environ.get('WIKIPAST_BOT_PASSWORD') or getpass(f"Password for {user}: ")

In [None]:
from pywikiapi import Site

In [None]:
# NOTE(review): the endpoint is plain http, so the login below is presumably
# sent unencrypted - confirm whether an https endpoint is available.
site = Site('http://wikipast.epfl.ch/wikipast/api.php') # Address of the API
site.no_ssl = True # Disable https, because it is not enabled on wikipast
site.login(user, password) # Log the bot in