In [31]:
import pandas as pd
from pathlib import Path
import spacy
from spacy.matcher import Matcher
import yaml
from spacy.tokens import Doc, DocBin, Span
from spacy.util import filter_spans
from sutime import SUTime
import string
import re
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

from journals.config import OUTPUT_DATA_DIR, INPUT_DATA_DIR, PROCESSING_DATA_DIR
from journals.parser import Params773Parser


In [53]:
# Load the child records workbook from the project's input-data directory.
# NOTE(review): column dtypes are inferred by read_excel — confirm that is
# acceptable for ID-like columns such as 'MMS Id' and 'Barcode'.
child_df = pd.read_excel(INPUT_DATA_DIR / 'children_with_links_250224.xlsx')

In [44]:
# Load the parent records workbook; its `Barcode` column is the lookup set
# used when the child records are filtered further down.
# NOTE(review): those barcodes are compared against zero-padded values such as
# 000427177 — confirm `Barcode` is not being read as an integer column.
parent_df = pd.read_excel(INPUT_DATA_DIR / 'parents_250224.xlsx')

In [45]:
# Load the prediction results from the output-data directory; the frame
# displayed later in the notebook has `mms_id` and `barcode` columns.
predicted = pd.read_csv(OUTPUT_DATA_DIR / 'results.csv')

In [54]:
# Run the child records through the project's Params773Parser.
# Presumably this is what adds the param_* columns (param_title, ...,
# param_parent_barcode) seen when child_df is displayed — confirm in
# journals.parser.
param_parser = Params773Parser()

# NOTE(review): child_df is rebound to the parser output, so re-running only
# this cell passes already-parsed data back into parse(); re-run the load cell
# first if this cell needs to be executed again.
child_df = param_parser.parse(child_df)

In [47]:
# Inspect the child records; for a long frame pandas renders only the first
# and last rows with an ellipsis row in between.
child_df

Unnamed: 0,MMS Id,Permanent Call Number,Barcode,Location Name,Author,Series Statement,773 - Local Param 06,248 - Local Param 09,Publication Date,Uniform Title,...,Bibliographic Level,Language Code,Local Note,Local Note.1,General Note,param_title,param_subtitle,param_metadata,param_wid,param_parent_barcode
0,99322902081,SERIALS S 1865,205718,Zoology Serials,"Oikos Conference on Winter Ecology Oulu, Finla...",Aquilo. Ser Botanica ; Ser zoologica ; tom 23;...,‡t Aquilo.; ‡g Tom.21-28 (1982-1991); ‡w 99146...,,1985-1989.,,...,a,eng,,,000427177 UK-LoNHM,Aquilo.,,Tom.21-28 (1982-1991),9914663702081,000427177
1,99467802081,SERIALS S 1820 C,110582,Zoology Serials,,Issledovaniia fauny moreĭ ; 31 (39) ISSN ;,‡t Issledovaniia fauny moreĭ /; ‡g Vol.31 (198...,,1985.,,...,m,rus,,,"In Russian; summaries, table of contents also ...",Issledovaniia fauny moreĭ /,,Vol.31 (1985),9939022102081,000423859
2,99495402081,SERIALS S 140,206615,Zoology Serials,,"Parasitology ; vol.104, suppl.; Symposia of th...","‡t Parasitology; ‡g Vol.104, Suppl.29 Pt.1-3 (...",,1992.,,...,a,eng,,,000421650 UK-LoNHM,Parasitology,,"Vol.104, Suppl.29 Pt.1-3 (1992)",9911338402081,000421650
3,99527302081,REPTILES S 138,206789,Zoology Serials,"Ferner, John W.",Society for the Study of Amphibians and Reptil...,‡w 9914955602081; ‡g no:000342747,,,,...,m,eng,,,,,,,9914955602081,000342747
4,99539702081,SERIALS S 820,206855,Zoology Serials,,,‡t Feuille des jeunes Naturalistes.; ‡g Ann.11...,,1881.,,...,m,fre,,,000414984 UK-LoNHM,Feuille des jeunes Naturalistes.,,Ann.11-12 No.121-144 (1880-1882),9912659302081,000414984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3542,999805096000802081,SERIALS S 1982 D,,Zoology Serials,"Sim, Chung Ja author.","Invertebrate fauna of Korea ; Vol.2, no.4",‡t Invertebrate fauna of Korea; ‡w 99390572020...,,2021.,,...,m,eng,,,000367586 UK-LoNHM,Invertebrate fauna of Korea,,,9939057202081,000367586
3543,999805105101002081,SERIALS S 1720,,Zoology Serials,"Grobben, Carl, |d 1854-1945",Universität Wien. Arbeiten aus dem Zoologische...,‡t Arbeiten aus dem Zoologischen Institute der...,,1899.,,...,a,eng,,,000412957 UK-LoNHM,Arbeiten aus dem Zoologischen Institute der Un...,,Tom.10 (1892-1893),9932209802081,000412957
3544,999805105301002081,MOLLUSCA S 1309 A,,Zoology Serials,"Kittel, Klaus, author.","Acta Conchyliorum, heft 21 0721-1635 ;",‡t Acta conchyliorum; ‡w 9911663002081; ‡g no:...,,2022.,,...,m,eng,,,000401052 UK-LoNHM,Acta conchyliorum,,,9911663002081,000401052
3545,999805116900602081,SERIALS S 1319 A,,Zoology Serials,,Blätter aus dem Naumann-Museum ; 10,‡t Blätter aus dem Naumann-Museum; ‡g Year 198...,,1986.,,...,a,ger,,,000134014 UK-LoNHM,Blätter aus dem Naumann-Museum,,Year 1986 Number 10 (1986),9910723302081,000134014


In [65]:
# Build the ground-truth set: child records whose parent link can be verified.

# Ensure child with links has a barcode
labelled = child_df[child_df['param_parent_barcode'].notna()]

# And barcode exists in the parent.
# The mask is computed from `labelled` itself so that it shares the filtered
# frame's index. Building it from the unfiltered `child_df` forces pandas to
# reindex the mask and emits "UserWarning: Boolean Series key will be
# reindexed to match DataFrame index". The selected rows are the same.
labelled = labelled[labelled['param_parent_barcode'].isin(parent_df['Barcode'].unique())]

  labelled = labelled[child_df['param_parent_barcode'].isin(parent_df.Barcode.unique())]


In [66]:
# Reduce the ground truth to the record id and its true parent barcode,
# using the same snake_case key name (`mms_id`) as the predictions table.
# Note: this rebinds `labelled`, so the cell expects the un-renamed frame
# produced by the filtering cell as its input.
labelled = labelled[['MMS Id', 'param_parent_barcode']].rename(
    columns={
        'MMS Id': "mms_id",
        'param_parent_barcode': "true_parent_barcode",
    }
)

In [67]:
# Inspect the two-column ground-truth table (mms_id, true_parent_barcode).
labelled

Unnamed: 0,mms_id,true_parent_barcode
0,99322902081,000427177
1,99467802081,000423859
2,99495402081,000421650
3,99527302081,000342747
4,99539702081,000414984
...,...,...
3542,999805096000802081,000367586
3543,999805105101002081,000412957
3544,999805105301002081,000401052
3545,999805116900602081,000134014


In [51]:
# Inspect the predictions loaded from results.csv (mms_id, barcode).
predicted

Unnamed: 0,mms_id,barcode
0,9910003302081,000432325
1,9910005602081,000411792
2,9910007102081,000432608
3,9910027002081,000423817
4,9910027202081,000423832
...,...,...
2457,999949002081,000423759
2458,999970002081,000431922
2459,999970402081,000431935
2460,999979802081,000439187


In [68]:
# Pair each labelled record with its prediction. The join is an inner join on
# mms_id, so records present in only one of the two tables drop out.
# The suffixes only apply to column names shared by both frames besides the key.
merged_df = labelled.merge(
    right=predicted,
    how="inner",
    on="mms_id",
    suffixes=("_true", "_pred"),
)

In [69]:
# Inspect the joined frame: rows for mms_id values present in both `labelled`
# and `predicted`, with the true and the predicted barcode side by side.
merged_df

Unnamed: 0,mms_id,true_parent_barcode,barcode
0,99322902081,000427177,000427177
1,99467802081,000423859,000423859
2,99495402081,000421650,000421650
3,99541702081,000430708,000430708
4,99559302081,000428849,000428849
...,...,...,...
2441,999805096000702081,000367587,000367587
2442,999805096000802081,000367586,000367586
2443,999805105301002081,000401052,000401052
2444,999805116900602081,000134014,000134014


In [70]:
# Number of joined rows whose predicted barcode equals the labelled parent barcode.
correct_predictions = merged_df["true_parent_barcode"].eq(merged_df["barcode"]).sum()

In [75]:
# Fraction of ALL labelled records that were predicted correctly. The
# denominator is the full ground-truth table, not the joined frame, so a
# labelled record with no prediction lowers the score just like a wrong one.
correct_predictions / labelled.shape[0]

0.6798745724059293

In [42]:
# Print each key of the `mms_id` index level of `features`, one per line.
# NOTE(review): `features` is not defined in any cell visible in this part of
# the notebook, and this cell's execution count (42) is lower than that of the
# cells around it. On a fresh Restart & Run All it would raise NameError
# unless `features` is created in a part of the notebook not shown here —
# confirm, or remove this exploratory cell.
for mms_id, group in features.groupby(level='mms_id'):
    print(mms_id)

9910003302081
9910005602081
9910007102081
9910027002081
9910027202081
9910058702081
991007402081
9910091102081
9910093802081
9910115302081
9910120602081
9910120802081
9910132602081
9910133002081
9910135502081
9910136202081
9910176502081
9910177002081
9910177402081
9910191902081
9910198502081
9910218302081
9910224202081
9910234302081
9910234502081
9910234902081
9910250802081
9910261802081
9910285102081
9910292802081
9910312902081
991032902081
9910332702081
9910332902081
9910333302081
9910333602081
9910333802081
9910333902081
9910334102081
9910334702081
9910334902081
9910344402081
9910352502081
9910356002081
9910374102081
9910389302081
9910394502081
9910404502081
9910405102081
9910410702081
9910411002081
9910415002081
9910421102081
9910427202081
9910427802081
9910428702081
9910432402081
9910443702081
9910452202081
9910458102081
9910486802081
9910492802081
9910510102081
9910516702081
9910520802081
9910527502081
9910536002081
9910546002081
9910569402081
9910577602081
9910578002081
99105782