In [14]:
import os
import difflib
import shutil
from pathlib import Path

import pandas as pd
import numpy as np

from src.dataset.util import read_jsonl, write_jsonl


RAW_PATH = '../data/raw/'
INTERIM_PATH = '../data/interim/'
PROCESSED_PATH = '../data/processed/'

In [209]:
m_rr = pd.read_csv(os.path.join(RAW_PATH, 'ResearchRabbit_Export_1701250574.csv'))
m_my = pd.read_csv(os.path.join(INTERIM_PATH, 'meta_publications_merged.csv'))

### standardize column names
# convert to year
m_my['year'] = [funk.year for funk in pd.to_datetime(m_my['reconstructed_date'], format='mixed')]
# rename columns for merging
m_rr = m_rr.rename(columns={'DOI': 'doi', 'Title': 'title', 'Year': 'year'})

### split parsing and ocr
m_parsing = m_my[m_my['source'] == 'parsing']
m_ocr = m_my[m_my['source'] == 'ocr']

### copying ocr files for sofie

In [121]:
# reltive_root = Path('..')
# destination_root = reltive_root.joinpath('data/interim/ocr_files_orig_structure/')

# for i, row in m_ocr.iterrows():
#     # find implicated file
#     current_path = reltive_root.joinpath(row['path'])
#     filename = current_path.name
#     # copy directory structure from the UTA Publications
#     path_structure = current_path.parent.parts[3:]
#     path_structure = Path(*path_structure)
#     # create destination directory
#     destination_dir = destination_root.joinpath(path_structure)
#     destination_dir.mkdir(parents=True, exist_ok=True)
#     # copy file
#     destination_file_path = destination_dir.joinpath(filename)
#     shutil.copy(current_path, destination_file_path)

In [123]:
# reltive_root = Path('..')
# destination_root = reltive_root.joinpath('data/interim/ocr_files_raw/')
# destination_root.mkdir(parents=True, exist_ok=True)

# for i, row in m_ocr.iterrows():
#     # find implicated file
#     current_path = reltive_root.joinpath(row['path'])
#     filename = current_path.name
#     # copy file
#     destination_file_path = destination_root.joinpath(filename)
#     shutil.copy(current_path, destination_file_path)

### playground

In [28]:
m_my.loc[20, 'path']

'data/raw/UTA publications/UF papers 2011-2013 copy/25 years for APS 2013.pdf'

In [34]:
difflib.get_close_matches(
    "mapping mentalising in the brain", 
    # m_rr["Title"].str.lower().to_list()
    m_rr["Title"].to_list()
    )

['Mapping Mentalising in the Brain']

In [35]:
m_rr.query('Title == "Mapping Mentalising in the Brain"')

Unnamed: 0,DOI,PMID,arXiv ID,Title,Abstract,Authors,Journal,Year
47,10.1007/978-3-030-51890-5_2,,,Mapping Mentalising in the Brain,This is a short history of mentalising researc...,"Chris D. Frith,Uta Frith",,2021.0


In [37]:
# lowercase relevant columns
# m_rr['title_l'] = m_rr['Title'].str.lower() 
# m_my['title_l'] = m_my['title'].str.lower()
# m_my['authors_l'] = m_my['authors'].str.lower()

In [51]:
### merge on doi first
# convert to year
m_my['year'] = [funk.year for funk in pd.to_datetime(m_my['reconstructed_date'], format='mixed')]
# rename columns for merging
m_rr = m_rr.rename(columns={'DOI': 'doi', 'Title': 'title', 'Year': 'year'})
m = pd.merge(m_my, m_rr, how='inner', on='doi')

In [66]:
m_rr_full_doi = m_rr.dropna(subset=['doi'])

m = pd.merge(m_my, m_rr_full_doi, how='left', on='doi')

### merging title lookup (first round)

In [210]:
# lowercase relevant columns
m_rr['title_in_rr'] = m_rr['title'].str.lower()
m_parsing['title_in_prs'] = m_parsing['title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m_parsing['title_in_prs'] = m_parsing['title'].str.lower()


In [220]:
m_parsing_no_doi = m_parsing[m_parsing['doi'].isna()]
m_parsing_no_doi_yes_tit = m_parsing_no_doi[~m_parsing_no_doi['title_in_prs'].isna()]


found_pairs = []
for i, row in m_parsing_no_doi_yes_tit.iterrows():

    close_match = difflib.get_close_matches(
        row['title_in_prs'], 
        m_rr["title_in_rr"].to_list(),
        cutoff=0.8
        )
    
    if close_match == []:
        n_matches = 0
        selected_title = None
        close_match = None
    else: 
        n_matches = len(close_match)
        selected_title = close_match[0]
    
    found_pairs.append({
        'title_in_prs': row['title_in_prs'],
        'title_in_rr': selected_title,
        'matches': close_match,
        'id': row['id'],
        'n_matches': n_matches,
    })

found_pairs = pd.DataFrame(found_pairs)
useful_found_pairs = found_pairs.query('n_matches > 0')

In [242]:
m_parsing_aug1 = pd.merge(
    m_parsing_no_doi_yes_tit.drop(columns=['doi']),
    useful_found_pairs.drop(columns=['title_in_prs']),
    on='id', how='left'
    )

m_parsing_aug1 = pd.merge(
    m_parsing_aug1,
    m_rr,
    on='title_in_rr', how='left',
    suffixes=('_prs', '_rr')
    )

m_parsing_aug1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  66 non-null     object 
 1   title_prs           66 non-null     object 
 2   date                6 non-null      object 
 3   path                66 non-null     object 
 4   source              66 non-null     object 
 5   reconstructed_date  66 non-null     object 
 6   year_prs            66 non-null     int64  
 7   title_in_prs        66 non-null     object 
 8   title_in_rr         48 non-null     object 
 9   matches             48 non-null     object 
 10  n_matches           48 non-null     float64
 11  doi                 41 non-null     object 
 12  PMID                28 non-null     float64
 13  arXiv ID            0 non-null      float64
 14  title_rr            48 non-null     object 
 15  Abstract            40 non-null     object 
 16  Authors   

### merging based on doi

In [243]:
m_parsing_yes_doi = m_parsing[~m_parsing['doi'].isna()]

m_parsing_aug2 = pd.merge(m_parsing_yes_doi, m_rr, on='doi', how='left', suffixes=('_prs', '_rr'))
m_parsing_aug2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  75 non-null     object 
 1   title_prs           71 non-null     object 
 2   date                24 non-null     object 
 3   doi                 75 non-null     object 
 4   path                75 non-null     object 
 5   source              75 non-null     object 
 6   reconstructed_date  75 non-null     object 
 7   year_prs            75 non-null     int64  
 8   title_in_prs        71 non-null     object 
 9   PMID                57 non-null     float64
 10  arXiv ID            0 non-null      float64
 11  title_rr            61 non-null     object 
 12  Abstract            59 non-null     object 
 13  Authors             61 non-null     object 
 14  Journal             59 non-null     object 
 15  year_rr             60 non-null     float64
 16  title_in_r

### merging the datasets

In [276]:
# join the partially augumented dfs
m_parsing_aug_complete = pd.concat([m_parsing_aug1, m_parsing_aug2], ignore_index=True)
# drop garbage columns
m_parsing_aug_complete = m_parsing_aug_complete.drop(columns=['matches', 'n_matches', 'arXiv ID'])
m_parsing_aug_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  141 non-null    object 
 1   title_prs           137 non-null    object 
 2   date                30 non-null     object 
 3   path                141 non-null    object 
 4   source              141 non-null    object 
 5   reconstructed_date  141 non-null    object 
 6   year_prs            141 non-null    int64  
 7   title_in_prs        137 non-null    object 
 8   title_in_rr         109 non-null    object 
 9   doi                 116 non-null    object 
 10  PMID                85 non-null     float64
 11  title_rr            109 non-null    object 
 12  Abstract            99 non-null     object 
 13  Authors             109 non-null    object 
 14  Journal             99 non-null     object 
 15  year_rr             108 non-null    float64
dtypes: float

In [277]:
assert m_parsing_aug_complete['id'].nunique() == len(m_parsing_aug_complete)

In [278]:
non_overlap = set(m_parsing['id'].tolist()) - set(m_parsing_aug_complete['id'].tolist())
assert len(m_parsing) - len(m_parsing_aug_complete) == len(non_overlap)

m_parsing_missing = m_parsing[m_parsing['id'].isin(non_overlap)]
m_parsing_missing = (m_parsing_missing
                     .drop(columns=['title', 'doi', 'title_in_prs'])
                     .rename(columns={'year': 'year_prs'})
                     )

m_parsing_missing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13 entries, 6 to 125
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  13 non-null     object
 1   date                1 non-null      object
 2   path                13 non-null     object
 3   source              13 non-null     object
 4   reconstructed_date  13 non-null     object
 5   year_prs            13 non-null     int64 
dtypes: int64(1), object(5)
memory usage: 728.0+ bytes


In [280]:
m_parsing_ = pd.concat([m_parsing_aug_complete, m_parsing_missing], ignore_index=True)
m_parsing_ = m_parsing_.reset_index(drop=True)
m_parsing_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  154 non-null    object 
 1   title_prs           137 non-null    object 
 2   date                31 non-null     object 
 3   path                154 non-null    object 
 4   source              154 non-null    object 
 5   reconstructed_date  154 non-null    object 
 6   year_prs            154 non-null    int64  
 7   title_in_prs        137 non-null    object 
 8   title_in_rr         109 non-null    object 
 9   doi                 116 non-null    object 
 10  PMID                85 non-null     float64
 11  title_rr            109 non-null    object 
 12  Abstract            99 non-null     object 
 13  Authors             109 non-null    object 
 14  Journal             99 non-null     object 
 15  year_rr             108 non-null    float64
dtypes: float

In [281]:
m_parsing_['year'] = None
m_parsing_['title'] = None

for i, row in m_parsing_.iterrows():
    # year
    if pd.isna(row['year_rr']):
        m_parsing_.loc[i, 'year'] = row['year_prs']
    else:
        m_parsing_.loc[i, 'year'] = row['year_rr']

    # title
    if pd.isna(row['title_rr']):
        m_parsing_.loc[i, 'title'] = row['title_prs']
    else:
        m_parsing_.loc[i, 'title'] = row['title_rr']

m_parsing_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  154 non-null    object 
 1   title_prs           137 non-null    object 
 2   date                31 non-null     object 
 3   path                154 non-null    object 
 4   source              154 non-null    object 
 5   reconstructed_date  154 non-null    object 
 6   year_prs            154 non-null    int64  
 7   title_in_prs        137 non-null    object 
 8   title_in_rr         109 non-null    object 
 9   doi                 116 non-null    object 
 10  PMID                85 non-null     float64
 11  title_rr            109 non-null    object 
 12  Abstract            99 non-null     object 
 13  Authors             109 non-null    object 
 14  Journal             99 non-null     object 
 15  year_rr             108 non-null    float64
 16  year    

In [285]:
(m_parsing_
    .drop(columns=['title_in_rr', 'title_in_prs', 'year_prs', 'year_rr', 'title_prs', 'title_rr'])
    .drop(columns=['date', 'reconstructed_date'])
).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        154 non-null    object 
 1   path      154 non-null    object 
 2   source    154 non-null    object 
 3   doi       116 non-null    object 
 4   PMID      85 non-null     float64
 5   Abstract  99 non-null     object 
 6   Authors   109 non-null    object 
 7   Journal   99 non-null     object 
 8   year      154 non-null    object 
 9   title     139 non-null    object 
dtypes: float64(1), object(9)
memory usage: 12.2+ KB


In [None]:
# save m_parsing_