In [1]:
import os
import difflib
import shutil
from pathlib import Path

import pandas as pd
import numpy as np

from src.dataset.util import read_jsonl, write_jsonl


RAW_PATH = '../data/raw/'
INTERIM_PATH = '../data/interim/'
PROCESSED_PATH = '../data/processed/'

In [2]:
m_rr = pd.read_csv(os.path.join(RAW_PATH, 'ResearchRabbit_Export_1701250574.csv'))
m_my = pd.read_csv(os.path.join(INTERIM_PATH, 'meta_publications_merged.csv'))

### standardize column names
# convert to year
m_my['year'] = [funk.year for funk in pd.to_datetime(m_my['reconstructed_date'], format='mixed')]
# rename columns for merging
m_rr = m_rr.rename(columns={'DOI': 'doi', 'Title': 'title', 'Year': 'year'})

### split parsing and ocr
m_parsing = m_my[m_my['source'] == 'parsing']
m_ocr = m_my[m_my['source'] == 'ocr']

In [5]:
m_parsing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 271 entries, 0 to 270
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  271 non-null    object
 1   title               190 non-null    object
 2   date                32 non-null     object
 3   doi                 75 non-null     object
 4   path                271 non-null    object
 5   source              271 non-null    object
 6   reconstructed_date  271 non-null    object
 7   year                271 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 19.1+ KB


### copying ocr files for sofie

In [121]:
# reltive_root = Path('..')
# destination_root = reltive_root.joinpath('data/interim/ocr_files_orig_structure/')

# for i, row in m_ocr.iterrows():
#     # find implicated file
#     current_path = reltive_root.joinpath(row['path'])
#     filename = current_path.name
#     # copy directory structure from the UTA Publications
#     path_structure = current_path.parent.parts[3:]
#     path_structure = Path(*path_structure)
#     # create destination directory
#     destination_dir = destination_root.joinpath(path_structure)
#     destination_dir.mkdir(parents=True, exist_ok=True)
#     # copy file
#     destination_file_path = destination_dir.joinpath(filename)
#     shutil.copy(current_path, destination_file_path)

In [123]:
# reltive_root = Path('..')
# destination_root = reltive_root.joinpath('data/interim/ocr_files_raw/')
# destination_root.mkdir(parents=True, exist_ok=True)

# for i, row in m_ocr.iterrows():
#     # find implicated file
#     current_path = reltive_root.joinpath(row['path'])
#     filename = current_path.name
#     # copy file
#     destination_file_path = destination_root.joinpath(filename)
#     shutil.copy(current_path, destination_file_path)

### merging title lookup (first round)

In [6]:
# lowercase relevant columns
m_rr['title_in_rr'] = m_rr['title'].str.lower()
m_parsing['title_in_prs'] = m_parsing['title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m_parsing['title_in_prs'] = m_parsing['title'].str.lower()


In [7]:
m_parsing_no_doi = m_parsing[m_parsing['doi'].isna()]
m_parsing_no_doi_yes_tit = m_parsing_no_doi[~m_parsing_no_doi['title_in_prs'].isna()]


found_pairs = []
for i, row in m_parsing_no_doi_yes_tit.iterrows():

    close_match = difflib.get_close_matches(
        row['title_in_prs'], 
        m_rr["title_in_rr"].to_list(),
        cutoff=0.8
        )
    
    if close_match == []:
        n_matches = 0
        selected_title = None
        close_match = None
    else: 
        n_matches = len(close_match)
        selected_title = close_match[0]
    
    found_pairs.append({
        'title_in_prs': row['title_in_prs'],
        'title_in_rr': selected_title,
        'matches': close_match,
        'id': row['id'],
        'n_matches': n_matches,
    })

found_pairs = pd.DataFrame(found_pairs)
useful_found_pairs = found_pairs.query('n_matches > 0')

useful_found_pairs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76 entries, 0 to 118
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title_in_prs  76 non-null     object
 1   title_in_rr   76 non-null     object
 2   matches       76 non-null     object
 3   id            76 non-null     object
 4   n_matches     76 non-null     int64 
dtypes: int64(1), object(4)
memory usage: 3.6+ KB


In [8]:
m_parsing_aug1 = pd.merge(
    m_parsing_no_doi_yes_tit.drop(columns=['doi']),
    useful_found_pairs.drop(columns=['title_in_prs']),
    on='id', how='left'
    )

m_parsing_aug1 = pd.merge(
    m_parsing_aug1,
    m_rr,
    on='title_in_rr', how='left',
    suffixes=('_prs', '_rr')
    )

m_parsing_aug1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  119 non-null    object 
 1   title_prs           119 non-null    object 
 2   date                7 non-null      object 
 3   path                119 non-null    object 
 4   source              119 non-null    object 
 5   reconstructed_date  119 non-null    object 
 6   year_prs            119 non-null    int64  
 7   title_in_prs        119 non-null    object 
 8   title_in_rr         76 non-null     object 
 9   matches             76 non-null     object 
 10  n_matches           76 non-null     float64
 11  doi                 65 non-null     object 
 12  PMID                42 non-null     float64
 13  arXiv ID            0 non-null      float64
 14  title_rr            76 non-null     object 
 15  Abstract            61 non-null     object 
 16  Authors 

### merging based on doi

In [9]:
m_parsing_yes_doi = m_parsing[~m_parsing['doi'].isna()]

m_parsing_aug2 = pd.merge(m_parsing_yes_doi, m_rr, on='doi', how='left', suffixes=('_prs', '_rr'))
m_parsing_aug2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  75 non-null     object 
 1   title_prs           71 non-null     object 
 2   date                24 non-null     object 
 3   doi                 75 non-null     object 
 4   path                75 non-null     object 
 5   source              75 non-null     object 
 6   reconstructed_date  75 non-null     object 
 7   year_prs            75 non-null     int64  
 8   title_in_prs        71 non-null     object 
 9   PMID                57 non-null     float64
 10  arXiv ID            0 non-null      float64
 11  title_rr            61 non-null     object 
 12  Abstract            59 non-null     object 
 13  Authors             61 non-null     object 
 14  Journal             59 non-null     object 
 15  year_rr             60 non-null     float64
 16  title_in_r

### merging the datasets

In [10]:
# join the partially augumented dfs
m_parsing_aug_complete = pd.concat([m_parsing_aug1, m_parsing_aug2], ignore_index=True)
# drop garbage columns
m_parsing_aug_complete = m_parsing_aug_complete.drop(columns=['matches', 'n_matches', 'arXiv ID'])
m_parsing_aug_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  194 non-null    object 
 1   title_prs           190 non-null    object 
 2   date                31 non-null     object 
 3   path                194 non-null    object 
 4   source              194 non-null    object 
 5   reconstructed_date  194 non-null    object 
 6   year_prs            194 non-null    int64  
 7   title_in_prs        190 non-null    object 
 8   title_in_rr         137 non-null    object 
 9   doi                 140 non-null    object 
 10  PMID                99 non-null     float64
 11  title_rr            137 non-null    object 
 12  Abstract            120 non-null    object 
 13  Authors             137 non-null    object 
 14  Journal             122 non-null    object 
 15  year_rr             136 non-null    float64
dtypes: float

In [11]:
assert m_parsing_aug_complete['id'].nunique() == len(m_parsing_aug_complete)

In [13]:
non_overlap = set(m_parsing['id'].tolist()) - set(m_parsing_aug_complete['id'].tolist())
assert len(m_parsing) - len(m_parsing_aug_complete) == len(non_overlap)

m_parsing_missing = m_parsing[m_parsing['id'].isin(non_overlap)]
m_parsing_missing = (m_parsing_missing
                     .drop(columns=['title', 'doi', 'title_in_prs'])
                     .rename(columns={'year': 'year_prs'})
                     )

m_parsing_missing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77 entries, 0 to 257
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  77 non-null     object
 1   date                1 non-null      object
 2   path                77 non-null     object
 3   source              77 non-null     object
 4   reconstructed_date  77 non-null     object
 5   year_prs            77 non-null     int64 
dtypes: int64(1), object(5)
memory usage: 4.2+ KB


In [14]:
m_parsing_ = pd.concat([m_parsing_aug_complete, m_parsing_missing], ignore_index=True)
m_parsing_ = m_parsing_.reset_index(drop=True)
m_parsing_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  271 non-null    object 
 1   title_prs           190 non-null    object 
 2   date                32 non-null     object 
 3   path                271 non-null    object 
 4   source              271 non-null    object 
 5   reconstructed_date  271 non-null    object 
 6   year_prs            271 non-null    int64  
 7   title_in_prs        190 non-null    object 
 8   title_in_rr         137 non-null    object 
 9   doi                 140 non-null    object 
 10  PMID                99 non-null     float64
 11  title_rr            137 non-null    object 
 12  Abstract            120 non-null    object 
 13  Authors             137 non-null    object 
 14  Journal             122 non-null    object 
 15  year_rr             136 non-null    float64
dtypes: float

In [15]:
m_parsing_['year'] = None
m_parsing_['title'] = None

for i, row in m_parsing_.iterrows():
    # year
    if pd.isna(row['year_rr']):
        m_parsing_.loc[i, 'year'] = row['year_prs']
    else:
        m_parsing_.loc[i, 'year'] = row['year_rr']

    # title
    if pd.isna(row['title_rr']):
        m_parsing_.loc[i, 'title'] = row['title_prs']
    else:
        m_parsing_.loc[i, 'title'] = row['title_rr']

m_parsing_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  271 non-null    object 
 1   title_prs           190 non-null    object 
 2   date                32 non-null     object 
 3   path                271 non-null    object 
 4   source              271 non-null    object 
 5   reconstructed_date  271 non-null    object 
 6   year_prs            271 non-null    int64  
 7   title_in_prs        190 non-null    object 
 8   title_in_rr         137 non-null    object 
 9   doi                 140 non-null    object 
 10  PMID                99 non-null     float64
 11  title_rr            137 non-null    object 
 12  Abstract            120 non-null    object 
 13  Authors             137 non-null    object 
 14  Journal             122 non-null    object 
 15  year_rr             136 non-null    float64
 16  year    

In [16]:
(m_parsing_
    .drop(columns=['title_in_rr', 'title_in_prs', 'year_prs', 'year_rr', 'title_prs', 'title_rr'])
    .drop(columns=['date', 'reconstructed_date'])
).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        271 non-null    object 
 1   path      271 non-null    object 
 2   source    271 non-null    object 
 3   doi       140 non-null    object 
 4   PMID      99 non-null     float64
 5   Abstract  120 non-null    object 
 6   Authors   137 non-null    object 
 7   Journal   122 non-null    object 
 8   year      271 non-null    object 
 9   title     192 non-null    object 
dtypes: float64(1), object(9)
memory usage: 21.3+ KB


In [None]:
# save m_parsing_

### it worked

There are 272 good documents (2 are broken & can't be openned).  
Out of those, 271 were successfully parsed with `scipdf`.

Now we have full metadata specification for 140 documents & valid year for all the documents.

In [17]:
qual = pd.read_csv(os.path.join(PROCESSED_PATH, 'meta_publications_quality.csv'))

In [21]:
# remove rows with nan in 'text' column
# tp for text present
qual_tp = qual.dropna(subset=['text'])
qual_tp.groupby('passed_quality_check').size()

passed_quality_check
False     68
True     322
dtype: int64

In [25]:
qual_tp.query('passed_quality_check == False')['text']

0      Recent studies on mentalizing have shown that ...
1      Introduction\nTheory of Mind is defined as the...
51     \nIn the same way that a literal tool such as ...
71     Introduction\nWhen a child or adult starts to ...
73     \nRecent research has confirmed that dyslexia ...
                             ...                        
515    \n|\n\n \n\n \n\nCopniion, 50 (198) 115-122\n(...
523    \n \n\n \n\nDyslexia: can we have a shared\nth...
525    \nCHAPTER 13,\n\nCausal Modeling: A Structural...
529    \n[SPECIFIC SPELLING PROBLENS\n\nee Festn\n\nI...
537    \nKlaus B.Giinther/ Hartmut Giinther (Hg.)\nSc...
Name: text, Length: 68, dtype: object

In [26]:
qual_tp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 390 entries, 0 to 541
Data columns (total 79 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Unnamed: 0                              390 non-null    int64  
 1   pub_id                                  390 non-null    object 
 2   title                                   276 non-null    object 
 3   date                                    51 non-null     object 
 4   doi                                     115 non-null    object 
 5   path                                    390 non-null    object 
 6   source                                  390 non-null    object 
 7   reconstructed_date                      390 non-null    object 
 8   lang                                    390 non-null    object 
 9   text                                    390 non-null    object 
 10  flesch_reading_ease                     390 non-null    float64
 11