In [None]:
import numpy as np
import pandas as pd
import scipy

from IPython.display import clear_output

import sys
sys.path.append('../../../../Documents/GitHub/gustav/src/')

from gustav import ebi, ncbi, nlm, biogrid, nih, openalex
from gustav import publications
from gustav import github
from gustav import access_framework
from gustav import mapper

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../general/src/')
from manuscript import export
from manuscript import inout
from manuscript import datasets
from manuscript import tools

from sklearn.metrics import auc
from scipy.stats import fisher_exact
pd.options.display.precision = 3
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = 20

import re
import matplotlib.dates as mdates

In [None]:
%%time
# load all works to do proper countby
works = openalex.works(
            dataset = 'main', columns=['work_id', 'type', 'year'])

In [None]:
# Keep only journal articles and proceedings articles.
works = works[works['type'].isin(['journal-article', 'proceedings-article'])]

In [None]:
%%time
# load all works to do proper countby
works = openalex.works(
            dataset = 'main', 
    columns=['source_id', 'type', 'year', 'work_id'], 
    filters={'work_id':works['work_id']})

In [None]:
%%time
# Save the filtered article table as CSV; parquet is not used because
# (per the original note) the written parquet file would not load back in.
works.to_csv('../data/openalex_articles_230328.csv', index=False)

# Identify indexing timeline of journals

In [None]:
%%time
sources = openalex.sources(
    dataset = 'main'
    )  
sources = sources.explode('issn').explode('issn_l')

In [None]:
scopus_deindexed = pd.read_csv('~/OneDrive - Northwestern University/aging_paper_mills/materials/general/manually_curated/' + \
            'scopus_discontinued_240301.csv',
                          encoding='latin', header=1)

In [None]:
scopus_deindexed = scopus_deindexed.rename(columns={'Print ISSN':'Print-ISSN', 'EISSN':'E-ISSN'})

In [None]:
# Scopus list of indexed sources; column names are on the first row.
scopus_indexed = pd.read_csv(
    '~/OneDrive - Northwestern University/aging_paper_mills/materials/general/manually_curated/'
    'indexed_by_scopus_240301.csv',
    encoding='latin',
    header=0,
)

In [None]:
# Insert a hyphen after the fourth character of every Scopus ISSN
# (presumably to match the format of sources['issn'] - confirm).
# Missing values pass through astype(str) and end up as the text 'nan-'.
for scopus_table in (scopus_indexed, scopus_deindexed):
    for issn_column in ('Print-ISSN', 'E-ISSN'):
        issn_text = scopus_table[issn_column].astype(str)
        scopus_table[issn_column] = issn_text.str[:4] + '-' + issn_text.str[4:]

In [None]:
# Web of Science Core Collection journal history (licensed Clarivate file).
wos_deindexed = pd.read_csv(
    '~/OneDrive - Northwestern University/aging_paper_mills/materials/LICENSED/from_clarivate_230224/'
    'WoS_Core_Journal_History_2021_unfiltered.csv',
    encoding='latin',
    header=0,
)

In [None]:
# A source counts as Scopus-indexed when either of its ISSNs appears in either
# Scopus ISSN column, or its lower-cased name matches a Scopus source title.
scopus_issns = pd.concat([scopus_indexed['Print-ISSN'], scopus_indexed['E-ISSN']])
scopus_titles = scopus_indexed['Source Title'].str.lower()
sources['scopus'] = (
    sources['issn'].isin(scopus_issns)
    | sources['issn_l'].isin(scopus_issns)
    | sources['display_name'].str.lower().isin(scopus_titles)
)

In [None]:
# Blank out the free-text placeholder so only real years remain in 'Year'.
# The row mask and the column must go in ONE .loc call: the previous chained
# form (.loc[mask]['Year'] = ...) assigned to a temporary copy and left
# scopus_deindexed unchanged.
scopus_deindexed.loc[scopus_deindexed['Year'] == 'Unable to determine', 'Year'] = np.nan

In [None]:
# Lookup tables from print ISSN, e-ISSN and lower-cased title to the 'Year'
# recorded in the Scopus discontinued list.
scopus_years = scopus_deindexed['Year']
scopus_titles_lower = scopus_deindexed[
    'Source Title (newly added titles are highlighted in red)'
].str.lower()

issn_year_scopus_dict = dict(zip(scopus_deindexed['Print-ISSN'], scopus_years))
eissn_year_scopus_dict = dict(zip(scopus_deindexed['E-ISSN'], scopus_years))
name_year_scopus_dict = dict(zip(scopus_titles_lower, scopus_years))

In [None]:
def _scopus_last_year(keys, year_by_key):
    """Map each key to its Scopus 'Year'; -1 marks no match or no usable year."""
    years = keys.apply(lambda key: year_by_key.get(key))
    # 'Year' may still hold free text such as 'Unable to determine'; coerce
    # anything non-numeric to NaN so the integer cast below cannot fail.
    return pd.to_numeric(years, errors='coerce').fillna(-1)


# Each source ISSN is tried against both the print-ISSN and the e-ISSN table.
sources['scopus_last_year_issn'] = _scopus_last_year(sources['issn'], issn_year_scopus_dict)
sources['scopus_last_year_eissn'] = _scopus_last_year(sources['issn'], eissn_year_scopus_dict)
sources['scopus_last_year_issn_l'] = _scopus_last_year(sources['issn_l'], issn_year_scopus_dict)
sources['scopus_last_year_eissn_l'] = _scopus_last_year(sources['issn_l'], eissn_year_scopus_dict)
# name_year_scopus_dict is keyed by lower-cased titles, so the lookup key must
# be lower-cased too (as the WoS name lookup already does); otherwise names
# containing capitals never match.
sources['scopus_last_year_name'] = _scopus_last_year(sources['display_name'].str.lower(),
                                                     name_year_scopus_dict)
# Most recent year across all five match routes (-1 when nothing matched).
sources['scopus_last_year'] = np.max(sources[['scopus_last_year_issn', 'scopus_last_year_eissn',
                                              'scopus_last_year_issn_l', 'scopus_last_year_eissn_l',
                                              'scopus_last_year_name']].astype(int).values, axis=1)

In [None]:
# A source counts as WoS-listed when either of its ISSNs appears in either WoS
# ISSN column, or its lower-cased name matches a WoS title. This uses the full
# journal history, before the table is reduced to inactive journals.
wos_issns = pd.concat([wos_deindexed['ISSN'], wos_deindexed['EISSN']])
wos_titles = wos_deindexed['Title'].str.lower()
sources['wos'] = (
    sources['issn'].isin(wos_issns)
    | sources['issn_l'].isin(wos_issns)
    | sources['display_name'].str.lower().isin(wos_titles)
)

In [None]:
# From here on, wos_deindexed holds only journals whose 'Active' flag is 'No'.
wos_deindexed = wos_deindexed.loc[wos_deindexed['Active'].eq('No')].copy()

In [None]:
# Normalise the four per-index columns to comma-separated text: missing cells
# become ',', and '-' and '.' separators are replaced by ','.
for col in ['SCIE', 'SSCI', 'AHCI', 'ESCI']:
    as_text = wos_deindexed[col].fillna(',').astype(str)
    wos_deindexed[col] = (
        as_text
        .str.replace('-', ',', regex=False)
        .str.replace('.', ',', regex=False)
    )

In [None]:
# Combine the four per-index columns into a single 'years' value per row;
# any cell still missing is filled with ',' before the row-wise sum.
# NOTE(review): presumably the row-wise sum concatenates these text columns - confirm.
wos_deindexed['years'] = wos_deindexed[['SCIE', 'SSCI', 'AHCI',
       'ESCI']].fillna(',').sum(axis=1)

In [None]:
wos_deindexed['years'] = wos_deindexed['years'].astype(str).apply(lambda x: x.split(','))

In [None]:
wos_deindexed['years'] = wos_deindexed['years'].astype(str).apply(lambda x: x[0:4])

In [None]:
wos_deindexed['years'] = wos_deindexed['years'].astype(int)

In [None]:
# For each (Title, ISSN, EISSN) keep only the row with the largest 'years'.
wos_deindexed = wos_deindexed.sort_values('years', ascending=False)
wos_deindexed = wos_deindexed.drop_duplicates(subset=['Title', 'ISSN', 'EISSN'], keep='first')

In [None]:
# Lookup tables from ISSN, e-ISSN and lower-cased title to the parsed WoS year.
wos_years = wos_deindexed['years']
issn_year_wos_dict = dict(zip(wos_deindexed['ISSN'], wos_years))
eissn_year_wos_dict = dict(zip(wos_deindexed['EISSN'], wos_years))
name_year_wos_dict = dict(zip(wos_deindexed['Title'].str.lower(), wos_years))

In [None]:
# (target column, lookup keys, year table). Each source ISSN is tried against
# both the ISSN and the e-ISSN table; names are matched lower-cased.
wos_lookups = [
    ('wos_last_year_issn', sources['issn'], issn_year_wos_dict),
    ('wos_last_year_eissn', sources['issn'], eissn_year_wos_dict),
    ('wos_last_year_issn_l', sources['issn_l'], issn_year_wos_dict),
    ('wos_last_year_eissn_l', sources['issn_l'], eissn_year_wos_dict),
    ('wos_last_year_name', sources['display_name'].str.lower(), name_year_wos_dict),
]
for target_col, lookup_keys, year_table in wos_lookups:
    # -1 marks "no match in this table".
    sources[target_col] = lookup_keys.apply(lambda key: year_table.get(key)).fillna(-1)

# Most recent year over all match routes.
wos_year_columns = [entry[0] for entry in wos_lookups]
sources['wos_last_year'] = np.max(sources[wos_year_columns].values, axis=1)

In [None]:
# Named "pubmed" throughout, but per the original note this table really
# represents MEDLINE, a core subset of PubMed.
pubmed_df = pd.read_parquet(
    '~/OneDrive - Northwestern University/aging_paper_mills/materials/general/manually_curated/'
    'nlm_results_parsed_230419.parquet'
)

In [None]:
# First explode: one row per entry of the 'pubmed' and 'medline' columns.
pubmed_df = pubmed_df.explode('pubmed')
pubmed_df = pubmed_df.explode('medline')

In [None]:
# Where an entry is a list whose second element ends with '-', overwrite that
# element with 'current'. Non-list entries are left untouched.
new_pubmed_values = list(pubmed_df['pubmed'].values)
for entry in new_pubmed_values:
    if type(entry) == list and entry[1].endswith('-'):
        entry[1] = 'current'

pubmed_df['pubmed'] = new_pubmed_values

In [None]:
# Same treatment for the 'medline' column: a list entry whose second element
# ends with '-' gets that element replaced by 'current'.
new_pubmed_values = list(pubmed_df['medline'].values)
for entry in new_pubmed_values:
    if type(entry) == list and entry[1].endswith('-'):
        entry[1] = 'current'

pubmed_df['medline'] = new_pubmed_values

In [None]:
# Spot check of a single journal. The title literal keeps its leading space and
# trailing period because titles are only cleaned in a later cell.
pubmed_df[pubmed_df['title'] == ' Computational intelligence and neuroscience.']

In [None]:
# Second explode: split the remaining list entries into one value per row.
pubmed_df = pubmed_df.explode('pubmed')
pubmed_df = pubmed_df.explode('medline')

In [None]:
pubmed_df['issn'] = pubmed_df['issn'].apply(lambda x: x.split(';'))
pubmed_df = pubmed_df.explode('issn')

In [None]:
pubmed_df['issn'] = pubmed_df['issn'].str.strip()

In [None]:
pubmed_df['issn'] = pubmed_df['issn'].apply(lambda x: x.split('(')[0].upper())

In [None]:
pubmed_df['title'] = pubmed_df['title'].str.rstrip('.').str.strip()

In [None]:
# Keep every title and ISSN as of now, before the later dropna/groupby steps
# shrink pubmed_df; these are used for the sources['pubmed'] membership flag.
pubmed_titles = pubmed_df['title']
pubmed_issns = pubmed_df['issn']

In [None]:
# Turn the coverage strings into numbers: strip '-' from both ends, encode the
# open-ended marker 'current' as the sentinel 10000, then cast to float.
for coverage_col in ('pubmed', 'medline'):
    cleaned = pubmed_df[coverage_col].str.strip('-').replace({'current': '10000'})
    pubmed_df[coverage_col] = cleaned.astype(float)

In [None]:
# Drop rows lacking either coverage value, then collapse to one row per ISSN,
# keeping the column-wise maximum.
rows_with_coverage = pubmed_df.dropna(subset=['pubmed', 'medline'])
pubmed_df = rows_with_coverage.groupby('issn').max().reset_index()

In [None]:
pubmed_df['last_year'] = np.max(pubmed_df[['pubmed', 'medline']], axis=1)

In [None]:
pubmed_df = pubmed_df[pubmed_df['issn'] != ''].copy()

In [None]:
# Lookup tables from ISSN and from cleaned title to last_year.
pubmed_last_years = pubmed_df['last_year']
issn_year_pubmed_dict = dict(zip(pubmed_df['issn'], pubmed_last_years))
name_year_pubmed_dict = dict(zip(pubmed_df['title'], pubmed_last_years))

In [None]:
# A source is flagged when either ISSN or its display name (exact match)
# appears in the title/ISSN snapshots taken from pubmed_df.
issn_match = sources['issn'].isin(pubmed_issns)
issn_l_match = sources['issn_l'].isin(pubmed_issns)
title_match = sources['display_name'].isin(pubmed_titles)
sources['pubmed'] = issn_match | issn_l_match | title_match

In [None]:
# (target column, key column in sources, year table).
pubmed_lookups = [
    ('pubmed_last_year_issn', 'issn', issn_year_pubmed_dict),
    ('pubmed_last_year_issn_l', 'issn_l', issn_year_pubmed_dict),
    ('pubmed_last_year_name', 'display_name', name_year_pubmed_dict),
]
for target_col, key_col, year_table in pubmed_lookups:
    # -1 marks "no match in this table".
    sources[target_col] = sources[key_col].apply(lambda key: year_table.get(key)).fillna(-1)

# Most recent year over the three match routes.
pubmed_year_columns = [entry[0] for entry in pubmed_lookups]
sources['pubmed_last_year'] = np.max(sources[pubmed_year_columns].values, axis=1)

In [None]:
# 10000 is the sentinel substituted for 'current' earlier; such sources have no
# end year, so give them the "no year" marker -1.
open_ended = sources['pubmed_last_year'] == 10000
sources.loc[open_ended, 'pubmed_last_year'] = -1.0

In [None]:
# One row per source: sources was exploded to one row per ISSN, so take the
# maximum of every column (i.e. the most recent year) within each source_id.
index_columns = ['source_id', 'display_name', 'scopus', 'scopus_last_year',
                 'wos', 'wos_last_year', 'pubmed', 'pubmed_last_year']
sources_index = sources[index_columns].groupby('source_id').max()

In [None]:
sources_index = sources_index.reset_index().replace(-1.0, np.nan)

In [None]:
sources_index['scopus'] = sources_index['scopus'] | ~sources_index['scopus_last_year'].isna()

In [None]:
sources_index = sources_index.set_index('source_id')

In [None]:
issn_groupby = sources.groupby('source_id')['issn'].apply(list)
issn_l_groupby = sources.groupby('source_id')['issn_l'].apply(list)

In [None]:
sources_index['issn_openalex'] = issn_groupby
sources_index['issn_l_openalex'] = issn_l_groupby

In [None]:
# Turn source_id back into a regular column so it is written to the CSV
# (to_csv is called with index=False).
sources_index = sources_index.reset_index()

In [None]:
# Persist the per-source indexing table.
sources_index.to_csv('../data/sources_scopus_wos_indexing_240418.csv', index=False)

# Merging work info and source info

In [None]:
%%time
# Reload the article table written earlier in this notebook.
works = pd.read_csv('../data/openalex_articles_230328.csv')

In [None]:
# Reload the per-source indexing table written earlier in this notebook.
sources_index = pd.read_csv('../data/sources_scopus_wos_indexing_240418.csv')

In [None]:
%%time
works = pd.merge(works, sources_index, how='left', on='source_id')

In [None]:
# Flag works published in or after the last indexed year of their source.
# A missing last year compares as False.
for database in ('scopus', 'wos', 'pubmed'):
    works['after_' + database + '_deindex'] = works['year'] >= works[database + '_last_year']

# How many ARDA-listed journals were indexed in Scopus in 2020 and later de-indexed? (13/39)
Compare to the baseline rate with a two-sided z-test of proportions.

In [None]:
%%time
print(len(works[(works['year'] == 2020) & (works['scopus'] == True)]['source_id'].unique()))

In [None]:
%%time
print(len(works[(works['year'] == 2020) & \
                (works['scopus'] == True) & \
                (works['scopus_last_year'] >= 2020)]['source_id'].unique()))

In [None]:
import statsmodels.stats.proportion
# Two-sample z-test of proportions: 172 of 28228 versus 13 of 39.
# NOTE(review): the counts are hard-coded. Presumably 28228 and 172 are the
# values printed by the two cells above and 13/39 is the ARDA figure quoted in
# the section heading - confirm before re-running on new data.
statsmodels.stats.proportion.proportions_ztest(count=[172,13], nobs=[28228, 39])

# Start building year_df

In [None]:
%%time
year_df = works[['work_id', 'year']].groupby('year').count()
year_df = year_df.rename(columns={'work_id':'total_works'})

In [None]:
%%time
temp = works[['work_id', 'scopus', 'year']].groupby(['scopus', 'year']).count()

In [None]:
temp = temp.reset_index()
temp = temp[temp['scopus'] == True].set_index('year').rename(columns={'work_id':'n_scopus'})
year_df['n_scopus'] = temp['n_scopus']

In [None]:
%%time
temp = works[['work_id', 'wos', 'year']].groupby(['wos', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['wos'] == True].set_index('year').rename(columns={'work_id':'n_wos'})
year_df['n_wos'] = temp['n_wos']

In [None]:
%%time
temp = works[['work_id', 'pubmed', 'year']].groupby(['pubmed', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['pubmed'] == True].set_index('year').rename(columns={'work_id':'n_pubmed'})
year_df['n_pubmed'] = temp['n_pubmed']

In [None]:
%%time
temp = works[['work_id', 'after_wos_deindex', 'year']].groupby(['after_wos_deindex', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['after_wos_deindex'] == True].set_index('year').rename(columns={'work_id':'n_wos_after_deindex'})
year_df['n_wos_after_deindex'] = temp['n_wos_after_deindex']

In [None]:
%%time
temp = works[['work_id', 'after_scopus_deindex', 'year']].groupby(['after_scopus_deindex', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['after_scopus_deindex'] == True].set_index('year').rename(columns={'work_id':'n_scopus_after_deindex'})
year_df['n_scopus_after_deindex'] = temp['n_scopus_after_deindex']

In [None]:
%%time
temp = works[['work_id', 'after_pubmed_deindex', 'year']].groupby(['after_pubmed_deindex', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['after_pubmed_deindex'] == True].set_index('year').rename(columns={'work_id':'n_pubmed_after_deindex'})
year_df['n_pubmed_after_deindex'] = temp['n_pubmed_after_deindex']

In [None]:
# A work belongs to a de-indexed source when the source is flagged for the
# database AND has a recorded last year there.
for database in ('scopus', 'wos', 'pubmed'):
    works['deindexed_' + database] = works[database] & works[database + '_last_year'].notna()

In [None]:
%%time
temp = works[['work_id', 'deindexed_scopus', 'year']].groupby(['deindexed_scopus', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['deindexed_scopus'] == True].set_index('year').rename(columns={'work_id':'n_scopus_deindex'})
year_df['n_scopus_deindex'] = temp['n_scopus_deindex']

In [None]:
%%time
temp = works[['work_id', 'deindexed_wos', 'year']].groupby(['deindexed_wos', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['deindexed_wos'] == True].set_index('year').rename(columns={'work_id':'n_wos_deindex'})
year_df['n_wos_deindex'] = temp['n_wos_deindex']

In [None]:
%%time
temp = works[['work_id', 'deindexed_pubmed', 'year']].groupby(['deindexed_pubmed', 'year']).count()
temp = temp.reset_index()
temp = temp[temp['deindexed_pubmed'] == True].set_index('year').rename(columns={'work_id':'n_pubmed_deindex'})
year_df['n_pubmed_deindex'] = temp['n_pubmed_deindex']

Note: Counting "actively publishing" journals only

In [None]:
%%time
temp = works[works['wos'] & ~(works['after_wos_deindex'])].groupby(['year'])['source_id'].nunique()
temp = pd.DataFrame(temp).reset_index()
temp = temp.set_index('year')
temp = temp.rename(columns={'source_id':'n_wos_journals'})
year_df['n_wos_journals'] = temp['n_wos_journals']

In [None]:
%%time
temp = works[works['scopus'] & ~(works['after_scopus_deindex'])].groupby(['year'])['source_id'].nunique()
temp = pd.DataFrame(temp).reset_index()
temp = temp.set_index('year')
temp = temp.rename(columns={'source_id':'n_scopus_journals'})
year_df['n_scopus_journals'] = temp['n_scopus_journals']

In [None]:
%%time
temp = works[works['pubmed'] & ~(works['after_pubmed_deindex'])].groupby(['year'])['source_id'].nunique()
temp = pd.DataFrame(temp).reset_index()
temp = temp.set_index('year')
temp = temp.rename(columns={'source_id':'n_pubmed_journals'})
year_df['n_pubmed_journals'] = temp['n_pubmed_journals']

In [None]:
%%time
temp = works.groupby(['year'])['source_id'].nunique()
temp = pd.DataFrame(temp).reset_index()
temp = temp.set_index('year')
temp = temp.rename(columns={'source_id':'n_journals'})
year_df['n_journals'] = temp['n_journals']

In [None]:
# Redundant safeguard: 'work_id' was already renamed to 'total_works' when
# year_df was created, so this rename finds nothing to change.
year_df = year_df.rename(columns={'work_id':'total_works'})

In [None]:
# Years missing from a per-year count were left as NaN by the index-aligned
# assignments above; record them as 0.
year_df = year_df.fillna(0.0)

# Count paper mill products, PubPeer-commented articles, and retractions

In [None]:
# Table of paper-mill DOIs; the 'set' column is used below to exclude subsets.
paper_mill_df = pd.read_csv('../data/paper_mill_dois_240418.csv')

In [None]:
# Occurrences of each DOI in paper_mill_df; the length of the result is the
# number of unique DOIs.
paper_mill_df['doi'].value_counts()

In [None]:
# Same, after excluding the 'pps_tortured' and 'sem_misid' sets.
paper_mill_df[~paper_mill_df['set'].isin(['pps_tortured', 'sem_misid'])]['doi'].value_counts()

In [None]:
# Retraction Watch database -> list of DOI URLs for retracted papers.
rw_db = pd.read_csv('../data/240304_retraction_watch_db.csv', encoding='latin')
retraction_rows = rw_db[rw_db['RetractionNature'] == 'Retraction']
# Lower-case and de-duplicate, then drop the 'unavailable' / 'nan' placeholders.
original_dois = np.unique([str(doi).lower() for doi in retraction_rows['OriginalPaperDOI'].values])
original_dois = original_dois[(original_dois != 'unavailable') & (original_dois != 'nan')]
retracted_doi = ['https://doi.org/' + doi for doi in original_dois]

In [None]:
# '~' keeps the path independent of the user name, matching the other
# OneDrive reads in this notebook (was a hard-coded C:/Users/... path).
pubpeer_df = pd.read_csv('~/OneDrive - Northwestern University/pubpeer_comments_20240201.csv')
# 'ids' packs several identifiers separated by ',' - one row per identifier.
pubpeer_df['id'] = pubpeer_df['ids'].astype(str).apply(lambda x: x.split(','))
pubpeer_df = pubpeer_df.explode('id')
# Paper link = the 'Pubpeer Link' URL with its last path segment removed.
pubpeer_df['paper_link'] = pubpeer_df['Pubpeer Link'].astype(str).apply(lambda x: '/'.join(x.split('/')[:-1])).values
# Normalise the identifiers before building DOI URLs: trim whitespace left by
# the ',' split, lower-case, drop empty strings and the 'nan' text that
# astype(str) produces for missing 'ids', and de-duplicate (as is done for the
# retracted DOIs).
pubpeer_ids = pubpeer_df['id'].str.strip().str.lower()
pubpeer_ids = pubpeer_ids[~pubpeer_ids.isin(['', 'nan'])].unique()
pubpeer_doi = ['https://doi.org/' + x for x in pubpeer_ids]

In [None]:
# year_df is created by groupby('year') above and is therefore already indexed
# by year; an unconditional set_index('year') raises KeyError on a fresh
# Restart & Run All. Only move 'year' into the index when it is a column
# (e.g. when year_df was reloaded from a CSV).
if 'year' in year_df.columns:
    year_df = year_df.set_index('year')

### Retractions

In [None]:
%%time
works = openalex.works(
            dataset = 'main', 
            filters={'doi': retracted_doi})

In [None]:
works['doi_no_link'] = works['doi'].apply(lambda x: x.split('https://doi.org/')[1]).values

In [None]:
works = pd.merge(works, sources_index, how='left', on='source_id')

In [None]:
works['after_scopus_deindex'] = works['year'] >= works['scopus_last_year']
works['after_wos_deindex'] = works['year'] >= works['wos_last_year']
works['after_pubmed_deindex'] = works['year'] >= works['pubmed_last_year']

In [None]:
works_slice = works.copy()

In [None]:
temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'work_id':'n_retracted'})
year_df['n_retracted'] = temp['n_retracted']

In [None]:
temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'source_id':'n_retracted_journals'})
year_df['n_retracted_journals'] = temp['n_retracted_journals']

In [None]:
temp = works_slice[works_slice['scopus'] & ~works_slice['after_scopus_deindex']].groupby('year').nunique()
temp = temp.rename(columns={'work_id':'n_scopus_retracted', 
                            'source_id':'n_scopus_retracted_journals'})
year_df['n_scopus_retracted'] = temp['n_scopus_retracted']
year_df['n_scopus_retracted_journals'] = temp['n_scopus_retracted_journals']

In [None]:
temp = works_slice[works_slice['wos'] & ~works_slice['after_wos_deindex']].groupby('year').nunique()
temp = temp.rename(columns={'work_id':'n_wos_retracted', 
                            'source_id':'n_wos_retracted_journals'})
year_df['n_wos_retracted'] = temp['n_wos_retracted']
year_df['n_wos_retracted_journals'] = temp['n_wos_retracted_journals']

In [None]:
temp = works_slice[works_slice['pubmed'] & ~works_slice['after_pubmed_deindex']].groupby('year').nunique()
temp = temp.rename(columns={'work_id':'n_pubmed_retracted', 
                            'source_id':'n_pubmed_retracted_journals'})
year_df['n_pubmed_retracted'] = temp['n_pubmed_retracted']
year_df['n_pubmed_retracted_journals'] = temp['n_pubmed_retracted_journals']

### Paper mill products

In [None]:
# DOI URLs (lower-cased) for every row of the paper-mill table.
paper_mill_dois = ('https://doi.org/' + paper_mill_df['doi'].astype(str).str.lower()).tolist()

In [None]:
%%time
works = openalex.works(
            dataset = 'main', 
            filters={'doi': paper_mill_dois})

In [None]:
works['doi_no_link'] = works['doi'].apply(lambda x: x.split('https://doi.org/')[1]).values

In [None]:
works = pd.merge(works, sources_index, how='left', on='source_id')

In [None]:
works['after_scopus_deindex'] = works['year'] >= works['scopus_last_year']
works['after_wos_deindex'] = works['year'] >= works['wos_last_year']
works['after_pubmed_deindex'] = works['year'] >= works['pubmed_last_year']

In [None]:
works_slice = works.copy()

In [None]:
# Distinct paper-mill works per year. Count unique work_ids rather than rows:
# every other paper-mill metric in this section is computed on distinct works
# (nunique / drop_duplicates('work_id')), so a row count would inflate this
# total whenever a work appears more than once in `works`.
temp = works_slice.groupby('year')[['work_id']].nunique()
temp = temp.rename(columns={'work_id':'n_paper_mill'})
year_df['n_paper_mill'] = temp['n_paper_mill']

In [None]:
# Distinct sources with at least one paper-mill work per year.
temp = works_slice.groupby('year')[['source_id']].nunique()
temp = temp.rename(columns={'source_id': 'n_paper_mill_journals'})
year_df['n_paper_mill_journals'] = temp['n_paper_mill_journals']

In [None]:
# Paper-mill works that are also in the retracted-DOI list, per year.
unique_works = works.drop_duplicates('work_id')
works_slice = unique_works[unique_works['doi'].isin(retracted_doi)]
temp = works_slice.groupby('year')[['work_id']].nunique()
temp = temp.rename(columns={'work_id': 'n_paper_mill_retracted'})
year_df['n_paper_mill_retracted'] = temp['n_paper_mill_retracted']

In [None]:
works_slice = works.drop_duplicates('work_id')
works_slice = works_slice[works_slice['wos'] & ~works_slice['after_wos_deindex']]

temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'source_id':'n_wos_paper_mill_journals', 
                            'work_id':'n_paper_mill_wos'})
year_df['n_wos_paper_mill_journals'] = temp['n_wos_paper_mill_journals']
year_df['n_paper_mill_wos'] = temp['n_paper_mill_wos']

In [None]:
works_slice = works.drop_duplicates('work_id')
works_slice = works_slice[works_slice['scopus'] & ~works_slice['after_scopus_deindex']]

temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'source_id':'n_scopus_paper_mill_journals', 
                            'work_id':'n_paper_mill_scopus'})
year_df['n_scopus_paper_mill_journals'] = temp['n_scopus_paper_mill_journals']
year_df['n_paper_mill_scopus'] = temp['n_paper_mill_scopus']

In [None]:
works_slice = works.drop_duplicates('work_id')
works_slice = works_slice[works_slice['pubmed'] & ~works_slice['after_pubmed_deindex']]

temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'source_id':'n_pubmed_paper_mill_journals', 
                            'work_id':'n_paper_mill_pubmed'})
year_df['n_pubmed_paper_mill_journals'] = temp['n_pubmed_paper_mill_journals']
year_df['n_paper_mill_pubmed'] = temp['n_paper_mill_pubmed']

### PubPeer-commented

In [None]:
%%time
works = openalex.works(
            dataset = 'main', 
            filters={'doi': pubpeer_doi})

In [None]:
works['doi_no_link'] = works['doi'].apply(lambda x: x.split('https://doi.org/')[1]).values

In [None]:
works = pd.merge(works, sources_index, how='left', on='source_id')

In [None]:
works['after_scopus_deindex'] = works['year'] >= works['scopus_last_year']
works['after_wos_deindex'] = works['year'] >= works['wos_last_year']
works['after_pubmed_deindex'] = works['year'] >= works['pubmed_last_year']

In [None]:
works_slice = works.copy()

In [None]:
# Distinct PubPeer-commented works per year. Count unique work_ids rather than
# rows so this total is consistent with the other PubPeer metrics, which are
# computed on distinct works; the temporary column is also named for PubPeer
# instead of the copy-pasted 'n_paper_mill'.
temp = works_slice.groupby('year')[['work_id']].nunique()
temp = temp.rename(columns={'work_id':'n_pubpeer'})
year_df['n_pubpeer'] = temp['n_pubpeer']

In [None]:
# Distinct sources with at least one PubPeer-commented work per year.
temp = works_slice.groupby('year')[['source_id']].nunique()
year_df['n_pubpeer_journals'] = temp['source_id']

In [None]:
# PubPeer-commented works that are also in the retracted-DOI list, per year.
unique_works = works.drop_duplicates('work_id')
works_slice = unique_works[unique_works['doi'].isin(retracted_doi)]
temp = works_slice.groupby('year')[['work_id']].nunique()
year_df['n_pubpeer_retracted'] = temp['work_id']

In [None]:
works_slice = works.drop_duplicates('work_id')
works_slice = works_slice[works_slice['wos'] & ~works_slice['after_wos_deindex']]

temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'source_id':'n_wos_paper_mill_journals', 
                            'work_id':'n_paper_mill_wos'})
year_df['n_wos_pubpeer_journals'] = temp['n_wos_paper_mill_journals']
year_df['n_pubpeer_wos'] = temp['n_paper_mill_wos']

In [None]:
works_slice = works.drop_duplicates('work_id')
works_slice = works_slice[works_slice['scopus'] & ~works_slice['after_scopus_deindex']]

temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'source_id':'n_scopus_paper_mill_journals', 
                            'work_id':'n_paper_mill_scopus'})
year_df['n_scopus_pubpeer_journals'] = temp['n_scopus_paper_mill_journals']
year_df['n_pubpeer_scopus'] = temp['n_paper_mill_scopus']

In [None]:
works_slice = works.drop_duplicates('work_id')
works_slice = works_slice[works_slice['pubmed'] & ~works_slice['after_pubmed_deindex']]

temp = works_slice.groupby('year').nunique()
temp = temp.rename(columns={'source_id':'n_pubmed_paper_mill_journals', 
                            'work_id':'n_paper_mill_pubmed'})
year_df['n_pubmed_pubpeer_journals'] = temp['n_pubmed_paper_mill_journals']
year_df['n_pubpeer_pubmed'] = temp['n_paper_mill_pubmed']

In [None]:
# Years without any matching work were left as NaN by the index-aligned
# column assignments above; record them as 0.
year_df = year_df.fillna(0.0)

### Collect retractions as represented in databases
(using their respective web portals, collected manually in 240419_database_retractions_by_year.csv)

In [None]:
# For querying PubMed (Additional filters > MEDLINE)

import string
alphabet = string.ascii_lowercase

# Build a catch-all query that ORs together every lower-case letter and digit,
# each in double quotes: "a" OR "b" OR ... OR "9".
# Joining with ' OR ' avoids the dangling ' OR "' that the previous
# append-in-a-loop version left at the end of the string.
query_terms = list(alphabet) + [str(n) for n in range(10)]
query_str = ' OR '.join('"' + term + '"' for term in query_terms)

print(query_str)

In [None]:
# retractions as represented in database
database_retractions_df = pd.read_csv('../data/240419_database_retractions_by_year.csv')

In [None]:
year_df = pd.merge(year_df.reset_index(), database_retractions_df, on='year', how='left').fillna(0.0)

In [None]:
year_df.to_csv('../data/openalex_summary_by_year_240419.csv', index=False)