In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mplhep as hep
from io import BytesIO
import zstd

from datetime import datetime
import mplhep as hep

import os
hep.style.use(hep.style.CMS)
from typing import Callable
from matplotlib.colors import LogNorm

In [2]:
# Load the full record table: read the zstd-compressed pickle, decompress it
# in memory and hand the bytes to pandas.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# data.pkl.zst that comes from a trusted source.
with open('data.pkl.zst', mode='rb') as compressed_file:
    df: pd.DataFrame = pd.read_pickle(
        BytesIO(zstd.decompress(compressed_file.read()))
    )

In [3]:
def filter_df(df:pd.DataFrame, fn):
    """Return the rows of ``df`` for which ``fn`` evaluates to true.

    ``fn`` is called once per row with the named tuple produced by
    ``DataFrame.itertuples()`` (the index first, then one attribute per
    column), so it can inspect arbitrary Python objects stored in the cells.
    The rows are visited in a plain Python loop, one call per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    fn : callable
        Predicate taking one row tuple; its result is stored as a boolean.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in their original order and with their original
        index labels.
    """
    keep = np.fromiter(
        (fn(row) for row in df.itertuples()),
        dtype=np.bool_,
        count=len(df),
    )
    return df.iloc[keep]

In [4]:
# fdf = filter_df(df, lambda row: 'Experiment-HEP' in row.inspire_categories and row.earliest_date.year > 10)

In [5]:
# Subject labels; the per-subject cells below test each one for membership in
# a record's `inspire_categories`.
subjects = [
    "Experiment-HEP",
    "Phenomenology-HEP",
    "Experiment-Nucl",
    "Instrumentation",
    "Theory-Nucl",
    "Astrophysics",
    "Lattice",
    "Theory-HEP",
    "Other",
    "General Physics",
    "Computing",
    "Accelerators",
    "Data Analysis and Statistics",
    "Gravitation and Cosmology",
    "Quantum Physics",
    "Condensed Matter",
    "Math and Math Physics",
]

# Document-type labels.
# NOTE(review): presumably the values that occur in the `document_type`
# column; this list is not referenced by the cells shown here -- confirm.
document_types = [
    "published",
    "article",
    "conference paper",
    "review",
    "introductory",
    "lectures",
    "book chapter",
    "note",
    "thesis",
    "book",
    "report",
    "proceedings",
]


In [6]:
# all_dois = set()
# mask = np.zeros(len(df), dtype=np.bool_)
# for i, row in enumerate(df.itertuples()):
#     dois = row.dois
#     has_doi = False #or 'thesis' in row.document_type
#     for doi in dois:
#         has_doi = True
#         if doi in all_dois:
#             break
#         all_dois.add(doi)
#     else:
#         mask[i] = has_doi
# fdf = df.iloc[mask]

# len(fdf)

In [7]:
def _dated_after_1500(row):
    """True when the row's earliest_date lies in a year later than 1500."""
    return row.earliest_date.year > 1500


# Working sample: every record whose earliest date is after the year 1500.
# NOTE(review): presumably this drops placeholder / malformed dates -- confirm.
fdf = filter_df(df, _dated_after_1500)
len(fdf)

822779

In [8]:
# Calendar-year span [y0, y1] of the per-year count tables, and its length.
#
# The bounds come from the filtered frame `fdf`, because that is the frame the
# per-subject selections are drawn from; using the unfiltered `df` would start
# the table at the earliest year among records that were filtered out.
# The upper bound is the current year, extended to the latest year present in
# the data, so that `year - y0` is always a valid position in a length-`cnt`
# array even when a record is dated after today.
y0 = np.min(fdf.earliest_date).year
y1 = max(datetime.now().year, np.max(fdf.earliest_date).year)
cnt = y1 - y0 + 1

In [9]:
def _is_2020_hep_experiment_thesis(row):
    """True for a record dated 2020 that lists 'thesis' among its document
    types and 'Experiment-HEP' among its categories."""
    return (
        row.earliest_date.year == 2020
        and 'thesis' in row.document_type
        and 'Experiment-HEP' in row.inspire_categories
    )


# Spot check on the unfiltered frame: how many such records exist.
len(filter_df(df, _is_2020_hep_experiment_thesis))

488

In [10]:
from tqdm import tqdm

# For every subject, split the matching records of `fdf` into a thesis frame
# and an "others" frame.
#   df_arr[k]  -> [thesis_frame, others_frame] for subjects[k]
#   columns    -> (subject, 'thesis'), (subject, 'others') pairs, same order
columns = []
df_arr = []
for subject in tqdm(subjects):
    in_subject = filter_df(fdf, lambda row: subject in row.inspire_categories)
    # NOTE(review): this is an exact comparison with the set {'thesis'};
    # assuming the cells hold sets, a record carrying 'thesis' together with
    # another type is counted under 'others' -- confirm that is intended.
    is_thesis = np.array(in_subject.document_type == {'thesis'})
    df_arr.append([in_subject.iloc[is_thesis], in_subject.iloc[~is_thesis]])
    columns.extend([(subject, 'thesis'), (subject, 'others')])

  0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:39<00:00,  2.33s/it]


In [11]:

# For each minimum-citation cut: count records per calendar year for every
# subject (thesis vs. others) and save one figure per subject to
# /tmp/inspire/<cut>/<subject>.png.
# The cut is applied to the "others" records only; the thesis counts are the
# same for every cut.

# Column labels and year index are the same for every cut, so build them once.
# The labels go into a local name instead of rebinding the global `columns`,
# which keeps this cell idempotent when it is re-run.
column_index = pd.MultiIndex.from_tuples(columns)
year_index = np.arange(y0, y1 + 1)

for cut in [0,10,50,100,500]:

    # master_arr[subject, 0 = thesis / 1 = others, year - y0] -> record count
    master_arr = np.zeros((len(subjects), 2, cnt), dtype=np.int32)
    for i,subject in enumerate(tqdm(subjects)):
        df_thesis, df_others = df_arr[i]
        df_others = df_others.query(f'citation_count >= {cut}')
        years_thesis = np.array([x.year for x in df_thesis.earliest_date])
        years_others = np.array([x.year for x in df_others.earliest_date])

        loc_t, v_t = np.unique(years_thesis, return_counts=True)
        loc_o, v_o = np.unique(years_others, return_counts=True)
        # An empty selection gives a float array (np.array([])), which cannot
        # be used as an index -- hence the length guards.
        if len(loc_t):
            master_arr[i, 0][loc_t-y0] = v_t
        if len(loc_o):
            master_arr[i, 1][loc_o-y0] = v_o

    # Flatten (subject, kind) into columns; rows are years. The row order of
    # the reshape matches the (subject, 'thesis'), (subject, 'others') labels.
    _master_arr = master_arr.reshape(len(subjects)*2,-1).T
    df_len = pd.DataFrame(data=_master_arr, columns=column_index, index=year_index)

    os.makedirs(f'/tmp/inspire/{cut}', exist_ok=True)

    for subject in tqdm(subjects):
        fig,ax = plt.subplots()
        ax:plt.Axes # type: ignore
        ax.set(xlabel='Year', ylabel='Number of publications', title=subject)
        ax.plot(df_len.index, np.array(df_len[subject]['thesis']),'o',ls='--', label='Thesis')
        # The label reports the cut actually applied (it used to be a fixed
        # ">15 citations" regardless of `cut`).
        ax.plot(df_len.index, np.array(df_len[subject]['others']),'o',ls='--', label=f'Publication (>= {cut} citations)')
        ax.set_xlim(1950,None)
        # ax.legend(loc='upper left')
        # ax.legend(loc='lower right')
        # ax.set_yscale('log')
        fig.savefig(f'/tmp/inspire/{cut}/{subject}.png', bbox_inches='tight')
        # Close this figure explicitly so figures do not pile up in the loop.
        plt.close(fig)

100%|██████████| 17/17 [00:00<00:00, 23.80it/s]
100%|██████████| 17/17 [00:02<00:00,  5.91it/s]
100%|██████████| 17/17 [00:01<00:00, 10.94it/s]
100%|██████████| 17/17 [00:02<00:00,  5.97it/s]
100%|██████████| 17/17 [00:00<00:00, 88.84it/s]
100%|██████████| 17/17 [00:02<00:00,  6.01it/s]
100%|██████████| 17/17 [00:00<00:00, 168.85it/s]
100%|██████████| 17/17 [00:02<00:00,  6.44it/s]
100%|██████████| 17/17 [00:00<00:00, 347.07it/s]
100%|██████████| 17/17 [00:02<00:00,  6.92it/s]


In [12]:
# Per subject: 2-D histogram of the non-thesis records, year on x and citation
# count on y, log colour scale, saved to /tmp/inspire/<subject>_citation.png.

os.makedirs('/tmp/inspire', exist_ok=True)

# One bin per calendar year from 1950 through y1. The final edge is y1 + 1:
# the last histogram bin is closed on the right, so ending the edges at the
# last year itself would merge the final two years into one bin.
year_edges = np.arange(1950, y1 + 2)

for subject, (_, df_others) in zip(tqdm(subjects), df_arr):
    if df_others.empty:
        # np.percentile of an empty array is NaN and cannot be binned.
        tqdm.write(f'{subject}: no non-thesis records, skipping citation histogram')
        continue

    years = np.array([x.year for x in df_others.earliest_date])
    citations = np.array(df_others.citation_count)

    # Citation axis: about 100 bins from 0 up to the 99th percentile.
    high = int(np.percentile(citations, 99))+1
    # Integer division gives 0 when high < 100, which np.arange rejects.
    step = max(1, high // 100)
    # Extend by one step so the edges reach `high` and there are always at
    # least two of them.
    citation_edges = np.arange(0, high + step, step)

    fig,ax = plt.subplots()
    ax:plt.Axes # type: ignore
    # A fresh LogNorm per figure: a norm instance keeps the limits it was
    # autoscaled to, so sharing one would apply the first subject's colour
    # range to every later plot.
    img = ax.hist2d(years, citations, bins=(year_edges, citation_edges), norm=LogNorm())
    ax.set(xlabel='Year', ylabel='Citations', title=subject)
    fig.colorbar(img[-1], ax=ax)
    fig.savefig(f'/tmp/inspire/{subject}_citation.png', bbox_inches='tight')
    plt.close(fig)


100%|██████████| 17/17 [00:06<00:00,  2.62it/s]


In [13]:
# Per subject: number of non-thesis records with at least c citations,
# divided by the number of theses, as a function of the cutoff c; saved to
# /tmp/inspire/<subject>_ratio.png.

os.makedirs('/tmp/inspire', exist_ok=True)

for subject, (df_thesis, df_others) in zip(tqdm(subjects), df_arr):
    n_thesis = len(df_thesis)
    if n_thesis == 0:
        # The ratio is undefined without theses; dividing would only produce
        # an all-inf curve.
        tqdm.write(f'{subject}: no thesis records, skipping ratio plot')
        continue

    citations = np.array(df_others.citation_count)
    # c: distinct citation counts (ascending); n: records having exactly c.
    c,n = np.unique(citations, return_counts=True)
    # Reverse cumulative sum: n[k] becomes the number of records with >= c[k].
    n = np.cumsum(n[::-1])[::-1]
    n = n/n_thesis

    fig,ax = plt.subplots()
    ax:plt.Axes # type: ignore
    ax.step(c,n)
    ax.set(xlabel='Citation Cutoff', ylabel='Ratio to Thesis', yscale='log', xscale='log', title=subject)
    fig.savefig(f'/tmp/inspire/{subject}_ratio.png', bbox_inches='tight')
    plt.close(fig)


100%|██████████| 17/17 [00:05<00:00,  2.90it/s]
