# Exploration: Simple Corpus Statistics

Andrew Barros  
DS5001  
02 May 2024  


In [1]:
#!/usr/bin/env python3
import pandas as pd
import sqlite3
import plotly.graph_objects as go
from pathlib import Path

# Basic Corpus Statistics
Load some basic statistics about the articles including the metadata, document lengths, and number of unique terms. 

In [2]:
# All inputs are resolved relative to one data directory, so the notebook can
# be pointed at a different location by editing this single line.
DATA_DIR = Path("../data")

# Pull the corpus-level counts out of the SQLite database. The try/finally
# guarantees the connection is released even if one of the queries raises.
db = sqlite3.connect(DATA_DIR / "corpus_f4.sqlite")
try:
    # Single-cell frames holding the row counts of the tokens and terms tables.
    TOKENS = pd.read_sql("SELECT COUNT(*) FROM tokens", db)
    TERMS = pd.read_sql("SELECT COUNT(*) FROM terms", db)
    # One row per title; the count column keeps SQLite's default name
    # "COUNT(*)", which the article-length histogram reads.
    ARTICLE_LENGTHS = pd.read_sql("SELECT COUNT(*) FROM tokens GROUP BY title", db)
finally:
    db.close()

# Article metadata joined with citation counts on the shared "pmcid" key.
# merge() defaults to an inner join, so articles without a citation row
# (and vice versa) are dropped.
ARTICLES = pd.read_parquet(DATA_DIR / "all_articles.parquet")
CITATIONS = pd.read_parquet(DATA_DIR / "citation_count.parquet")
ARTICLES = ARTICLES.merge(CITATIONS, on="pmcid")

In [3]:
# Headline corpus figures: gather the values first, then report them.
n_articles = len(ARTICLES)
n_tokens = TOKENS.iloc[0, 0]  # the COUNT(*) frame has a single cell
n_terms = TERMS.iloc[0, 0]
pub_years = ARTICLES['pubyear']
first_year = pub_years.min()
last_year = pub_years.max()

print("Total number of articles:", n_articles)
print("Total number of tokens:", n_tokens)
print("Total number of terms:", n_terms)
print(f"Years covered: {first_year} - {last_year}")

Total number of articles: 1215
Total number of tokens: 4778274
Total number of terms: 85538
Years covered: 2010 - 2024


Interpretation:
* We have a moderately sized corpus (~5M words) from 1.2k articles over a 13.5-year span

# General Trends

Create some basic figures for articles by year, distribution of article lengths, and distribution of citation counts

In [4]:
# Count how many articles fall into each publication year.
articles_per_year = ARTICLES.groupby("pubyear").size()

# Line-and-marker trace: publication year on x, article count on y.
yearly_trace = go.Scatter(
    x=articles_per_year.index,
    y=articles_per_year.values,
    mode="lines+markers",
)

yearly_layout = dict(
    width=800,
    height=800,
    template="simple_white",
    title="Number of articles per year",
    xaxis_title="Year",
    yaxis_title="Number of articles",
)

fig = go.Figure(data=yearly_trace)
fig.update_layout(**yearly_layout)

# Render inline, then save a static PNG to the current working directory.
fig.show()
fig.write_image("articles_by_year.png")

Interpretation:
* Article volume has increased over time
* There are very few articles in 2010 (the first year of PMC submission for our two journals) and 2024 (year currently in progress and academic publishing is _slow_)
* The largest number of open-access (OA) articles was published in 2020. This could reflect a COVID-related increase in all published articles, or perhaps just a larger share of OA articles

In [5]:
# Per-article token counts (one value per row of ARTICLE_LENGTHS).
article_token_counts = ARTICLE_LENGTHS["COUNT(*)"]
mean_length = article_token_counts.mean()

length_hist = go.Histogram(x=article_token_counts, nbinsx=100)
fig = go.Figure(data=length_hist)

# Mark the mean length with a dashed red vertical line and a rounded label.
fig.add_vline(
    x=mean_length,
    line_color="red",
    line_width=4,
    line_dash="dash",
    annotation_text=f"Mean: {mean_length:.0f}",
)

length_layout = dict(
    width=800,
    height=400,
    template="simple_white",
    title="Length of articles",
    xaxis_title="Word Count",
    yaxis_title="Number of articles",
)
fig.update_layout(**length_layout)

# Render inline, then save a static PNG to the current working directory.
fig.show()
fig.write_image("article_lengths.png")

Interpretation:
* There are three peaks in article lengths:
    * Very short (less than 100 words). These are probably corrections
    * Brief communications (around 2500 words). These are a combination of short research communications (brief reports, research letters, etc.) and letters to the editor.
    * Original research (full-length articles, average length ~5k words). 

In [6]:
# Histogram of citation counts
fig = go.Figure(
    data=go.Histogram(
        x=ARTICLES["citation_count"],
        nbinsx=40
    )
)

fig.update_yaxes(type="log")

fig.update_layout(
    width=800,
    height=400,
    template="simple_white",
    title="Number of citations per article",
    xaxis_title="Citation Count",
    yaxis_title="Number of articles"
)

fig.show()
fig.write_image("citation_counts.png")

Interpretation:
* Most articles aren't cited
* The rest of the distribution seems to follow a power law
* There are a few highly cited articles