In [1]:
import pandas as pd

raw_data = pd.read_json("../analyze/pub_refs_union.jsonl", lines=True)

In [2]:
def flatten(df: pd.DataFrame):
    retval = pd.DataFrame(list(df["metadata"]))
    retval = retval[
        [
            "title",
            "normalized_venue",
            "year",
            "citation_count",
            "authors",
            "type",
            "open_access",
            "concepts",
        ]
    ].join(df[["PostId", "AddedAt"]])
    retval["year"] = retval["year"].astype("Int64")
    retval["AddedAt"] = pd.to_datetime(retval["AddedAt"])
    retval["AddedAt"] = retval["AddedAt"].dt.year
    return retval


def filter_concepts(row):
    concepts = row["concepts"]
    try:
        concepts.sort(key=lambda x: x["score"], reverse=True)
    except Exception:
        row["concepts"] = []
        return row
    filtered_concepts = []
    for concept in row["concepts"]:
        if concept["level"] > 1:
            continue
        if "computer interaction" in concept["display_name"]:
            concept["display_name"] = "human computer interaction"
        elif "time computing" in concept["display_name"]:
            concept["display_name"] = "real time computing"
        elif "Computer graphics" in concept["display_name"]:
            concept["display_name"] = "computer graphics images"
        elif "risk analysis" in concept["display_name"]:
            concept["display_name"] = "risk analysis"
        filtered_concepts.append(
            {
                "name": concept["display_name"].lower(),
                "score": concept["score"],
                "level": concept["level"],
            }
        )
    row["concepts"] = filtered_concepts
    return row


def get_l1_field_of_research(row: pd.Series):
    for con in row["concepts"]:
        if con["score"] <= 0.1:
            continue
        if con["level"] == 1:
            row["l1_field"] = con["name"]
            return row
    row["l1_field"] = None
    return row


cs_l1_fields = [
    "geometry",
    "algorithm",
    "multimedia",
    "database",
    "internet privacy",
    "natural language processing",
    "data science",
    "embedded system",
    "artificial intelligence",
    "real time computing",
    "statistics",
    "distributed computing",
    "computer hardware",
    "software engineering",
    "arithmetic",
    "mathematical economics",
    "mathematical optimization",
    "theoretical computer science",
    "computer architecture",
    "computer graphics images",
    "library science",
    "simulation",
    "telecommunications",
    "operating system",
    "discrete mathematics",
    "world wide web",
    "mathematics education",
    "parallel computing",
    "computational science",
    "information retrieval",
    "computer security",
    "knowledge management",
    "pure mathematics",
    "computer vision",
    "data mining",
    "econometrics",
    "speech recognition",
    "operations research",
    "mathematical physics",
    "computer engineering",
    "programming language",
    "human computer interaction",
    "computer network",
    "machine learning",
    "applied mathematics",
    "combinatorics",
    "mathematical analysis",
]

math_l1_fields = [
    "geometry",
    "algorithm",
    "statistics",
    "arithmetic",
    "mathematical economics",
    "mathematical optimization",
    "discrete mathematics",
    "mathematics education",
    "computational science",
    "pure mathematics",
    "econometrics",
    "operations research",
    "mathematical physics",
    "applied mathematics",
    "combinatorics",
    "mathematical analysis",
]


def get_field_of_research(row: pd.Series):
    top_l0 = None
    top_l1 = None
    for con in row["concepts"]:
        if con["score"] <= 0.1:
            continue
        if con["level"] == 1:
            if con["name"] in cs_l1_fields:
                if top_l1 is None:
                    top_l1 = con["name"]
                # elif top_l1 == "artificial intelligence" and con["name"] in {"computer vision", "machine learning", "natural language processing"} and con["score"] >= 0.5:
                #     top_l1 = con["name"]
            elif con["name"] in math_l1_fields:
                if top_l1 is None:
                    top_l1 = con["name"]
        elif con["level"] == 0:
            top_l0 = con["name"]
    row["field"] = top_l1 or top_l0
    return row


def process_concepts(df: pd.DataFrame):
    retval = df.apply(filter_concepts, axis=1)
    retval = retval.apply(get_l1_field_of_research, axis=1)
    retval = retval.apply(get_field_of_research, axis=1)
    return retval


def group_by_title(df: pd.DataFrame):
    retval = (
        df.groupby("title")
        .agg(
            {
                "normalized_venue": lambda x: set(x),
                "year": lambda x: set(x),
                "citation_count": "max",
                "open_access": "any",
                "PostId": lambda x: set(x),
                "field": lambda x: set(x),
                "AddedAt": list,
            }
        )
        .reset_index()
    )
    if any(retval["normalized_venue"].apply(len) > 1):
        print("Warning: multiple venues for the same title")
    # else:
        retval["normalized_venue"] = retval["normalized_venue"].apply(
            lambda x: list(x)[0]
        )
    if any(retval["year"].apply(len) > 1):
        print("Warning: multiple years for the same title")
    # else:
        retval["year"] = retval["year"].apply(lambda x: list(x)[0])
    if any(retval["field"].apply(len) > 1):
        print("Warning: multiple fields for the same title")
    # else:
        retval["field"] = retval["field"].apply(lambda x: list(x)[0])
    return retval


pd.set_option("display.max_rows", 10)
flattened_data = process_concepts(flatten(raw_data))
grouped_data = group_by_title(flattened_data)
grouped_data



Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId,field,AddedAt
0,Neural Network Methods for Natural Language P...,Computational Linguistics,2017,549.0,True,{48445189},artificial intelligence,[2018]
1,"""Low-Resource"" Text Classification: A Paramete...",,2023,52.0,True,{76724158},artificial intelligence,[2023]
2,"#ifdef Considered Harmful, or Portability Expe...",Usenix summer,1992,246.0,False,"{489434, 8012461, 21097319, 9294487}",computer security,"[2022, 2012, 2018, 2014]"
3,$2.00 Gas! Studying the Effects of a Gas Tax M...,Journal of Public Economics,2008,162.0,True,{75879031},economics,[2023]
4,"$\mu $ RNG: A 300–950 mV, 323 Gbps/W All-Digit...",,2016,83.0,False,{61554191},algorithm,[2020]
...,...,...,...,...,...,...,...,...
13282,“One Against One” or “One Against All”: Which ...,,2006,272.0,False,"{16728386, 16252899, 16793390}",artificial intelligence,"[2013, 2013, 2013]"
13283,"“Search, Show Context, Expand on Demand”: Supp...",,2009,257.0,False,{21893401},information retrieval,[2014]
13284,“Should This Loan be Approved or Denied?”: A L...,,2018,33.0,True,{64403934},econometrics,[2020]
13285,“We Are Here to Help”Who Opens the Gate for Su...,,2018,10.0,False,"{5017208, 58591979, 14537012, 5151693}",political science,"[2013, 2019, 2011, 2011]"


In [58]:
grouped_data.sort_values("citation_count", ascending=False).head(10)

Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId,field,AddedAt
3846,Deep Residual Learning for Image Recognition,Computer Vision and Pattern Recognition,2016,177579.0,True,"{62024327, 65604874, 46904460, 68258188, 61841...",artificial intelligence,"[2019, 2017, 2017, 2020, 2020, 2017, 2021, 202..."
1393,Adam: A Method for Stochastic Optimization,International Conference on Learning Represent...,2014,142334.0,True,"{60872321, 41450882, 49977348, 52474500, 60667...",mathematical optimization,"[2018, 2020, 2019, 2022, 2018, 2018, 2018, 201..."
6464,ImageNet classification with deep convolutiona...,Neural Information Processing Systems,2017,113943.0,True,"{61167744, 65405953, 50193801, 40060949, 41841...",machine learning,"[2015, 2014, 2016, 2015, 2017, 2015, 2020, 202..."
2196,Attention is All you Need,Neural Information Processing Systems,2017,110977.0,True,"{62575360, 59377538, 62177796, 56930821, 63178...",artificial intelligence,"[2022, 2020, 2021, 2021, 2022, 2023, 2019, 202..."
12798,Very Deep Convolutional Networks for Large-Sca...,International Conference on Learning Represent...,2014,94664.0,True,"{54997632, 61321247, 60812966, 65612711, 51854...",artificial intelligence,"[2020, 2016, 2017, 2016, 2017, 2021, 2024, 202..."
9838,Random Forests,Machine-mediated learning,2017,91631.0,True,"{70360996, 11501381, 65530093, 24663120, 58937...",environmental science,"[2020, 2019, 2022, 2021, 2021]"
12044,The moderator-mediator variable distinction in...,,1986,87911.0,True,{1219480},psychology,[2009]
2401,Basic local alignment search tool.,,1990,87795.0,True,{1432570},data mining,[2009]
11186,Statistical Power Analysis for the Behavioral ...,,1989,87715.0,False,{74005392},statistics,[2022]
2355,BERT: Pre-training of Deep Bidirectional Trans...,North American Chapter of the Association for ...,2018,84697.0,True,"{64588418, 78201090, 71965576, 56129165, 56828...",artificial intelligence,"[2018, 2023, 2020, 2021, 2020, 2020, 2022, 201..."


In [None]:
# grouped_data = grouped_data.drop([2401, 11186, 12044])

In [60]:
import plotly.express as px

correlation_data = grouped_data[["citation_count", "PostId"]]
correlation_data["num_references"] = correlation_data["PostId"].apply(len)
fig = px.scatter(
    correlation_data,
    x="num_references",
    y="citation_count",
    title="Correlation between Citation Count and Number of References",
    labels={
        "num_references": "Number of References",
        "citation_count": "Citation Count",
    },
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [61]:
import pandas as pd
import scipy.stats as stats

# Calculate Pearson correlation
pearson_corr, pearson_p = stats.pearsonr(
    correlation_data["citation_count"], correlation_data["num_references"]
)

# Calculate Spearman correlation
spearman_corr, spearman_p = stats.spearmanr(
    correlation_data["citation_count"], correlation_data["num_references"]
)

print(f"Pearson correlation: {pearson_corr:.3f} (p-value: {pearson_p:.3e})")
print(f"Spearman correlation: {spearman_corr:.3f} (p-value: {spearman_p:.3e})")

# You can also use pandas built-in correlation method
pandas_corr = correlation_data["citation_count"].corr(
    correlation_data["num_references"], method="spearman"
)
print(f"\nPandas Spearman correlation: {pandas_corr:.3f}")

# For a more complete statistical summary:
print("\nDescriptive statistics:")
print(correlation_data[["citation_count", "num_references"]].describe())

Pearson correlation: 0.493 (p-value: 0.000e+00)
Spearman correlation: 0.232 (p-value: 1.728e-162)

Pandas Spearman correlation: 0.232

Descriptive statistics:
       citation_count  num_references
count    13282.000000    13282.000000
mean       841.712167        1.434573
std       4249.616866        2.206589
min          0.000000        1.000000
25%         19.000000        1.000000
50%         82.000000        1.000000
75%        378.750000        1.000000
max     177579.000000       67.000000


In [62]:
df = correlation_data

In [133]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

bin_order = ["{1}", "(1-2]", "(2-4]", "(4-8]", "(8-inf)"]


# Create bins for number of references
def create_ref_bins(
    df, bin_edges=[0, 1, 2, 4, 8, float("inf")]
):  # Modified bin edges to match labels
    labels = bin_order  # One fewer than bin edges
    df["ref_bin"] = pd.cut(df["num_references"], bins=bin_edges, labels=labels)
    return df


# Apply binning
df_binned = create_ref_bins(df)

# Create box plot using the binned references
fig = go.Figure()

# Create box plot for each bin in the specified order
for bin_name in bin_order:
    citations = df_binned[df_binned["ref_bin"] == bin_name]["citation_count"]

    if not citations.empty:
        # Count number of papers in this bin
        count = len(citations)

        fig.add_trace(
            go.Box(
                y=citations,
                name=f"<span style='font-size:0.9em'>{bin_name}</span><br><i style='font-size:0.75em'>n={count}</i>",
                boxpoints="outliers",
                marker_color="blue",
                line_color="blue",
                marker=dict(opacity=0.5, size=4),
            )
        )

# Update layout
fig.update_layout(
    yaxis_type="log",
    yaxis=dict(tickfont=dict(size=10)),
    showlegend=False,
    width=240,
    height=320,
    template="plotly_white",
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.write_image("correlation.svg")
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [57]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd

# Create log10-based bins for citation counts
df["log10_citations"] = np.log10(df["citation_count"] + 1)  # Add 1 to handle zeros

# Create bins based on orders of magnitude
bin_edges = [
    0,
    1,
    2,
    3,
    4,
    5,
]  # This will create bins like 0-10, 10-100, 100-1000, etc.
labels = ["0-10", "10-100", "100-1K", "1K-10K", "10K+"]

# Bin the data
df["citation_bin"] = pd.cut(df["log10_citations"], bins=bin_edges, labels=labels)

# Create box plot
fig = go.Figure()

# Define bin order
bin_order = ["0-10", "10-100", "100-1K", "1K-10K", "10K+"]

# Add box plot for each bin in order
for bin_name in bin_order:
    references = df[df["citation_bin"] == bin_name]["num_references"]

    if not references.empty:
        fig.add_trace(
            go.Box(
                y=references,
                name=bin_name,
                boxpoints="outliers",
                marker_color="blue",
                line_color="blue",
                marker=dict(opacity=0.5, size=4),
            )
        )

fig.update_layout(
    title="Number of References Distribution by Citation Count",
    xaxis_title="Citation Count (orders of magnitude)",
    yaxis_title="Number of References",
    width=800,
    height=600,
    template="plotly_white",
)

# Add correlation information
correlation_text = (
    f'Pearson correlation: {df["citation_count"].corr(df["num_references"], method="pearson"):.3f}<br>'
    f'Spearman correlation: {df["citation_count"].corr(df["num_references"], method="spearman"):.3f}'
)

fig.add_annotation(
    text=correlation_text,
    xref="paper",
    yref="paper",
    x=0.5,
    y=1.1,
    showarrow=False,
    font=dict(size=12),
)

# Show distribution in bins
print("\nDistribution of papers across citation bins:")
print(df["citation_bin"].value_counts().sort_index())

fig.show()


Distribution of papers across citation bins:
citation_bin
0-10      1812
10-100    4969
100-1K    4373
1K-10K    1557
10K+       197
Name: count, dtype: int64




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

