In [2]:
import csv
import json
import os
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

### Configuration parameters

In [3]:
# -----------------------------
# Configuration and Input Paths
# -----------------------------

metadata_path = "metadata_dataset.csv"  
# Path to the metadata CSV file produced during the graph-generation stage.
# The data-load cell reads four comma-separated columns per row, in this order:
# work id, publication year, number of authors, ';'-separated list of topics.

analized_country = "Italy"
# Country under analysis. In the cells of this notebook the value is only used
# in plot titles; no record is filtered by it, so the metadata file is
# presumably already restricted to this country (TODO confirm).
# (The "analized" spelling is kept because other cells reference this name.)

start_year = 1990  # inclusive
end_year = 2024    # exclusive
# Time window for the analysis. Only works published within [start_year, end_year) are used.

max_topics = 13
# Maximum number of topics kept per year (the most frequent ones) when building
# the stacked-bar charts of topic distributions.

plots_dpi = 700
# Resolution (DPI) passed to savefig for the generated PDF figures.

# ---------------------------------------
# Output Filenames for Datasets and Plots
# ---------------------------------------

works_per_year_plot_filename = "works_per_year.pdf"
# Output filename for the plot showing the number of works per year.

works_per_year_dataset = "works_per_year.csv"
# Output filename for the CSV dataset containing yearly work counts.

application_domain_plot_filename = "application_domains_over_time.pdf"
# Output filename for the plot tracking application-domain trends over time.

cs_topics_over_time_plot_filename = "cs_topics_over_time.pdf"
# Output filename for the plot tracking Computer Science topic trends over time.

ccdf_path = "./ccdfs"
# Directory where the per-year CCDF (Complementary Cumulative Distribution
# Function) CSV files (ccdf_<year>.csv) are written.
# This feature is still a work in progress.

# ----------------------------------------------------
# External Mappings (Required for Topic Normalization)
# ----------------------------------------------------
# These mappings must be provided manually if analyzing fields outside Computer Science.

from topic_to_category import topic_to_category
# Dictionary mapping fine-grained CS topics to broader topic categories.
# Topics mapped to the category "Others" are dropped from the CS-subfields chart.

from mappings import topics_mapping, application_domains_mapping
# topics_mapping: its keys are the topics removed when isolating application domains.
# application_domains_mapping: maps specific application-domain labels to a unified
# label; its keys are also the topics removed when isolating CS subfields.


### Data load

In [None]:
# Group the works by publication year.
# Years are kept as strings because the downstream cells look records up with
# str(year) and test the keys with str.isdigit().
data = {}
skipped_rows = 0

# newline='' is the mode recommended by the csv module; csv.reader (unlike a
# plain split(',')) copes with quoted fields that contain commas.
with open(metadata_path, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; tolerate a completely empty file

    for parts in reader:
        # Expected columns: work id, year, number of authors, ';'-separated topics.
        # Blank or truncated rows are counted and skipped instead of raising IndexError.
        if len(parts) < 4:
            skipped_rows += 1
            continue

        year = parts[1].strip()

        # Drop empty topic names so that a missing topic list does not show up
        # as a topic called '' in the distributions.
        topics = [t.strip() for t in parts[3].split(';') if t.strip()]

        data.setdefault(year, []).append(
            {"id": parts[0].strip(), "author_count": parts[2].strip(), "topics": topics}
        )

if skipped_rows:
    print(f"Skipped {skipped_rows} blank or malformed rows in {metadata_path}")

# Same content as `data`, with the year keys in ascending (string) order.
data_sorted = dict(sorted(data.items()))

### Papers by year

In [None]:
# Keep only purely numeric year keys that fall inside [start_year, end_year).
selected_years = [
    year for year in data_sorted
    if year.isdigit() and start_year <= int(year) < end_year
]
x = [int(year) for year in selected_years]            # years on the x axis
y = [len(data_sorted[year]) for year in selected_years]  # number of works per year

print(f"Computed {len(y)} time intervals")

fig = plt.figure(figsize=(13, 4))

plt.plot(x, y, marker='o')

plt.xlabel('Year')
plt.ylabel('Number of Works')
plt.grid(True)
plt.xticks(x, rotation=60)

# Set the title before tight_layout() so the layout pass accounts for it.
plt.title(f"Number of published works over year: {analized_country}", fontweight='bold', fontsize=15)
plt.tight_layout()
plt.savefig(works_per_year_plot_filename, bbox_inches='tight', dpi=plots_dpi)

# Export the same series as a CSV dataset.
# This must go to works_per_year_dataset: writing to the plot filename would
# overwrite the PDF that was just saved.
index = list(range(len(x)))
df = pd.DataFrame({'Year': x, 'Papers': y}, index=index)
df.to_csv(works_per_year_dataset, index=True)

### Topic distribution by year

In [None]:
def get_topics_by_year(data, year):
    """Count how often each topic occurs among the works of one year.

    Parameters
    ----------
    data : dict
        Maps a year key to a list of work records; each record is a dict
        with a 'topics' entry holding an iterable of topic names.
    year : hashable
        Key to look up in ``data`` (the callers pass the year as a string).

    Returns
    -------
    collections.Counter
        Topic name -> number of occurrences, with the 'Computer science'
        entry removed. Empty when ``year`` has no records in ``data``.
    """
    # .get() instead of data[year]: a year of the analysis window without any
    # work must yield an empty distribution, not a KeyError.
    topic_counts = Counter(
        topic
        for work in data.get(year, [])
        for topic in work['topics']
    )

    # remove 'Computer science'
    topic_counts.pop('Computer science', None)

    return topic_counts

def normalize_topic_counts(topic_counts):
    """Convert absolute topic counts into percentages of their total.

    Parameters
    ----------
    topic_counts : dict
        Topic name -> numeric count.

    Returns
    -------
    dict
        Topic name -> percentage of the total, rounded to two decimals.
        When the total is zero (no topics, or only zero counts) every topic
        is mapped to 0.0 instead of raising ZeroDivisionError.
    """
    total = sum(topic_counts.values())
    if total == 0:
        return {topic: 0.0 for topic in topic_counts}

    normalized_counts = {topic: round((count / total) * 100, 2) for topic, count in topic_counts.items()}
    return normalized_counts

def get_application_domains(topics_by_year, top_n=20):
    """Return the most frequent topics as (topic, frequency) pairs.

    Parameters
    ----------
    topics_by_year : dict
        Topic name -> frequency.
    top_n : int, optional
        Maximum number of pairs to return (default 20, the value that was
        previously hard-coded).

    Returns
    -------
    list of tuple
        At most ``top_n`` (topic, frequency) pairs, ordered by decreasing
        frequency; ties keep the order they have in ``topics_by_year``.
    """
    # sort topics by frequency
    sorted_topics = sorted(topics_by_year.items(), key=lambda item: item[1], reverse=True)

    return sorted_topics[:top_n]

def filter(topics, criteria):
    """Return a copy of ``topics`` without the entries whose key is in ``criteria``.

    ``topics`` itself is left untouched; the result has the same type as
    ``topics.copy()``. Only the keys of ``criteria`` matter, its values are
    ignored.

    Note: this function shadows Python's built-in ``filter`` in the notebook.
    """
    remaining = topics.copy()
    for excluded in criteria.keys():
        # pop with a default is a no-op for keys that are not present
        remaining.pop(excluded, None)
    return remaining

def marco_filter(topics, criteria):
    """Keep only the topics that ``criteria`` maps to something other than "Others".

    Topics missing from ``criteria`` are dropped as well. The result is a new
    plain dict (topic -> value taken from ``topics``) in the iteration order
    of ``topics``.
    """
    return {
        name: topics[name]
        for name in topics.keys()
        if name in criteria and criteria[name] != "Others"
    }


def uniform_application_domain(topics, application_domains):
    """Merge topic frequencies under the unified labels of ``application_domains``.

    Every topic found in ``application_domains`` is renamed to the label it
    maps to; any other topic keeps its own name. Frequencies of topics that
    end up under the same label are summed. Returns a new dict
    (label -> summed frequency).
    """
    merged = {}

    for name, count in topics.items():
        if name in application_domains:
            label = application_domains[name]
        else:
            label = name

        merged[label] = merged.get(label, 0) + count

    return merged

### Application domains

In [None]:
# category -> array with one percentage per year of the analysis window
categories_over_time = {}

years = list(range(start_year, end_year))

for idx, year in enumerate(years):

    # here we get all topics by year
    topics = get_topics_by_year(data_sorted, str(year))
    # then we filter out CS topics (every key of topics_mapping)
    # to focus on the application domains
    filtered_topics = filter(topics, topics_mapping)
    # we need to uniform the application domains
    # otherwise we will have specific subtopics (e.g., Medicine, Internal medicine, etc.)
    uniformed_topics = uniform_application_domain(filtered_topics, application_domains_mapping)
    # then we normalize the counts to get percentages
    normalized_topics = normalize_topic_counts(uniformed_topics)
    # we sort the topics by percentage
    # and we get only the most frequent ones
    sorted_topics = dict(sorted(normalized_topics.items(), key=lambda item: item[1], reverse=True))

    for topic, percentage in list(sorted_topics.items())[:max_topics]:
        if topic not in categories_over_time:
            # years in which the topic is not among the top ones stay at 0
            categories_over_time[topic] = np.zeros(len(years), dtype=float)

        categories_over_time[topic][idx] = percentage


# Stacked bar chart: one bar per year, one segment per category.
fig = plt.figure(figsize=(10, 6))
bottom = np.zeros(len(years))

width = 0.75

cmap = plt.get_cmap("tab20")
colors = cmap.colors

# Optional hatch patterns (currently not applied to the bars).
hatches = ['/', '\\', '|', '-', '+', 'x', 'o', '\\|', '.', '*']

for i, (category, percentage) in enumerate(categories_over_time.items()):
    # Up to max_topics categories are collected *per year*, so their union over
    # all years can exceed the 20 colours of tab20: cycle through the palette
    # instead of failing with IndexError.
    bar_color = colors[i % len(colors)]
    plt.bar(years, percentage, width, bottom=bottom, label=category, color=bar_color, edgecolor='white', linewidth=2)
    bottom += np.array(percentage)

plt.xlim(start_year-1, end_year)

# ylabel
plt.ylabel('Percentage of Works (%)', fontweight='bold', fontsize=13)
plt.title(f"Application Domains Over Time: {analized_country}", fontweight='bold', fontsize=15)
plt.legend(loc='upper right', bbox_to_anchor=(1.3,1))
plt.savefig(application_domain_plot_filename, bbox_inches='tight', dpi=plots_dpi)

### CS subfields

In [None]:
# CS category -> array with one percentage per year of the analysis window
cs_topics_over_time = {}

years = list(range(start_year, end_year))

for idx, year in enumerate(years):
    # here we get all topics by year
    topics = get_topics_by_year(data_sorted, str(year))
    # then we filter out app domains topics (every key of application_domains_mapping)
    # to focus on CS-related subfields
    filtered_topics = filter(topics, application_domains_mapping)
    # drop topics that are unknown to topic_to_category or mapped to "Others"
    marco_filtered_topics = marco_filter(filtered_topics, topic_to_category)
    # we need to uniform the subtopics
    uniformed_topics = uniform_application_domain(marco_filtered_topics, topic_to_category)
    # then we normalize the counts to get percentages
    normalized_topics = normalize_topic_counts(uniformed_topics)
    # we sort the topics by percentage
    # and we get only the most frequent ones
    sorted_topics = dict(sorted(normalized_topics.items(), key=lambda item: item[1], reverse=True))

    for topic, percentage in list(sorted_topics.items())[:max_topics]:
        if topic not in cs_topics_over_time:
            # years in which the topic is not among the top ones stay at 0
            cs_topics_over_time[topic] = np.zeros(len(years), dtype=float)

        cs_topics_over_time[topic][idx] = percentage

# transform cs_topic_over_time to df
cs_df = pd.DataFrame(cs_topics_over_time, index=years)

# sum the values over the rows
cs_df['Total'] = cs_df.sum(axis=1)

# create a new column 'Other' that is 100 - Total
# (clipped at 0: the percentages are rounded to two decimals, so their sum can
# end up marginally above 100 and would otherwise produce a negative bar)
cs_df['Other'] = (100 - cs_df['Total']).clip(lower=0)


cs_topics_over_time['Other'] = cs_df['Other'].values

# Stacked bar chart: one bar per year, one segment per CS category.
fig = plt.figure(figsize=(10, 6))
bottom = np.zeros(len(years))

width = 0.75

cmap = plt.get_cmap("tab20")
colors = cmap.colors

# Optional hatch patterns (currently not applied to the bars).
hatches = ['/', '\\', '|', '-', '+', 'x', 'o', '\\|', '.', '*']

for i, (topic, percentage) in enumerate(cs_topics_over_time.items()):
    if topic == "Other":
        bar_color = 'lightgrey'
    else:
        # Cycle through the palette: more than 20 categories would otherwise
        # raise IndexError on the 20-colour tab20 map.
        bar_color = colors[i % len(colors)]
    plt.bar(years, percentage, width, bottom=bottom, label=topic, color=bar_color, edgecolor='white', linewidth=2)
    bottom += np.array(percentage)


plt.xlim(start_year-1, end_year)

# ylabel
plt.title(f"Disciplines Over Time: {analized_country}", fontweight='bold', fontsize=15)
plt.ylabel('Percentage of Works (%)', fontweight='bold', fontsize=13)
plt.legend(loc='upper right', bbox_to_anchor=(1.35,1.), ncol=1)
plt.savefig(cs_topics_over_time_plot_filename, bbox_inches='tight', dpi=plots_dpi)

### CCDF (WIP)

In [None]:
def eval_ccdf(graph):
    """Compute the (unnormalized) complementary cumulative degree distribution.

    Parameters
    ----------
    graph : object
        Anything whose ``degree()`` call yields ``(node, degree)`` pairs
        (the notebook passes a networkx graph).

    Returns
    -------
    deg : tuple
        The distinct degrees, in decreasing order.
    cs : numpy.ndarray
        ``cs[k]`` is the number of nodes whose degree is >= ``deg[k]``
        (cumulative sum of the per-degree node counts, highest degree first).
        Both are empty when the graph has no nodes.
    """
    degree_sequence = sorted((d for _, d in graph.degree()), reverse=True)  # degree sequence

    # zip(*...) cannot be unpacked into two names when there are no degrees.
    if not degree_sequence:
        return (), np.array([], dtype=int)

    # Counter keeps insertion order, so the keys stay sorted by decreasing degree.
    degree_count = Counter(degree_sequence)
    deg, cnt = zip(*degree_count.items())
    cs = np.cumsum(cnt)
    return deg, cs

years = range(start_year, end_year)

# Make sure the output directory exists before np.savetxt writes into it.
os.makedirs(ccdf_path, exist_ok=True)

for year in years:
    print(f"Processing CCDF for year {year}...")
    output_path = os.path.join(ccdf_path, f"ccdf_{year}.csv")

    # Results are cached on disk: a year that was already processed is not recomputed.
    if os.path.exists(output_path):
        print(f"CCDF for year {year} already exists. Skipping...")
        continue

    #load the network in a df
    # The file is read without a header row; the four columns are named here.
    net_path = f"../data/{year}.csv"
    net_df = pd.read_csv(net_path, names=['year', 'work_id', 'author_id1', 'author_id2'])

    # Each row becomes an edge between the two author ids.
    G = nx.from_pandas_edgelist(net_df, 'author_id1', 'author_id2')

    deg, cs = eval_ccdf(G)
    # One "deg,cs" row per distinct degree, preceded by a header line.
    np.savetxt(output_path, np.array([deg, cs]).T, fmt='%d', delimiter=",", comments='', header='deg,cs')

# NOTE: the former trailing np.savetxt(... 'ccdf_before.txt' ...) call was removed:
# it referenced the undefined names deg_before / cs_before (NameError on every run)
# and joined a file name onto output_path, which is itself a file, not a directory.

### Plotting CCDF

In [None]:
basepath = "../plot_data/ccdfs"
# NOTE(review): the CCDF computation cell writes its CSV files under `ccdf_path`,
# while this cell reads them from `basepath` — confirm the files are meant to be
# moved there, or point both cells to the same directory.

plot_years = list(range(start_year, end_year))

# Grid of subplots with 5 columns and as many rows as needed: one CCDF per year
# (7 x 5 for the default 1990-2023 window).
n_cols = 5
n_rows = max(1, -(-len(plot_years) // n_cols))  # ceiling division
last_row = (len(plot_years) - 1) // n_cols

fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 15))
axs = axs.flatten()

for i, year in enumerate(plot_years):
    # A dedicated name is used for the per-year file: reusing `ccdf_path` here
    # would overwrite the configuration variable and break a re-run of the
    # CCDF computation cell.
    year_csv = os.path.join(basepath, f"ccdf_{year}.csv")
    deg, cs = np.loadtxt(year_csv, delimiter=',', skiprows=1, unpack=True)

    axs[i].plot(deg, cs, marker='o', linestyle='None', color='darkturquoise', markersize=2)
    axs[i].set_xscale('log')
    axs[i].set_yscale('log')
    axs[i].set_title(f'{year}', fontweight='bold', fontsize=13)

    # x label only on the last row of the grid
    if i // n_cols == last_row:
        axs[i].set_xlabel('Degree', fontweight='bold', fontsize=13)

    # y label only on the first column of the grid
    if i % n_cols == 0:
        axs[i].set_ylabel('CCDF', fontweight='bold', fontsize=13)
    axs[i].grid(True)
    axs[i].set_xlim(left=1)
    axs[i].set_ylim(bottom=1)

# Remove the grid cells that have no year to show.
for unused_ax in axs[len(plot_years):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.savefig('./ccdfs_year_by_year.pdf', bbox_inches='tight', dpi=plots_dpi)