In [1]:
import pandas as pd
from pathlib import Path

# Directory holding one CSV per year (e.g. openalex/2020.csv).
data_dir = Path("openalex")

# year -> "count" summed over the rows whose "geo" is the United States.
us_sums = {}

for year in range(2020, 2025):
    file_path = data_dir / f"{year}.csv"
    df = pd.read_csv(file_path)

    # Row mask for the US, then sum the matching counts.
    is_us = df["geo"].eq("United States (US)")
    us_sum = df.loc[is_us, "count"].sum()
    us_sums[year] = us_sum

# One row per year, ordered by year, for a readable printout.
df_us_sums = pd.DataFrame(
    {"year": list(us_sums.keys()), "us_count_sum": list(us_sums.values())}
).sort_values("year")

# Grand total over the whole 2020-2024 range.
total_2020_2024 = df_us_sums["us_count_sum"].sum()

print(df_us_sums)
print("\nTOTAL 2020–2024:", total_2020_2024)

   year  us_count_sum
0  2020       2781693
1  2021       2687743
2  2022       2606712
3  2023       2634555
4  2024       2010773

TOTAL 2020–2024: 12721476


In [2]:
import pandas as pd
from pathlib import Path

# Directory holding one CSV per year (e.g. openalex/2020.csv).
data_dir = Path("openalex")

# year -> "count" summed over EVERY row of that year's file.  No geo filter is
# applied in this cell, so these are all-geography ("world") totals.  The
# variables and the column label are named accordingly, which also keeps this
# cell from overwriting the US-only results (`us_sums`, `df_us_sums`,
# `total_2020_2024`) with world numbers.
world_sums = {}

for year in range(2020, 2025):
    file_path = data_dir / f"{year}.csv"
    df = pd.read_csv(file_path)
    world_sums[year] = df["count"].sum()

# One row per year, ordered by year.
df_world_sums = pd.DataFrame(
    [{"year": y, "world_count_sum": v} for y, v in world_sums.items()]
).sort_values("year")

# Grand total over the whole 2020-2024 range.
world_total_2020_2024 = df_world_sums["world_count_sum"].sum()

print(df_world_sums)
print("\nTOTAL 2020–2024:", world_total_2020_2024)

   year  us_count_sum
0  2020      15799563
1  2021      16624113
2  2022      16495474
3  2023      16975827
4  2024      13624064

TOTAL 2020–2024: 79519041


In [3]:
import pandas as pd
from pathlib import Path

# NOTE: `data_dir` is not defined in this cell; it comes from an earlier cell.

# Per-year accumulators, keyed by year.
us_sums = {}      # geo == United States, all domains
us_ai_sums = {}   # geo == United States, domain == Artificial Intelligence

for year in range(2020, 2025):
    file_path = data_dir / f"{year}.csv"
    df = pd.read_csv(file_path)

    # Build each row mask once and reuse it for both sums.
    is_us = df["geo"].eq("United States (US)")
    is_ai = df["domain"].eq("Artificial Intelligence")

    # 1. US only
    us_sum = df.loc[is_us, "count"].sum()
    us_sums[year] = us_sum

    # 2. US + Artificial Intelligence
    us_ai_sum = df.loc[is_us & is_ai, "count"].sum()
    us_ai_sums[year] = us_ai_sum


# Tabulate: one row per year, sorted by year.
df_us = pd.DataFrame(
    {"year": list(us_sums.keys()), "us_count_sum": list(us_sums.values())}
).sort_values("year")

df_us_ai = pd.DataFrame(
    {
        "year": list(us_ai_sums.keys()),
        "us_ai_count_sum": list(us_ai_sums.values()),
    }
).sort_values("year")


# Year-over-year change in percent.  The first year has no predecessor, so its
# value is NaN.
us_yoy = df_us["us_count_sum"].pct_change()
df_us["us_pct_change"] = us_yoy * 100

us_ai_yoy = df_us_ai["us_ai_count_sum"].pct_change()
df_us_ai["us_ai_pct_change"] = us_ai_yoy * 100


# Totals over the whole 2020-2024 range.
total_us_2020_2024 = df_us["us_count_sum"].sum()
total_us_ai_2020_2024 = df_us_ai["us_ai_count_sum"].sum()


# Report (tables rounded to two decimals for display only).
print("\n=== US TOTALS ===")
print(df_us.round(2))
print("\nTOTAL US 2020–2024:", total_us_2020_2024)

print("\n=== US + ARTIFICIAL INTELLIGENCE TOTALS ===")
print(df_us_ai.round(2))
print("\nTOTAL US AI 2020–2024:", total_us_ai_2020_2024)


=== US TOTALS ===
   year  us_count_sum  us_pct_change
0  2020       2781693            NaN
1  2021       2687743          -3.38
2  2022       2606712          -3.01
3  2023       2634555           1.07
4  2024       2010773         -23.68

TOTAL US 2020–2024: 12721476

=== US + ARTIFICIAL INTELLIGENCE TOTALS ===
   year  us_ai_count_sum  us_ai_pct_change
0  2020            52759               NaN
1  2021            56289              6.69
2  2022            43193            -23.27
3  2023            44262              2.47
4  2024            35840            -19.03

TOTAL US AI 2020–2024: 232343


In [6]:
import pandas as pd
from pathlib import Path

# NOTE: `data_dir` is not defined in this cell; it comes from an earlier cell.

# No geo filter is applied anywhere in this cell, so every figure is an
# all-geography ("world") total.  The variables are named accordingly so that
# running this cell does not overwrite the US-only results (`us_sums`,
# `df_us`, `total_us_2020_2024`, ...) with world numbers.
world_sums = {}
world_ai_sums = {}

for year in range(2020, 2025):
    file_path = data_dir / f"{year}.csv"
    df = pd.read_csv(file_path)

    # ---- 1. All geographies, all domains ----
    world_sums[year] = df["count"].sum()

    # ---- 2. All geographies, Artificial Intelligence only ----
    world_ai_sums[year] = df.loc[
        df["domain"] == "Artificial Intelligence",
        "count",
    ].sum()


# -----------------------------
# Convert to DataFrames (one row per year, sorted by year)
# -----------------------------

df_world = pd.DataFrame(
    [{"year": y, "world_count_sum": v} for y, v in world_sums.items()]
).sort_values("year")

df_world_ai = pd.DataFrame(
    [{"year": y, "world_ai_count_sum": v} for y, v in world_ai_sums.items()]
).sort_values("year")


# -----------------------------
# % Change (Year-over-Year); the first year has no predecessor -> NaN
# -----------------------------

df_world["world_pct_change"] = df_world["world_count_sum"].pct_change() * 100
df_world_ai["world_ai_pct_change"] = (
    df_world_ai["world_ai_count_sum"].pct_change() * 100
)


# -----------------------------
# Totals over the whole 2020-2024 range
# -----------------------------

total_world_2020_2024 = df_world["world_count_sum"].sum()
total_world_ai_2020_2024 = df_world_ai["world_ai_count_sum"].sum()


# -----------------------------
# Output (tables rounded to two decimals for display only)
# -----------------------------

print("\n=== World TOTALS ===")
print(df_world.round(2))
print("\nTOTAL World 2020–2024:", total_world_2020_2024)

print("\n=== World + ARTIFICIAL INTELLIGENCE TOTALS ===")
print(df_world_ai.round(2))
print("\nTOTAL World AI 2020–2024:", total_world_ai_2020_2024)


=== World TOTALS ===
   year  world_count_sum  world_pct_change
0  2020         15799563               NaN
1  2021         16624113              5.22
2  2022         16495474             -0.77
3  2023         16975827              2.91
4  2024         13624064            -19.74

TOTAL World 2020–2024: 79519041

=== World + ARTIFICIAL INTELLIGENCE TOTALS ===
   year  world_ai_count_sum  world_ai_pct_change
0  2020              297304                  NaN
1  2021              333445                12.16
2  2022              317385                -4.82
3  2023              351768                10.83
4  2024              281277               -20.04

TOTAL World AI 2020–2024: 1581179


In [13]:
# For each yearly file, print the ten "Artificial Intelligence" rows with the
# largest "count".  `data_dir` comes from an earlier cell.  `df` is rebound on
# every pass, so after the loop it holds the filtered frame of the last year.
for year in range(2020, 2025):
    file_path = data_dir / f"{year}.csv"
    df = pd.read_csv(file_path)
    df = df.loc[df["domain"].eq("Artificial Intelligence")]
    print(year)
    print(df.sort_values(by="count", ascending=False).head(10))


2020
                      geo                   domain  count
226    United States (US)  Artificial Intelligence  52759
1797           China (CN)  Artificial Intelligence  44516
1205           India (IN)  Artificial Intelligence  16684
418        Indonesia (ID)  Artificial Intelligence  15949
935   United Kingdom (GB)  Artificial Intelligence  15032
873          Germany (DE)  Artificial Intelligence  12856
952           France (FR)  Artificial Intelligence   9193
1041          Canada (CA)  Artificial Intelligence   8085
1486           Japan (JP)  Artificial Intelligence   7814
669            Italy (IT)  Artificial Intelligence   7298
2021
                      geo                   domain  count
1      United States (US)  Artificial Intelligence  56289
0              China (CN)  Artificial Intelligence  54864
1649           India (IN)  Artificial Intelligence  19556
2051       Indonesia (ID)  Artificial Intelligence  18279
220   United Kingdom (GB)  Artificial Intelligence  17137
627 

In [7]:
# Load a single year's file into `df` for closer inspection.
# `data_dir` comes from an earlier cell.
year = 2023
file_path = data_dir.joinpath(f"{year}.csv")
df = pd.read_csv(file_path)


In [8]:
# Sum of "count" over the rows of `df` whose domain is Artificial Intelligence.
# The bare last expression is what the cell displays.
is_ai = df["domain"].eq("Artificial Intelligence")
df.loc[is_ai, "count"].sum()

np.int64(351768)

In [10]:
# Keep only the Artificial Intelligence rows.  This rebinds `df`, so any cell
# run afterwards sees the filtered frame rather than the full year's data.
df = df.loc[df["domain"].eq("Artificial Intelligence")]

In [12]:
# Show the 20 rows of `df` with the largest "count", biggest first.
df.sort_values(by="count", ascending=False).head(20)

Unnamed: 0,geo,domain,count
2177,China (CN),Artificial Intelligence,72325
39,United States (US),Artificial Intelligence,44262
649,India (IN),Artificial Intelligence,25715
1090,Indonesia (ID),Artificial Intelligence,19194
182,United Kingdom (GB),Artificial Intelligence,14050
1216,Germany (DE),Artificial Intelligence,13910
1358,France (FR),Artificial Intelligence,9729
607,Italy (IT),Artificial Intelligence,8907
1470,Canada (CA),Artificial Intelligence,8222
91,Japan (JP),Artificial Intelligence,8044
