In [1]:
import pandas as pd

In [4]:
overview = pd.read_csv('data/cleaned/common_topics.csv')
overview.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1938 entries, 0 to 1937
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         1938 non-null   int64 
 1   topic              1938 non-null   int64 
 2   count              1938 non-null   int64 
 3   categories_list    1938 non-null   object
 4   descriptive_label  1938 non-null   object
 5   primary_category   1938 non-null   object
dtypes: int64(3), object(3)
memory usage: 91.0+ KB


In [5]:
# reorder and select necessary columns
overview_cols = ['topic','count','descriptive_label','primary_category']
overview = overview[overview_cols]

In [7]:
# percentage of total (labeled) corpus represented by each topic
overview['percent_of_corpus'] = overview['count']/(overview['count'].sum())

In [8]:
overview.head()

Unnamed: 0,topic,count,descriptive_label,primary_category,percent_of_corpus
0,0,2385,Partial Differential Equations: Nonlinear Elli...,math.AP,0.017896
1,1,1350,General Relativity: Black Holes and Spacetime ...,math.MP,0.01013
2,2,1093,Functional Analysis: Banach Spaces and Lattices,math.FA,0.008201
3,3,1038,Coding Theory: Linear Codes over Finite Fields,math.IT,0.007789
4,4,976,Computational Topology: Persistent Homology an...,math.AT,0.007323


In [9]:
# Sanity check: the percentages all add up to 1
overview['percent_of_corpus'].sum()

np.float64(1.0)

In [22]:
overview.to_csv('data/cleaned/common_topics.csv')

In [11]:
papers_df = pd.read_parquet('data/cleaned/papers_by_topic_no_outliers.parquet')
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121391 entries, 0 to 121390
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 121391 non-null  object
 1   title              121391 non-null  object
 2   categories         121391 non-null  object
 3   abstract           121391 non-null  object
 4   update_date        121391 non-null  object
 5   authors_parsed     121391 non-null  object
 6   topic              121391 non-null  int64 
 7   short_label        121391 non-null  object
 8   descriptive_label  121391 non-null  object
dtypes: int64(1), object(8)
memory usage: 8.3+ MB


In [12]:
# Selecting columns for author analysis and count over time analysis
papers_cols = ['id','title','update_date','authors_parsed','topic']
papers_df = papers_df[papers_cols]

In [20]:
# sample dataset for Claude to help me :) 
topic_authors = ['id','topic','authors_parsed']
sample_authors_df = papers_df[topic_authors].head(300)
sample_authors_df.to_csv('data/cleaned/sample_authors_info.csv')

In [21]:
import json
import ast
from collections import Counter, defaultdict

# Load data from the parquet file
df = pd.read_parquet('data/cleaned/papers_by_topic_no_outliers.parquet')

# Function to parse authors_parsed column
def parse_authors(authors_data):
    try:
        # If it's a string, evaluate it to a list
        if isinstance(authors_data, str):
            authors_list = ast.literal_eval(authors_data)
        else:
            authors_list = authors_data
        
        # Format authors as "Last Name, First Name"
        return [f"{author[0]}, {author[1]}" for author in authors_list if len(author) >= 2]
    except:
        return []

# Count authors by topic
author_counts = defaultdict(Counter)

# Process each paper
for _, row in df.iterrows():
    topic = row['topic']
    authors = parse_authors(row['authors_parsed'])
    
    # Count each author for this topic
    for author in authors:
        author_counts[topic][author] += 1

# Prepare the final result structure
result = {
    "meta": {
        "description": "Top authors by topic for Math Research Compass",
        "topic_count": len(author_counts)
    },
    "top_authors_by_topic": {}
}

# Load topic metadata if available
try:
    topics_df = pd.read_csv('data/cleaned/common_topics.csv')
    topics_metadata = {}
    for _, row in topics_df.iterrows():
        topics_metadata[row['topic']] = {
            "descriptive_label": row.get('descriptive_label', f"Topic {row['topic']}"),
            "primary_category": row.get('primary_category', '')
        }
except:
    topics_metadata = {}

# Get top 10 authors for each topic
for topic, counter in author_counts.items():
    # Get top 10 authors
    top_authors = [
        {"name": author, "count": count}
        for author, count in counter.most_common(10)
    ]
    
    # Add topic metadata if available
    if topic in topics_metadata:
        result["top_authors_by_topic"][str(topic)] = {
            "topic_label": topics_metadata[topic]["descriptive_label"],
            "primary_category": topics_metadata[topic]["primary_category"],
            "authors": top_authors
        }
    else:
        result["top_authors_by_topic"][str(topic)] = {
            "topic_label": f"Topic {topic}",
            "authors": top_authors
        }

# Save to JSON
with open('results/topics/top_authors_by_topic.json', 'w') as f:
    json.dump(result, f, indent=2)

print(f"Saved top authors data for {len(author_counts)} topics")

Saved top authors data for 1938 topics


In [14]:
import pandas
import json

#author_df = pd.read_json("results/topics/top_authors_by_topic.json")

with open('results/topics/top_authors_by_topic.json', 'r') as f:
        topic_authors = json.load(f)

In [16]:
topic_authors

{'meta': {'description': 'Top authors by topic for Math Research Compass',
  'topic_count': 1938},
 'top_authors_by_topic': {'66': {'topic_label': 'Algebra: Structure and Classification of Lie Superalgebras and Nilpotent Lie Algebras',
   'primary_category': 'math.RA',
   'authors': [{'name': 'Bouarroudj, Sofiane', 'count': 16},
    {'name': 'Niroomand, Peyman', 'count': 14},
    {'name': 'Leites, Dimitry', 'count': 12},
    {'name': 'Kaygorodov, Ivan', 'count': 10},
    {'name': 'Niroomand, P.', 'count': 10},
    {'name': 'Upadhyay, Sumit Kumar', 'count': 9},
    {'name': 'Kumar, Amit', 'count': 8},
    {'name': 'Lebedev, Alexei', 'count': 7},
    {'name': 'Shchepochkina, Irina', 'count': 7},
    {'name': 'Padhan, Rudra Narayan', 'count': 7}]},
  '1479': {'topic_label': 'Geometric Topology: Properties of Jordan Curves',
   'primary_category': 'math.MG',
   'authors': [{'name': 'Greene, Joshua Evan', 'count': 4},
    {'name': 'Lobb, Andrew', 'count': 4},
    {'name': 'Matschke, Benjami

In [21]:
topic_authors["top_authors_by_topic"]['1560']["authors"]

[{'name': 'Massey, David B.', 'count': 3},
 {'name': 'Liu, Yongqiang', 'count': 2},
 {'name': 'Maxim, Laurentiu', 'count': 2},
 {'name': 'Wang, Botong', 'count': 2},
 {'name': 'Gouttard, Valentin', 'count': 1},
 {'name': 'Dyckerhoff, Tobias', 'count': 1},
 {'name': 'Kapranov, Mikhail', 'count': 1},
 {'name': 'Soibelman, Yan', 'count': 1},
 {'name': 'Hennecart, Lucien', 'count': 1},
 {'name': 'Goresky, Mark', 'count': 1}]

# Topic wordclouds 

In [6]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import json


In [8]:
file_path = "results/topics/topic_keywords_20250509_221839.json"

# Read the JSON file
with open(file_path, 'r') as f:
    topic_keywords = json.load(f)


In [10]:
topic_keywords["0"]

[['weak solutions', 0.4914427697658539],
 ['laplacian', 0.4874769151210785],
 ['nonlinear elliptic', 0.47341063618659973],
 ['elliptic equations', 0.449662983417511],
 ['quasilinear elliptic', 0.4248131811618805],
 ['sobolev', 0.3979707956314087],
 ['laplace', 0.35457226634025574],
 ['free boundary', 0.3421449661254883],
 ['positive solutions', 0.32754287123680115],
 ['elliptic', 0.3167794644832611]]

In [11]:
import json
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np

# Step 1: Load your topic keywords JSON file
# Find the most recent topic_keywords file
topic_dir = "results/topics/"
files = sorted([f for f in os.listdir(topic_dir) if f.startswith("topic_keywords_")], reverse=True)
if not files:
    print("No topic_keywords files found!")
    exit(1)

latest_file = os.path.join(topic_dir, files[0])
print(f"Loading keywords from: {latest_file}")

with open(latest_file, 'r') as f:
    topic_keywords = json.load(f)

# Step 2: Create output directory for wordclouds
output_dir = "results/wordclouds"
os.makedirs(output_dir, exist_ok=True)

# Step 3: Generate a wordcloud for each topic
for topic_id, keywords in topic_keywords.items():
    # Convert keywords to dictionary format {word: weight}
    word_dict = {word: weight for word, weight in keywords}
    
    # Create the wordcloud
    wordcloud = WordCloud(
        width=800, 
        height=400,
        background_color='white',
        colormap='viridis',
        max_words=30,
        prefer_horizontal=0.9,
        contour_width=1,
        contour_color='steelblue'
    ).generate_from_frequencies(word_dict)
    
    # Plot and save the wordcloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    
    # Get top words for the title
    top_words = [keywords[i][0] for i in range(min(3, len(keywords)))]
    title = f"Topic {topic_id}: {', '.join(top_words)}"
    plt.title(title, fontsize=16)
    
    # Save the figure
    output_file = os.path.join(output_dir, f"wordcloud_topic_{topic_id}.png")
    plt.tight_layout()
    plt.savefig(output_file, dpi=300)
    plt.close()
    
    print(f"Generated wordcloud for Topic {topic_id}")

print(f"Wordclouds saved to {output_dir}")

Loading keywords from: results/topics/topic_keywords_20250509_221839.json
Generated wordcloud for Topic 0
Generated wordcloud for Topic 1
Generated wordcloud for Topic 2
Generated wordcloud for Topic 3
Generated wordcloud for Topic 4
Generated wordcloud for Topic 5
Generated wordcloud for Topic 6
Generated wordcloud for Topic 7
Generated wordcloud for Topic 8
Generated wordcloud for Topic 9
Generated wordcloud for Topic 10
Generated wordcloud for Topic 11
Generated wordcloud for Topic 12
Generated wordcloud for Topic 13
Generated wordcloud for Topic 14
Generated wordcloud for Topic 15
Generated wordcloud for Topic 16
Generated wordcloud for Topic 17
Generated wordcloud for Topic 18
Generated wordcloud for Topic 19
Generated wordcloud for Topic 20
Generated wordcloud for Topic 21
Generated wordcloud for Topic 22
Generated wordcloud for Topic 23
Generated wordcloud for Topic 24
Generated wordcloud for Topic 25
Generated wordcloud for Topic 26
Generated wordcloud for Topic 27
Generated wo

# Representative Articles

In [23]:
import pandas as pd

docs_df = pd.read_parquet('results/topics/papers_by_topic_no_outliers.parquet')
docs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121391 entries, 0 to 121390
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 121391 non-null  object
 1   title              121391 non-null  object
 2   categories         121391 non-null  object
 3   abstract           121391 non-null  object
 4   update_date        121391 non-null  object
 5   authors_parsed     121391 non-null  object
 6   topic              121391 non-null  int64 
 7   short_label        121391 non-null  object
 8   descriptive_label  121391 non-null  object
dtypes: int64(1), object(8)
memory usage: 8.3+ MB


In [25]:
# Select and reorder relevant columns
doc_cols = ['id','topic','title','authors_parsed','abstract','update_date']
docs_df = docs_df[doc_cols]

In [28]:
docs_df['url'] = "https://arxiv.org/abs/" + docs_df['id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  docs_df['url'] = "https://arxiv.org/abs/" + docs_df['id']


In [29]:
docs_df.head()

Unnamed: 0,id,topic,title,authors_parsed,abstract,update_date,url
0,1001.0176,66,A note on the Schur multiplier of a nilpotent ...,"[['Niroomand', 'Peyman', '', 'Damghan Universi...",For a nilpotent Lie algebra $L$ of dimension...,2021-05-24,https://arxiv.org/abs/1001.0176
1,1001.0186,1479,On the Square Peg Problem and Its Relatives,"[['Matschke', 'Benjamin', '']]",Toeplitz's Square Peg Problem asks whether e...,2022-03-21,https://arxiv.org/abs/1001.0186
2,1001.0462,695,Representation Theory of Finite Groups,"[['Singh', 'Anupam', '']]",The point of view of these notes on the topi...,2022-12-22,https://arxiv.org/abs/1001.0462
3,1001.0608,1818,An Efficient Quantum Algorithm for some Instan...,"[['Gall', 'François Le', '']]",In this paper we consider the problem of tes...,2021-10-05,https://arxiv.org/abs/1001.0608
4,1001.0872,94,"Symmetry, Conserved Charges, and Lax Represent...","[['Papachristou', 'C. J.', '']]",A certain non-Noetherian connection between ...,2024-08-29,https://arxiv.org/abs/1001.0872


In [30]:
# First, let's extract and clean up the authors field
def format_authors(authors_str):
    try:
        # Parse the JSON string
        authors_parsed = json.loads(authors_str.replace("'", "\""))
        
        # If it's a list of lists (common format in ArXiv data), extract the names
        if isinstance(authors_parsed, list) and authors_parsed and isinstance(authors_parsed[0], list):
            # Format: [["Last Name", "First Name"], ["Last Name", "First Name"], ...]
            author_names = [f"{first} {last}" for last, first in authors_parsed]
            return ", ".join(author_names[:3]) + ("..." if len(author_names) > 3 else "")
        # If it's a simple list of names
        elif isinstance(authors_parsed, list):
            return ", ".join(authors_parsed[:3]) + ("..." if len(authors_parsed) > 3 else "")
        else:
            return str(authors_parsed)
    except:
        # If parsing fails, return as is
        return authors_str

# Apply the function to format authors
docs_df['authors'] = docs_df['authors_parsed'].apply(format_authors)

# Convert update_date to a proper datetime format
docs_df['date'] = pd.to_datetime(docs_df['update_date'])

# Create a more compact version with just what we need for the UI
compact_docs_df = docs_df[['id', 'topic', 'title', 'authors', 'date']].copy()

# Add the URL column
compact_docs_df['url'] = "https://arxiv.org/abs/" + compact_docs_df['id']

# Save this compact dataset for quicker loading in the app
compact_docs_df.to_csv('data/cleaned/compact_docs_with_topics.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  docs_df['authors'] = docs_df['authors_parsed'].apply(format_authors)


#

In [31]:
compact_docs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121391 entries, 0 to 121390
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   id       121391 non-null  object        
 1   topic    121391 non-null  int64         
 2   title    121391 non-null  object        
 3   authors  121391 non-null  object        
 4   date     121391 non-null  datetime64[ns]
 5   url      121391 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 5.6+ MB


In [33]:
sample_compact = compact_docs_df.head(300)

In [34]:
sample_compact = sample_compact.to_csv('data/cleaned/sample_compact_docs_with_topics.csv')

In [38]:
import ast

def format_authors(authors_str):
        
        if pd.isna(authors_str) or not authors_str:
            return "N/A Authors"
        try:
            authors_list_of_lists = ast.literal_eval(authors_str)
            formatted_names = []
            for author_parts in authors_list_of_lists:
                if isinstance(author_parts, list) and len(author_parts) >= 2:
                    last_name = author_parts[0].strip()
                    first_name = author_parts[1].strip()
                    # Middle initial/other parts can be added if needed and consistently present
                    # For now, just first and last name
                    if first_name and last_name:
                        formatted_names.append(f"{first_name} {last_name}")
                    elif last_name: # Only last name
                        formatted_names.append(last_name)
                    elif first_name: # Only first name (less likely for academic papers)
                        formatted_names.append(first_name)
            return ", ".join(formatted_names) if formatted_names else "N/A Authors"
        except (ValueError, SyntaxError, TypeError) as e:
            # If parsing fails, return the original string or an error message
            print(f"Error parsing authors string '{authors_str}': {e}")
            return authors_str # Or "Error parsing authors"

In [39]:
compact_docs_df['authors_formatted'] = compact_docs_df['authors'].apply(format_authors)

In [40]:
compact_docs_df.head()

Unnamed: 0,id,topic,title,authors,date,url,authors_formatted
0,1001.0176,66,A note on the Schur multiplier of a nilpotent ...,"[['Niroomand', 'Peyman', '', 'Damghan Universi...",2021-05-24,https://arxiv.org/abs/1001.0176,"Peyman Niroomand, Francesco G. Russo"
1,1001.0186,1479,On the Square Peg Problem and Its Relatives,"[['Matschke', 'Benjamin', '']]",2022-03-21,https://arxiv.org/abs/1001.0186,Benjamin Matschke
2,1001.0462,695,Representation Theory of Finite Groups,"[['Singh', 'Anupam', '']]",2022-12-22,https://arxiv.org/abs/1001.0462,Anupam Singh
3,1001.0608,1818,An Efficient Quantum Algorithm for some Instan...,"[['Gall', 'François Le', '']]",2021-10-05,https://arxiv.org/abs/1001.0608,François Le Gall
4,1001.0872,94,"Symmetry, Conserved Charges, and Lax Represent...","[['Papachristou', 'C. J.', '']]",2024-08-29,https://arxiv.org/abs/1001.0872,C. J. Papachristou


In [41]:
compact_docs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121391 entries, 0 to 121390
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   id                 121391 non-null  object        
 1   topic              121391 non-null  int64         
 2   title              121391 non-null  object        
 3   authors            121391 non-null  object        
 4   date               121391 non-null  datetime64[ns]
 5   url                121391 non-null  object        
 6   authors_formatted  121391 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 6.5+ MB
