In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; save_as_json
# writes its output files underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import requests
import time
from sklearn.feature_extraction.text import TfidfVectorizer
import kagglehub
import shutil
import os
import json
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import torch
import numpy as np
import yake
import joblib
from itertools import chain
import multiprocessing
from span_marker import SpanMarkerModel

In [None]:
def save_as_json(file_path, data_list, base_dir='/content/drive/My Drive/'):
    """Write ``data_list`` to a pretty-printed, UTF-8 encoded JSON file.

    Args:
        file_path: Output location, interpreted relative to ``base_dir``.
            Leading ``/`` characters are ignored so the file always lands
            under ``base_dir``.
        data_list: Any JSON-serializable object (the notebook passes lists).
        base_dir: Directory the file is written under. Defaults to the
            Google Drive folder this notebook writes to, so existing
            two-argument calls behave as before; pass another directory to
            use the helper outside Colab.

    Raises:
        OSError: If the target cannot be opened for writing (for example
            when ``base_dir`` does not exist).
        TypeError: If ``data_list`` contains values ``json`` cannot encode.
    """
    target = os.path.join(base_dir, file_path.lstrip("/"))
    with open(target, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters as-is instead of
        # emitting \uXXXX escapes.
        json.dump(data_list, f, ensure_ascii=False, indent=4)

In [None]:
# Fetch the arXiv metadata snapshot from Kaggle only when no local copy
# exists yet, so re-running the notebook does not download it again.
if not os.path.exists("./data/arxiv-metadata-oai-snapshot.json"):
    dest_path = "./data"
    # The value returned by dataset_download is treated as a directory; the
    # snapshot file name is appended to it to get the file to relocate.
    temp_path = kagglehub.dataset_download("Cornell-University/arxiv") + "/arxiv-metadata-oai-snapshot.json"

    # Make sure the destination folder exists, then move the file into it.
    os.makedirs(dest_path, exist_ok=True)
    shutil.move(temp_path, dest_path)

In [None]:
file_path = "./data/arxiv-metadata-oai-snapshot.json"
text_summary = []

# The snapshot is parsed one line at a time: each line is a standalone JSON
# record describing a paper.
with open(file_path, "r", encoding="utf-8") as file:
    for line in tqdm(file, desc="reading dataset"):
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads raise on them.
            continue
        data = json.loads(line)
        # Keep only records whose category string starts with 'cs'. Using
        # .get() means a record with a missing or null 'categories' field is
        # skipped like one missing 'title' or 'abstract', rather than raising
        # KeyError.
        if (data.get('categories') or '')[:2] != 'cs':
            continue
        if 'title' not in data or 'abstract' not in data:
            continue
        # One document per paper: title and abstract joined by a space.
        text_summary.append(data['title'] + " " + data['abstract'])

print(len(text_summary))

reading dataset: 2683176it [00:37, 71363.97it/s]

596048





In [None]:
base_url = "https://api.openalex.org/topics?per-page=100"
openalex_topics = {}   # subfield display name -> list of topic display names
kw_topics = set()      # union of the keywords attached to the kept topics
page = 1

# Walk the paginated topics listing until a page comes back without results.
while True:
    url = f"{base_url}&page={page}"
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors. Without this, an error payload that lacks
    # 'results' would be mistaken for the end of the listing and the files
    # saved below would be silently truncated.
    response.raise_for_status()
    resp = response.json()
    if not resp.get('results'):
        print("All Finished")
        break

    for item in resp['results']:
        # Only topics filed under Physical Sciences > Computer Science.
        if item['domain']['display_name'] != 'Physical Sciences':
            continue
        if item['field']['display_name'] != 'Computer Science':
            continue
        topic = item['display_name']
        keyword = item['keywords']

        # Guard against a null/empty keyword list on a topic.
        kw_topics.update(keyword or [])

        subfield = item['subfield']['display_name']

        openalex_topics.setdefault(subfield, []).append(topic)

    page += 1
    # Short pause between requests to stay polite to the API.
    time.sleep(0.2)

print(openalex_topics.keys())
print(len(kw_topics))
# Topics are saved as one flat list (subfield grouping is dropped on disk).
save_as_json("openalex_topics.json", list(chain(*openalex_topics.values())))
save_as_json("kw_topics.json", list(kw_topics))

All Finished
dict_keys(['Information Systems', 'Artificial Intelligence', 'Computer Networks and Communications', 'Hardware and Architecture', 'Computational Theory and Mathematics', 'Computer Vision and Pattern Recognition', 'Signal Processing', 'Computer Science Applications', 'Computer Graphics and Computer-Aided Design', 'Software', 'Human-Computer Interaction'])
2321


In [None]:
# TF-IDF over the title+abstract corpus; the learned vocabulary (feature
# names) is what gets saved as candidate topics.
tfidf_settings = {
    "ngram_range": (1, 3),     # unigrams, bigrams and trigrams
    "max_df": 0.9,             # drop terms present in more than 90% of documents
    "min_df": 5,               # drop terms present in fewer than 5 documents
    "stop_words": 'english',
    "max_features": 2000,      # cap on vocabulary size
    "sublinear_tf": True,
    "use_idf": True,
    "smooth_idf": True,
    "dtype": np.float32,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# NOTE(review): TfidfVectorizer exposes no n_jobs parameter, so this joblib
# backend context probably has no effect on the fit -- confirm before relying
# on it for speed.
with joblib.parallel_backend('threading', n_jobs=-1):
    X = vectorizer.fit_transform(text_summary)

# Collapse the feature names into a set, report its size, and persist it.
tfidf_topics = {*vectorizer.get_feature_names_out()}
print(len(tfidf_topics))
save_as_json("tfidf_topics.json", list(tfidf_topics))

2000


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SpanMarkerModel.from_pretrained("zhang19991111/scibert-spanmarker-STEM-NER").to(device)

bertner_topics = set()

# Run NER on one document at a time so a failure on a single document only
# costs that document.
for summary in tqdm(text_summary, desc="extracting topics", unit="docs"):
    try:
        ner_results = model.predict(summary)
    except Exception as exc:
        # Catch Exception, not a bare `except:`. A bare except also traps
        # KeyboardInterrupt/SystemExit, which makes this long-running loop
        # impossible to stop from the notebook. The reason is reported so the
        # skipped documents can be diagnosed; tqdm.write prints without
        # breaking the progress bar.
        tqdm.write(f"skip ({type(exc).__name__}: {exc})")
        continue
    # Each prediction is indexed by 'span' to obtain the entity text.
    bertner_topics.update(entity['span'] for entity in ner_results)

print(len(bertner_topics))
save_as_json("bertner_topics.json", list(bertner_topics))


extracting topics:   0%|          | 75/596048 [00:22<48:54:58,  3.38docs/s]

skip


extracting topics:   0%|          | 89/596048 [00:26<37:04:38,  4.46docs/s]

skip


extracting topics:   0%|          | 600/596048 [02:58<47:58:50,  3.45docs/s]

skip
skip


extracting topics:   0%|          | 603/596048 [02:58<28:55:59,  5.72docs/s]

skip


extracting topics:   0%|          | 609/596048 [03:00<40:49:37,  4.05docs/s]

In [None]:
# Module-level YAKE extractor shared by extract_keywords: English text,
# n=3, de-duplication limit 0.9, and the top 5 keywords per document.
extractor = yake.KeywordExtractor(top=5, dedupLim=0.9, n=3, lan="en")


def extract_keywords(text):
    """Return the keywords YAKE extracts from ``text``.

    Only element 0 of each item produced by the extractor is kept
    (presumably the keyword string, with the score in the remaining
    position -- confirm against the YAKE documentation).
    """
    return [candidate[0] for candidate in extractor.extract_keywords(text)]

# Spread the per-document keyword extraction across all CPU cores.
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    # imap's default chunksize of 1 ships a single document per task, so
    # inter-process overhead dominates on a corpus this large. Sending
    # documents in chunks cuts that overhead; imap still yields results one
    # by one and in input order, so `results` and the progress total are
    # unchanged.
    results = list(tqdm(pool.imap(extract_keywords, text_summary, chunksize=256), total=len(text_summary), desc="extracting topics", unit="doc"))

# Flatten the per-document keyword lists into one de-duplicated set.
yake_topics = {keyword for result in results for keyword in result}

print(len(yake_topics))
save_as_json("yake_topics.json", list(yake_topics))

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Ask for NumPy output rather than a torch tensor: BERTopic validates that
# precomputed embeddings are a NumPy array (or scipy sparse matrix) and
# rejects anything else, so convert_to_tensor=True made fit_transform fail.
embeddings = model.encode(text_summary, batch_size=32, device=device, convert_to_numpy=True)

# umap_model=None leaves BERTopic on its default dimensionality reduction.
topic_model = BERTopic(umap_model=None)
topics, probs = topic_model.fit_transform(text_summary, embeddings)

print(len(topics))
# NOTE(review): `topics` appears to hold one topic id per input document, not
# topic labels/keywords like the other *_topics.json files -- confirm this is
# the intended content of the file.
save_as_json("topicbert_topics.json", list(topics))