In [3]:
import json
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [4]:
# Load every dataset file from the local "datasets" directory. Each file is
# read line by line, with one JSON document per line, and every parsed record
# is appended to `data`.
data = []
datasets = ["dblp-ref-0.json"]


for dataset in datasets:
    # Decode explicitly as UTF-8: without `encoding=` Python falls back to a
    # platform-dependent default, and the records contain non-ASCII text
    # (e.g. accented author names).
    with open(Path("datasets") / dataset, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (such as a trailing empty line at the end of
            # the file); json.loads would raise on them.
            if not line.strip():
                continue
            data.append(json.loads(line))

In [5]:
# Build one DataFrame row per loaded record, then print a preview of the
# first rows followed by the (rows, columns) shape.
df = pd.DataFrame(data)
for summary in (df.head(), df.shape):
    print(summary)

                                            abstract  \
0  The purpose of this study is to develop a lear...   
1  This paper describes the design and implementa...   
2  This article applied GARCH model instead AR or...   
3                                                NaN   
4                                                NaN   

                                             authors  n_citation  \
0  [Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...           0   
1                        [Gareth Beale, Graeme Earl]          50   
2  [Altaf Hossain, Faisal Zaman, Mohammed Nasser,...          50   
3  [Jea-Bum Park, Byungmok Kim, Jian Shen, Sun-Yo...           0   
4                [Giovanna Guerrini, Isabella Merlo]           2   

                                          references  \
0  [51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...   
1  [10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...   
2  [2d84c0f2-e656-4ce7-b018-90eda1c132fe, a083a1b...   
3  [8c78e4b0-632b-4293-b491-85

In [6]:
# Remove entries with missing or empty title or venue
required_cols = ["title", "venue"]

# Take an explicit copy so the column assignments below write to a frame we
# own, instead of to the result of a filter on the previous `df` (the
# chained-assignment pattern that triggers SettingWithCopyWarning).
df = df.dropna(subset=required_cols).copy()

# Normalize the required columns to stripped strings.
for col in required_cols:
    df[col] = df[col].astype(str).str.strip()

# Keep only rows where every required column is non-empty after stripping.
mask_non_empty = (df[required_cols] != "").all(axis=1)

# Remove entries where venue contains "arxiv"
mask_arxiv = df["venue"].str.contains("arxiv", case=False, na=False)
df = df[mask_non_empty & ~mask_arxiv].copy()

# Filter venues with at least min_papers_per_venue papers
min_papers_per_venue = 1000

venue_counts = df["venue"].value_counts()
selected_venues = venue_counts[venue_counts >= min_papers_per_venue].index

# Papers belonging to the selected venues only; `df` itself keeps all
# cleaned papers.
venue_df = df[df["venue"].isin(selected_venues)].copy()


print(df.shape)
print(df.head())

(767565, 8)
                                            abstract  \
0  The purpose of this study is to develop a lear...   
1  This paper describes the design and implementa...   
2  This article applied GARCH model instead AR or...   
5                                                NaN   
7                                                NaN   

                                             authors  n_citation  \
0  [Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...           0   
1                        [Gareth Beale, Graeme Earl]          50   
2  [Altaf Hossain, Faisal Zaman, Mohammed Nasser,...          50   
5  [Rafael Álvarez, Leandro Tortosa, José-Francis...           0   
7   [Guzin Ulutas, Mustafa Ulutas, Vasif V. Nabiyev]           0   

                                          references  \
0  [51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...   
1  [10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...   
2  [2d84c0f2-e656-4ce7-b018-90eda1c132fe, a083a1b...   
5                 

In [7]:
# Each paper's text is its title followed by a single space.
venue_df["text"] = venue_df["title"] + " "

# Build one document per venue by joining the texts of all its papers.
venue_text = venue_df.groupby("venue")["text"].apply(" ".join).reset_index()

# TF-IDF over the venue documents, with English stop words removed and the
# vocabulary capped at 1000 terms.
tfidf = TfidfVectorizer(stop_words="english", max_features=1000)
X_venue = tfidf.fit_transform(venue_text["text"])

feature_names = tfidf.get_feature_names_out()
print(feature_names)

# Drop venues whose TF-IDF row sums to zero, i.e. venues with no weight on
# any retained term. The matrix rows and the venue table are filtered with
# the same mask so they stay aligned.
row_sums = np.asarray(X_venue.sum(axis=1)).ravel()
non_empty_mask = row_sums > 0

X_venue = X_venue[non_empty_mask]
venue_text = venue_text.loc[non_empty_mask].reset_index(drop=True)

print(X_venue.shape)

['2d' '3d' '802' 'abstract' 'access' 'accuracy' 'accurate' 'acoustic'
 'acquisition' 'action' 'active' 'activity' 'ad' 'adaptation' 'adaptive'
 'advanced' 'agent' 'agents' 'aggregation' 'aided' 'algebra' 'algebraic'
 'algorithm' 'algorithms' 'alignment' 'allocation' 'analog' 'analysis'
 'analyzing' 'annotation' 'ant' 'antenna' 'application' 'applications'
 'applied' 'applying' 'approach' 'approaches' 'approximate'
 'approximation' 'arbitrary' 'architecture' 'architectures' 'area' 'array'
 'arrays' 'artificial' 'aspects' 'assembly' 'assessment' 'assignment'
 'assisted' 'association' 'asymptotic' 'asynchronous' 'attribute' 'audio'
 'augmented' 'authentication' 'automata' 'automated' 'automatic'
 'autonomous' 'aware' 'balancing' 'band' 'bandwidth' 'base' 'based'
 'basis' 'bayesian' 'beamforming' 'behavior' 'behaviour' 'belief' 'best'
 'binary' 'bit' 'blind' 'block' 'body' 'boolean' 'bound' 'boundary'
 'bounded' 'bounds' 'brain' 'brief' 'broadcast' 'building' 'business'
 'cache' 'calculus'

In [None]:
# Convert sparse TF-IDF matrix to dense
X_dense = X_venue.toarray()

# Z-score normalization (mean 0, variance 1 for each feature)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_dense)

# Number of K-means clusters; used for the single clustering run below.
k = 30

# 2-D PCA fitted directly on the standardized features. Its projection is not
# stored: the pc1/pc2 coordinates come from `pca_vis` further down. The fitted
# estimator is kept only so the name `pca` stays available for inspection.
pca = PCA(n_components=2, random_state=42).fit(X_scaled)

# Reduce to (at most) 50 dims for clustering. PCA cannot extract more
# components than min(n_samples, n_features), so clamp the target to avoid a
# ValueError when fewer than 50 venues survive the filtering.
n_components_cluster = min(50, *X_scaled.shape)
pca_50 = PCA(n_components=n_components_cluster, random_state=42)
X_pca_50 = pca_50.fit_transform(X_scaled)

# Cluster once, in the reduced space. Fitting K-means on the full
# standardized matrix first would be wasted work, because those labels would
# be overwritten by the ones computed here.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_pca_50)

# Attach the final cluster labels back to the venues and preview them.
venue_text["cluster"] = cluster_labels
print(venue_text.head())

# Then a second PCA to 2D just for plotting:
pca_vis = PCA(n_components=2, random_state=42)
X_pca_2d = pca_vis.fit_transform(X_pca_50)

# Store coordinates if you want to inspect later
venue_text["pc1"] = X_pca_2d[:, 0]
venue_text["pc2"] = X_pca_2d[:, 1]

