In [2]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("kilian-group/arxiv-classifier", "default")

df = ds["train"].to_pandas()

df_clean = df.dropna(subset=['abstract', 'title']).copy()
df_clean = df_clean[df_clean['abstract'].apply(len) > 50] # Hapus abstrak terlalu pendek

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

# --- 1. SETUP & RESOURCE ---
# Download resource NLTK (Hanya jika belum ada)
try:
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

# Inisialisasi tools cleaning
lemmatizer = WordNetLemmatizer()
custom_stopwords = set([
    'based', 'proposed', 'using', 'paper', 'data', 'results', 'method', 
    'model', 'approach', 'analysis', 'study', 'performance', 'new', 
    'presented', 'show', 'demonstrate', 'investigate'
])
stop_words = set(stopwords.words('english')) | custom_stopwords

# --- 2. DEFINISI FUNGSI ---

def clean_text_classifier(text):
    """
    Fungsi cleaning khusus untuk Classifier (Router).
    Fokus: Hapus noise, pertahankan sinyal matematika ([EQ]).
    """
    if not isinstance(text, str): return ""
    
    # 1. Ganti rumus LaTeX ($...$) dengan token khusus [EQ]
    text = re.sub(r'\$.*?\$', ' [EQ] ', text)
    
    # 2. Lowercase
    text = text.lower()
    
    # 3. Hapus karakter non-alfabet (kecuali token [EQ] dan spasi)
    text = re.sub(r'[^a-z\s\[\]]', ' ', text)
    
    # 4. Tokenisasi, Stopwords, & Lemmatization
    words = text.split()
    cleaned_words = [
        lemmatizer.lemmatize(w) 
        for w in words 
        if w not in stop_words and len(w) > 2
    ]
    
    return " ".join(cleaned_words)

def preprocess_batch(df):
    """
    Wrapper untuk menerapkan preprocessing ke dataframe (Train atau Test).
    """
    df = df.copy()
    
    # A. FEATURE ENGINEERING (Pada Raw Text)
    # Hitung fitur sebelum teks dibersihkan/hilang
    df['num_latex'] = df['abstract'].astype(str).apply(lambda x: x.count('$'))
    df['text_len'] = df['abstract'].astype(str).apply(len)
    
    # B. TEXT CLEANING
    print("   > Membersihkan teks...")
    df['clean_abstract'] = df['abstract'].astype(str).apply(clean_text_classifier)
    
    # C. FITUR GABUNGAN (Text + Hand-crafted Features)
    # Menambahkan sinyal 'high_math_density' jika banyak rumus
    df['final_text'] = df['clean_abstract'] + \
                       np.where(df['num_latex'] > 5, ' high_math_density', '')
    
    return df

# --- 3. EKSEKUSI PIPELINE (SPLIT DULU!) ---

# Asumsi: df_clean adalah dataframe awal Anda yang sudah membuang baris null
# df_clean = ... (Load data Anda di sini)

print("1. Melakukan Split Data (Stratified)...")
# Kita split RAW data dulu untuk mencegah kebocoran informasi
X_raw = df_clean.drop(columns=['field']) # Fitur mentah
y_raw = df_clean['field']               # Label mentah

X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw, 
    y_raw,
    test_size=0.2, 
    stratify=y_raw, # Menjaga proporsi kelas
    random_state=42
)

print(f"   Data Train: {len(X_train_raw)}")
print(f"   Data Val  : {len(X_val_raw)}")

# --- 4. TERAPKAN PREPROCESSING TERPISAH ---

print("\n2. Memproses Data TRAIN...")
X_train_processed = preprocess_batch(X_train_raw)

print("3. Memproses Data VAL...")
X_val_processed = preprocess_batch(X_val_raw)

# --- 5. LABEL ENCODING & CLASS WEIGHTS ---

print("\n4. Menyiapkan Label & Bobot...")
le = LabelEncoder()

# Fit hanya pada TRAIN, lalu transform ke keduanya
y_train_enc = le.fit_transform(y_train_raw)
y_val_enc = le.transform(y_val_raw) # Gunakan mapping yang sama dengan train

# Simpan mapping
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

# Hitung Class Weights (Hanya dari data TRAIN)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_enc),
    y=y_train_enc
)
class_weight_dict = dict(zip(np.unique(y_train_enc), class_weights))

print("\n✅ PIPELINE SELESAI!")
print("Label Mapping:", label_mapping)
print("\nContoh Class Weights (Train Data):")
for label_name in list(label_mapping.keys())[:5]:
    idx = label_mapping[label_name]
    print(f"  - {label_name}: {class_weight_dict[idx]:.4f}")

# --- OUTPUT DATA SIAP PAKAI ---
# X_train_processed['final_text'] -> Input Teks untuk Model
# X_train_processed['num_latex']  -> Input Numerik Tambahan (jika pakai arsitektur hybrid)
# y_train_enc                     -> Target Label

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


1. Melakukan Split Data (Stratified)...
   Data Train: 86941
   Data Val  : 21736

2. Memproses Data TRAIN...
   > Membersihkan teks...
3. Memproses Data VAL...
   > Membersihkan teks...

4. Menyiapkan Label & Bobot...

✅ PIPELINE SELESAI!
Label Mapping: {'astro-ph': np.int64(0), 'cond-mat': np.int64(1), 'cs': np.int64(2), 'econ': np.int64(3), 'eess': np.int64(4), 'gr-qc': np.int64(5), 'hep-ex': np.int64(6), 'hep-lat': np.int64(7), 'hep-ph': np.int64(8), 'hep-th': np.int64(9), 'math': np.int64(10), 'math-ph': np.int64(11), 'nlin': np.int64(12), 'nucl-ex': np.int64(13), 'nucl-th': np.int64(14), 'physics': np.int64(15), 'q-bio': np.int64(16), 'q-fin': np.int64(17), 'quant-ph': np.int64(18), 'stat': np.int64(19)}

Contoh Class Weights (Train Data):
  - astro-ph: 0.9725
  - cond-mat: 0.7554
  - cs: 0.1949
  - econ: 12.6736
  - eess: 1.8128


In [8]:
from huggingface_hub import HfApi
import pandas as pd
from io import BytesIO

api = HfApi()

# create csv
# df_train = pd.concat([X_train_raw, y_train_raw], axis=1)
# df_train.to_csv("raw_train.csv", index=False)

# df_test = pd.concat([X_val_raw, y_val_raw], axis=1)
# df_test.to_csv("raw_test.csv", index=False)

repo_id = "ALAN43/NLP-PAPERMATCH"

# --- TRAIN ---
buffer_train = BytesIO()
df_train = pd.concat([X_train_raw, y_train_raw], axis=1)
df_train.to_csv(buffer_train, index=False)
buffer_train.seek(0)

api.upload_file(
    path_or_fileobj=buffer_train,
    path_in_repo="raw_train.csv",
    repo_id=repo_id,
    repo_type="dataset"
)

# --- TEST ---
buffer_test = BytesIO()
df_test = pd.concat([X_val_raw, y_val_raw], axis=1)
df_test.to_csv(buffer_test, index=False)
buffer_test.seek(0)

api.upload_file(
    path_or_fileobj=buffer_test,
    path_in_repo="raw_test.csv",
    repo_id=repo_id,
    repo_type="dataset"
)

print("Uploaded directly without saving any local files!")

print("Upload complete!")

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


KeyboardInterrupt: 

In [None]:
# df_concat_train = pd.concat([X_train_processed['final_text'], y_train_enc], axis=1)
# # X_train_processed['final_text'] -> Input Teks untuk Model
# df_concat_train.to_csv('./topic_classification/topic_classification_train.csv')
# df_concat_test = pd.concat([X_val_processed['final_text'], y_val_enc], axis=1)
# df_concat_test.to_csv('./topic_classification/topic_classification_test.csv')

In [None]:
import json

with open("label_mapping.json", "w") as f:
    json.dump(label_mapping, f, indent=4)


with open("class_weight.json", "w") as f:
    json.dump(label_mapping, f, indent=4)