In [38]:
import numpy as np

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

from file_process import load_corpus
import pandas as pd
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.stats import t
from openpyxl import load_workbook
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim

In [2]:
#load file
chinese_folder = "../data/chinese/"
english_folder = "../data/english/"

In [3]:
chinese_df = load_corpus(chinese_folder,lang="chinese")
english_df = load_corpus(english_folder,lang="english")


Processing genre: history_fiction
  Success: 1. 长安十二时辰 (马伯庸) (Z-Library).txt                    → 7,570 words
  Success: 10. 历史的裂变：中国历史上的十三场政变（畅销书《大唐兴亡三百年》作者王觉仁力作，用小说笔法，讲述 → 4,591 words
  Success: 2. 风起陇西 (马伯庸 [马伯庸]) (Z-Library).txt                → 4,151 words
  Success: 3. 隋乱 (酒徒) (Z-Library).txt                         → 24,246 words
  Success: 4. 新宋 (阿越) (Z-Library).txt                         → 41,427 words
  Success: 5. 步步生莲 (月关) (Z-Library).txt                       → 45,475 words
  Success: 6. 宰执天下 (cuslaa) (Z-Library).txt                   → 138,306 words
  Success: 7. 窃明 (灰熊猫) (Z-Library).txt                        → 18,725 words
  Success: 8. 四时歌：骑桶人自选集 (骑桶人) (Z-Library).txt                → 2,211 words
  Success: 9. 辛亥：计划外革命 (雪珥) (Z-Library).txt                   → 1,649 words

Processing genre: horror
  Success: 1. 精绝古城 (天下霸唱) (Z-Library).txt                     → 2,762 words
  Success: 10. 死亡通知单系列（套装5本）（死亡通知单+死亡通知单之离别曲（上）+死亡通知单之离别曲（下）+ → 17,152 words
  Success: 2. 鬼吹灯之龙岭

In [2]:
english_df = pd.read_excel('english_df.xlsx')
chinese_df = pd.read_excel('chinese_df.xlsx')

In [8]:
genres = english_df.genre.unique()

In [32]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "FacebookAI/xlm-roberta-base",   # or -large
    num_labels=len(genres)   # ← this is the only thing you set
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You sho

In [33]:
# Prepare English-only samples and labels
if "language" in english_df.columns:
    english_texts = english_df[english_df["language"].str.lower().str.startswith("en")].reset_index(drop=True)
english_texts = english_texts.dropna(subset=["genre"]).reset_index(drop=True)
if len(english_texts) == 0:
    raise ValueError("No English samples found for classification.")



In [34]:
# Create label dictionaries
label2id = {g: i for i, g in enumerate(sorted(english_texts["genre"].unique()))}
id2label = {v: k for k, v in label2id.items()}
num_labels = len(label2id)
num_labels

4

In [35]:
# Dataset for genre classification reading text directly from DataFrame
class GenreDataset(Dataset):
    def __init__(self, df, tokenizer, label2id, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row.get("text", "") or ""
# pull raw text from DataFrame
        if not isinstance(text, str):
            text = str(text)
        text = text.strip()
        if not text:
            text = " "
# avoid empty strings for tokenizer
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors="pt"
        )
        encoded = {k: v.squeeze(0) for k, v in encoded.items()}
        encoded["labels"] = torch.tensor(self.label2id[row["genre"]], dtype=torch.long)
        return encoded

# Instantiate dataset and collator
dataset = GenreDataset(english_texts, tokenizer, label2id, max_length=256)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
len(dataset)

40

In [None]:
# Add optimizer (e.g., Adam, SGD, AdamW, etc.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [40]:
# 80/20 train/validation split
total_samples = len(dataset)
if total_samples < 2:
    raise ValueError("Need at least 2 English samples for an 80/20 split.")
train_size = max(1, int(0.8 * total_samples))
val_size = total_samples - train_size
if val_size == 0:
    val_size = 1
    train_size = total_samples - 1
generator = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=generator)
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_ds, batch_size=8, shuffle=False, collate_fn=data_collator)
len(train_ds), len(val_ds)

(32, 8)

In [45]:
# Simple training + validation loop with scheduler and grad clipping
num_epochs = 5
total_steps = max(1, num_epochs * len(train_loader))
warmup_steps = max(1, int(0.1 * total_steps))

optimizer = optim.Adam(model.parameters(), lr=1e-5)
scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2,  min_lr=1e-8
)
max_grad_norm = 1.0

best_val_loss = float("inf")
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        
        optimizer.zero_grad()
        train_loss += loss.item()
    avg_train_loss = train_loss / max(len(train_loader), 1)

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = outputs.logits.argmax(dim=-1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)
    avg_val_loss = val_loss / max(len(val_loader), 1)
    scheduler.step(avg_val_loss)
    val_acc = correct / max(total, 1)

    # Track best for quick eyeballing
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss

    print(
        f"Epoch {epoch + 1}: train_loss={avg_train_loss:.4f} val_loss={avg_val_loss:.4f} val_acc={val_acc:.3f} best_val_loss={best_val_loss:.4f}"
    )

Epoch 1: train_loss=1.4273 val_loss=1.5567 val_acc=0.000 best_val_loss=1.5567
Epoch 2: train_loss=1.3801 val_loss=1.5472 val_acc=0.000 best_val_loss=1.5472
Epoch 2: train_loss=1.3801 val_loss=1.5472 val_acc=0.000 best_val_loss=1.5472


KeyboardInterrupt: 