In [1]:
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch
import numpy as np

In [2]:
ds_ag = load_dataset("sh0416/ag_news", split="train[:10%]")

In [3]:
ds_ag

Dataset({
    features: ['label', 'title', 'description'],
    num_rows: 12000
})

In [4]:
ag_map = {1: "politics", 2: "sports", 3: "business", 4: "tech"}
def process_ag(batch):
    batch["label_description"] = [ag_map[label] for label in batch["label"]]
    return batch


In [5]:
combined_dataset = ds_ag.map(process_ag, batched=True)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [6]:
combined_dataset[0]

{'label': 3,
 'title': 'Wall St. Bears Claw Back Into the Black (Reuters)',
 'description': "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label_description': 'business'}

In [7]:
combined_dataset[-1]

{'label': 4,
 'title': 'Linux Desktop KDE Plans Google Style Search',
 'description': 'Linux Desktop KDE Plans Google Style Search\\\\Open-source Linux desktop environment KDE developers have announced plans to make searching for files on the KDE desktop more simple by adding a Google-style search feature. CNet reports that the next version of KDE, which will either be called 3.4 or 4, and is ...',
 'label_description': 'tech'}

In [8]:
combined_dataset[1]

{'label': 3,
 'title': 'Carlyle Looks Toward Commercial Aerospace (Reuters)',
 'description': 'Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.',
 'label_description': 'business'}

In [9]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

sentiment_pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
bias_pipe = pipeline("text-classification", model="newsmediabias/UnBIAS-classifier", device=device)
subjectivity_pipe = pipeline("text-classification", model="GroNLP/mdebertav3-subjectivity-multilingual", device=device)

framing_model = "cross-encoder/nli-deberta-v3-small"
classifier = pipeline(
    "zero-shot-classification", 
    model=framing_model, 
    device=device
)

Device set to use cuda
Device set to use cuda
Device set to use cuda
Device set to use cuda


In [11]:
hypothesis_template = "The perspective of this news article is {}."
framing_map = {
    "Corporate & Markets": 1.0,
    "Social Impact & Labor": -1.0,
    "Neutral/Reporting": 0.0,
    "Non-Economic": 0.0
}

sentiment_map = {
    0: -1,  # Negative
    1: 0,   # Neutral
    2: 1    # Positive
}

bias_map_norm = {
    "Neutral": 0.0,
    "Slightly Biased": 0.5,
    "Highly Biased": 1.0
}

In [12]:
def analyse_metadata(batch):
    texts = batch["description"]
    
    sent_res = sentiment_pipe(texts, truncation=True, max_length=512)
    batch["sentiment_norm"] = [sentiment_map[int(r['label'].split('_')[1])] for r in sent_res]
    
    bias_res = bias_pipe(texts, truncation=True, max_length=512)
    batch["bias_norm"] = [bias_map_norm[r['label']] for r in bias_res]
    
    subj_res = subjectivity_pipe(texts, truncation=True, max_length=512)
    batch["subjectivity_norm"] = [1 if r['label'] == "SUBJ" else 0 for r in subj_res]
    
    candidate_labels = list(framing_map.keys())
    
    framing_res = classifier(
            texts, 
            candidate_labels=candidate_labels, 
            multi_label=False,
            hypothesis_template=hypothesis_template,
            truncation=True, max_length=512
        )
    
    batch["framing_score"] = [framing_map[res['labels'][0]] for res in framing_res]
    
    return batch

In [13]:
combined_dataset = combined_dataset.map(analyse_metadata, batched=True, batch_size=1024)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [14]:
model.to(device)

def embed_text(batch):
    embeddings = model.encode(batch["description"], show_progress_bar=True)
    return {"embeddings": embeddings}

In [15]:
combined_dataset = combined_dataset.map(embed_text, batched=True, batch_size=512)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [16]:
def create_full_feature_vector(example):
    text_emb = np.array(example["embeddings"])
    
    meta_features = np.array([
        example["sentiment_norm"],
        example["bias_norm"],
        example["subjectivity_norm"],
        example["framing_score"]
    ])
    
    example["full_feature_vector"] = np.concatenate([text_emb, meta_features]).tolist()
    return example

In [None]:
print(f"Embedding for row 0: {combined_dataset[0]['embeddings'][:5]}...") 
print(f"Vector size: {len(combined_dataset[0]['embeddings'])}")

Embedding for row 0: [0.01201203279197216, 0.006566411349922419, 0.0575903058052063, 0.06646198779344559, 0.05520528927445412]...
Vector size: 384


In [18]:
combined_dataset = combined_dataset.map(create_full_feature_vector)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [19]:
sample_vector = combined_dataset[0]["full_feature_vector"]
print(f"Vector Length: {len(sample_vector)}") # Should be 388
print(f"First 3 values (Embeddings): {sample_vector[:3]}")
print(f"Last 4 values (Metadata): {sample_vector[-4:]}")

Vector Length: 388
First 3 values (Embeddings): [0.01201203279197216, 0.006566411349922419, 0.0575903058052063]
Last 4 values (Metadata): [0.0, 1.0, 0.0, 1.0]


In [20]:
combined_dataset[0]


{'label': 3,
 'title': 'Wall St. Bears Claw Back Into the Black (Reuters)',
 'description': "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label_description': 'business',
 'sentiment_norm': 0,
 'bias_norm': 1.0,
 'subjectivity_norm': 0,
 'framing_score': 1.0,
 'embeddings': [0.01201203279197216,
  0.006566411349922419,
  0.0575903058052063,
  0.06646198779344559,
  0.05520528927445412,
  -0.015975097194314003,
  0.011482849717140198,
  -0.017920957878232002,
  0.010438605211675167,
  -0.05571218207478523,
  -0.006151342298835516,
  0.0657198503613472,
  -0.025898512452840805,
  -0.033229995518922806,
  0.04263618588447571,
  0.025235211476683617,
  0.00340402964502573,
  0.005453020799905062,
  -0.09407653659582138,
  -0.06103198230266571,
  -0.09746859967708588,
  -0.06216021254658699,
  -0.035968199372291565,
  0.0008852744358591735,
  -0.03494129702448845,
  0.03607833385467529,
  -0.047584112733602524,
  0.016297725960612297,
  

In [None]:
combined_dataset.push_to_hub("mkita/topic-discovery-for-news-articles", split="train")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/688 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mkita/topic-discovery-for-news-articles/commit/ac3c79c5fed6f6bc8fe960cc45790d2374590e4d', commit_message='Upload dataset', commit_description='', oid='ac3c79c5fed6f6bc8fe960cc45790d2374590e4d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mkita/topic-discovery-for-news-articles', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mkita/topic-discovery-for-news-articles'), pr_revision=None, pr_num=None)

The dataset should be publicly available: https://huggingface.co/datasets/mkita/topic-discovery-for-news-articles