In [1]:
import pandas as pd
from transformers import pipeline
from datasets import load_dataset
from tqdm.auto import tqdm
import torch
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the sentiment-annotated news parquet; the loaded data is taken from the 'train' split.
parquet_splits = load_dataset(
  'parquet',
  data_files="../data/5.cryptonews_sentiment.parquet",
)
dataset = parquet_splits['train']
print(dataset)

Dataset({
    features: ['title', 'text', 'source_name', 'date', 'tanalysis_absa', 'economy_absa', 'regulation_absa', 'technology_absa', 'adoption_absa', 'cybersecurity_absa', 'NER', 'news_text_title', 'sentiment_CryptoBERT', 'score_CryptoBERT', 'sentiment_finbert', 'score_finbert', 'index'],
    num_rows: 159486
})


In [3]:
# Debug aid: uncomment to keep only the first 200 rows instead of the full dataset.
# dataset = dataset.select(range(200))

In [4]:
# Topic labels offered to the zero-shot classifier.
# Each string is ONE label: the commas inside a string are part of that label's text.
candidate_labels = [
  "whale activity, large transaction",
  "market sentiment",
  "liquidity, volatility, risk",
  "price action, price movement, trading",
  "exchange traded funds, ETF",
  "institutional investments",
  "treasury, reserve",
  "stablecoin, CBDC",
  "government policy and regulations",
  "legal proceedings",
  "licensing approval",
  "enforcement actions",
  "mining technology, mining difficulty, halving",
  "consensus updates",
  "security protocols",
  "quantum developments",
  "technical infrastructure",
  "nft, metaverse, virtual world",
  "social platforms",
  "sustainability, renewable, environmental",
  "interest rates, economic outlook",
  "product launches",
  "corporate partnerships",
]

# Zero-shot classification pipeline built on the facebook/bart-large-mnli checkpoint.
pipe = pipeline(
  task="zero-shot-classification",
  model="facebook/bart-large-mnli",
)

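# Earlier approach, kept for reference: classify through `dataset.map` without saving each batch to disk.
# def classify_topic(batch):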
#   results = pipe(batch['news_text_title'], candidate_labels)
#   topics = [result['labels'][0] for result in results]
#   scores = [result['scores'][0] for result in results]
#   batch['topic'] = topics
#   batch['topic_confidence_score'] = scores
#   return batch
# results = dataset.map(classify_topic, batched=True, batch_size=16)

def classify_topic(batch, output_file):
  """Classify one batch of news texts by topic and append the results to a JSONL file.

  Args:
    batch: dict mapping column names to lists of values; the texts are read
      from batch['news_text_title'].
    output_file: path of the JSON-lines file the results are appended to, one
      {"topic", "topic_confidence_score"} object per input text.

  Returns:
    The same `batch` dict, with 'topic' (first label of each result) and
    'topic_confidence_score' (first score of each result) added.
  """
  texts = batch['news_text_title']
  if not texts:
    # Nothing to classify: skip the model call and leave the output file untouched.
    batch['topic'] = []
    batch['topic_confidence_score'] = []
    return batch

  # process batch
  results = pipe(texts, candidate_labels)
  if isinstance(results, dict):
    # NOTE(review): some pipeline versions may return a bare dict for a single
    # sequence; normalise so the comprehensions below always see a list.
    results = [results]
  # Entry 0 of 'labels'/'scores' is taken as the predicted topic and its score.
  batch['topic'] = [result['labels'][0] for result in results]
  batch['topic_confidence_score'] = [result['scores'][0] for result in results]

  # save batch
  try:
    # Serialise the whole batch first and write it in one call, so a failure
    # while encoding cannot leave a half-written batch in the file.
    payload = ''.join(
      json.dumps({'topic': topic, 'topic_confidence_score': score}) + '\n'
      for topic, score in zip(batch['topic'], batch['topic_confidence_score'])
    )
    with open(output_file, 'a', encoding='utf-8') as f:
      f.write(payload)
  except Exception as e:
    # NOTE(review): best-effort on purpose, but a batch that fails to save leaves
    # the JSONL short, and anything joining it back by row position will then be
    # misaligned. Check the row count after the run.
    print(f"Error saving batch: {e}")

  return batch

output_file = '../data/6.topic_classification.jsonl'

BATCH_SIZE = 24

# Results are appended to `output_file`, so a plain re-run of this cell would
# write every row a second time. Count the rows already on disk and continue
# after them; this also lets an interrupted run pick up where it stopped.
rows_done = 0
if os.path.exists(output_file):
  with open(output_file, 'r', encoding='utf-8') as f:
    rows_done = sum(1 for line in f if line.strip())
if rows_done > len(dataset):
  raise RuntimeError(
    f"{output_file} already holds {rows_done} rows but the dataset has only "
    f"{len(dataset)}; delete the file to start over"
  )

# Batch count for the full dataset (name kept in case other cells use it).
total_batches = (len(dataset) + BATCH_SIZE-1) // BATCH_SIZE
# Start offsets of the batches that still need to be classified.
batch_starts = range(rows_done, len(dataset), BATCH_SIZE)
for i in tqdm(batch_starts, total=len(batch_starts), desc="Processing batches"):
  batch = dataset[i:i+BATCH_SIZE]
  processed_batch = classify_topic(batch, output_file)

Device set to use cuda:0
Processing batches:   0%|          | 10/6646 [02:04<23:11:34, 12.58s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing batches: 100%|██████████| 6646/6646 [22:39:43<00:00, 12.28s/it]   


In [5]:
# Read the JSON-lines topic predictions back into a DataFrame (one row per line).
output_file = '../data/6.topic_classification.jsonl'
df_output = pd.read_json(output_file, lines=True)

n_rows, n_cols = df_output.shape
print((n_rows, n_cols))
df_output

(159486, 2)


Unnamed: 0,topic,topic_confidence_score
0,product launches,0.508901
1,"price action, price movement, trading",0.443893
2,"price action, price movement, trading",0.647850
3,"price action, price movement, trading",0.355922
4,"price action, price movement, trading",0.516643
...,...,...
159481,"price action, price movement, trading",0.335582
159482,"price action, price movement, trading",0.334338
159483,consensus updates,0.202182
159484,"price action, price movement, trading",0.736547


In [6]:
# Number of rows assigned to each topic.
topic_counts = df_output['topic'].value_counts()
topic_counts

topic
price action, price movement, trading            88702
market sentiment                                 13286
exchange traded funds, ETF                       12420
consensus updates                                 6742
liquidity, volatility, risk                       5456
legal proceedings                                 4142
technical infrastructure                          3740
product launches                                  3593
institutional investments                         3590
social platforms                                  3504
whale activity, large transaction                 3029
government policy and regulations                 1782
sustainability, renewable, environmental          1623
nft, metaverse, virtual world                     1156
treasury, reserve                                 1093
enforcement actions                               1069
mining technology, mining difficulty, halving     1017
security protocols                                1012
corp

In [7]:
# Reload the source articles with pandas and give them a fresh 0..n-1 index.
df = pd.read_parquet("../data/5.cryptonews_sentiment.parquet").reset_index(drop=True)
print(df.shape)
df.head()

(159486, 16)


Unnamed: 0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,NER,news_text_title,sentiment_CryptoBERT,score_CryptoBERT,sentiment_finbert,score_finbert
0,VanEck makes another Bitcoin ETF run,Investment management firm VanEck has filed wi...,Modern Consensus,"Fri, 01 Jan 2021 11:07:58 -0500",0.0,0.0,0.3,0.0,0.7,0.0,"[[VanEck, ORG], [VanEck, ORG], [the U.S. Secur...",VanEck makes another Bitcoin ETF run Investmen...,LABEL_1,0.953463,neutral,0.704972
1,Markets Report: Bitcoin starts 2021 with a ban...,Bitcoin begins a new year on a high in almost ...,Modern Consensus,"Fri, 01 Jan 2021 09:51:10 -0500",0.9,0.0,0.0,0.0,0.5,0.0,"[[2021, DATE], [500,000, MONEY], [a new year, ...",Markets Report: Bitcoin starts 2021 with a ban...,LABEL_1,0.966029,positive,0.927262
2,Bitcoin Rally Extends Into New Year,Jan.03 -- Bitcoin is continuing its rally into...,Bloomberg Markets and Finance,"Sun, 03 Jan 2021 18:46:20 -0500",0.8,0.0,0.0,0.0,0.0,0.0,"[[New Year, EVENT], [Jan.03 --, DATE], [the ne...",Bitcoin Rally Extends Into New Year Jan.03 -- ...,LABEL_1,0.957252,positive,0.817353
3,"BitPay CCO Says Bitcoin Will Hit $45,000",Jan.04 -- Bitpay Chief Commercial Officer Sonn...,Bloomberg Technology,"Mon, 04 Jan 2021 20:29:35 -0500",0.8,0.0,0.0,0.0,0.0,0.0,"[[BitPay CCO, ORG], [45,000, MONEY], [Jan.04, ...","BitPay CCO Says Bitcoin Will Hit $45,000 Jan.0...",LABEL_1,0.91419,neutral,0.362224
4,"Markets Report: Bitcoin Dives to $27,700 as a ...",Bitcoin is seeing a wild start to the week wit...,Modern Consensus,"Mon, 04 Jan 2021 09:01:49 -0500",-0.8,0.0,0.0,0.0,0.0,0.0,"[[27,700, MONEY], [a Crazy Weekend, DATE], [th...","Markets Report: Bitcoin Dives to $27,700 as a ...",LABEL_0,0.917384,negative,0.932793


In [8]:
# concat with the classified topics
# pd.concat(axis=1) aligns rows on the index, so a row-count mismatch would
# silently produce NaN-padded rows. Fail loudly instead.
assert len(df) == len(df_output), (
  f"row count mismatch: {len(df)} articles vs {len(df_output)} topic rows"
)
# Drop topic columns left over from an earlier run of this cell, so that
# re-running it does not add a second copy of them.
df = pd.concat(
  [df.drop(columns=list(df_output.columns), errors='ignore'), df_output],
  axis=1,
)
print(df.shape)
df.head()

(159486, 18)


Unnamed: 0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,NER,news_text_title,sentiment_CryptoBERT,score_CryptoBERT,sentiment_finbert,score_finbert,topic,topic_confidence_score
0,VanEck makes another Bitcoin ETF run,Investment management firm VanEck has filed wi...,Modern Consensus,"Fri, 01 Jan 2021 11:07:58 -0500",0.0,0.0,0.3,0.0,0.7,0.0,"[[VanEck, ORG], [VanEck, ORG], [the U.S. Secur...",VanEck makes another Bitcoin ETF run Investmen...,LABEL_1,0.953463,neutral,0.704972,product launches,0.508901
1,Markets Report: Bitcoin starts 2021 with a ban...,Bitcoin begins a new year on a high in almost ...,Modern Consensus,"Fri, 01 Jan 2021 09:51:10 -0500",0.9,0.0,0.0,0.0,0.5,0.0,"[[2021, DATE], [500,000, MONEY], [a new year, ...",Markets Report: Bitcoin starts 2021 with a ban...,LABEL_1,0.966029,positive,0.927262,"price action, price movement, trading",0.443893
2,Bitcoin Rally Extends Into New Year,Jan.03 -- Bitcoin is continuing its rally into...,Bloomberg Markets and Finance,"Sun, 03 Jan 2021 18:46:20 -0500",0.8,0.0,0.0,0.0,0.0,0.0,"[[New Year, EVENT], [Jan.03 --, DATE], [the ne...",Bitcoin Rally Extends Into New Year Jan.03 -- ...,LABEL_1,0.957252,positive,0.817353,"price action, price movement, trading",0.64785
3,"BitPay CCO Says Bitcoin Will Hit $45,000",Jan.04 -- Bitpay Chief Commercial Officer Sonn...,Bloomberg Technology,"Mon, 04 Jan 2021 20:29:35 -0500",0.8,0.0,0.0,0.0,0.0,0.0,"[[BitPay CCO, ORG], [45,000, MONEY], [Jan.04, ...","BitPay CCO Says Bitcoin Will Hit $45,000 Jan.0...",LABEL_1,0.91419,neutral,0.362224,"price action, price movement, trading",0.355922
4,"Markets Report: Bitcoin Dives to $27,700 as a ...",Bitcoin is seeing a wild start to the week wit...,Modern Consensus,"Mon, 04 Jan 2021 09:01:49 -0500",-0.8,0.0,0.0,0.0,0.0,0.0,"[[27,700, MONEY], [a Crazy Weekend, DATE], [th...","Markets Report: Bitcoin Dives to $27,700 as a ...",LABEL_0,0.917384,negative,0.932793,"price action, price movement, trading",0.516643


In [10]:
# Remove the concatenated title+text column before saving.
# errors='ignore' makes the cell safe to re-run: the in-place drop raised
# KeyError once the column was already gone.
df = df.drop(columns=['news_text_title'], errors='ignore')

In [13]:
# Check the frame's shape and inspect its last five rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.tail(n=5)

(159486, 17)


Unnamed: 0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,NER,sentiment_CryptoBERT,score_CryptoBERT,sentiment_finbert,score_finbert,topic,topic_confidence_score
159481,Bitcoin remains volatile despite $103k milesto...,SkyBridge Capital founder Anthony Scaramucci h...,Crypto news,"Sat, 07 Dec 2024 14:17:59 -0500",-0.3,0.0,-0.7,0.0,0.0,0.0,"[[$103k, MONEY], [Scaramucci, PERSON], [SkyBri...",LABEL_1,0.961925,neutral,0.511942,"price action, price movement, trading",0.335582
159482,Mt. Gox Bitcoin Payout Triggers Major Transfer...,Mt. Gox cryptocurrency exchange has triggered ...,The Currency Analytics,"Sat, 07 Dec 2024 13:48:19 -0500",0.1,0.0,0.0,0.0,0.1,0.0,"[[Mt. Gox, ORG], [Mt. Gox, ORG], [27,871 BTC, ...",LABEL_1,0.962343,neutral,0.543639,"price action, price movement, trading",0.334338
159483,US Treasury Acknowledges Bitcoin as ‘Digital G...,The U.S. Treasury's Fiscal Year 2024 Q4 Report...,Bitcoin,"Sat, 07 Dec 2024 13:30:41 -0500",1.0,0.5,0.0,0.0,0.5,0.0,"[[US Treasury, ORG], [The U.S. Treasury's, ORG...",LABEL_1,0.965928,positive,0.663188,consensus updates,0.202182
159484,"Can Bitcoin, Ethereum, Dogecoin, Shiba Inu or ...",The prices of cryptocurrencies have soared in ...,Benzinga,"Sat, 07 Dec 2024 13:30:17 -0500",-0.5,0.0,0.0,0.0,-0.3,0.0,"[[Dogecoin, PRODUCT], [Solana, PRODUCT], [2025...",LABEL_1,0.962493,positive,0.691946,"price action, price movement, trading",0.736547
159485,"MicroStrategy hits $40b in Bitcoin, critics qu...",A prominent investment expert has raised conce...,Crypto news,"Sat, 07 Dec 2024 13:28:56 -0500",-0.3,0.0,0.0,0.0,0.2,0.0,"[[MicroStrategy, ORG], [40b, MONEY], [Saylor, ...",LABEL_0,0.695217,negative,0.777374,"liquidity, volatility, risk",0.176538


In [14]:
# Write the frame, including the topic columns, to parquet.
df.to_parquet(path="../data/6b.cryptonews_topic.parquet")