In [1]:
import pandas as pd
import os
import json
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
df = pd.read_parquet('../data/3b.cryptonews_absa.parquet')
print(df.shape)
df.head(2)

(155376, 10)


Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0


In [3]:
# os.environ['deepseek_api_key'] = 'YOUR_KEY'
cryptonews_api_key = os.getenv('deepseek_api_key')
client = OpenAI(api_key=cryptonews_api_key, base_url="https://api.deepseek.com")

system_prompt = """
You are a cryptocurrency news expert.
Your task is to perform Named-Entity Recognition (NER) on text related to the cryptocurrency industry.
Specifically, identify and extract the following entities:

1. **People**: Famous individuals, founders, CEOs, and influential figures in the crypto space.
2. **Organizations**: Top companies, exchanges, blockchain projects, and institutions in the cryptocurrency industry.
3. **Cryptocurrencies**: Names of cryptocurrencies, tokens, and digital assets.
4. **Events**: Major events, conferences, or milestones in the crypto world.

Provide the extracted entities in JSON format, ensure accuracy and relevance to the cryptocurrency domain.
Example JSON output:
{
  "People": ["Elon Musk", "Changpeng Zhao"],
  "Organizations": ["Microstrategy", "Meta"],
  "Cryptocurrencies": ["Bitcoin"],
  "Events": ["Bitcoin Halving"],
}
"""

def analyze_aspects(example):
  try:
    user_prompt = f"Title: {example['title']}\nText: {example['text']}"
    messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt}
    ]
    response = client.chat.completions.create(
      model="deepseek-chat",
      messages=messages,
      response_format={'type': 'json_object'}
    )
    return {"ner": json.loads(response.choices[0].message.content)}
  except Exception as e:
    print(f"Error processing example: {example['title']}\n{e}")
    return {"ner": None}

def analyze_aspects_parallel(df, max_workers=64):
  with open("data/4.deepseek_ner_output.json", "a") as file, ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_index = {
      executor.submit(analyze_aspects, row): idx
      for idx, row in df.iterrows()
    }
    for future in tqdm(as_completed(future_to_index), total=len(future_to_index)):
      idx = future_to_index[future]
      try:
        result = future.result()
        file.write(json.dumps({"index": idx, "ner": result["ner"]}) + "\n")
      except Exception as e:
        print(f"Error processing result: {e}")
        file.write(json.dumps({"index": idx, "ner": None}) + "\n")

analyze_aspects_parallel(df)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

---------------------------------------------

In [None]:
df_output = pd.read_json("data/4.deepseek_ner_output.json", lines=True)
df_output.set_index('index', inplace=True)
df_output.head()

Unnamed: 0_level_0,ner
index,Unnamed: 1_level_1
23,"{'People': [], 'Organizations': [], 'Cryptocur..."
34,"{'People': ['Pentoshi'], 'Organizations': ['Th..."
56,"{'People': ['Sally Ho'], 'Organizations': [], ..."
44,"{'People': [], 'Organizations': [], 'Cryptocur..."
66,"{'People': [], 'Organizations': [], 'Cryptocur..."


In [None]:
df = pd.merge(df, df_output, left_index=True, right_on="index")
df.head(3)

Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,ner
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0,"{'People': [], 'Organizations': [], 'Cryptocur..."
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0,"{'People': ['Krüger'], 'Organizations': ['Cent..."
2,"Bitcoin Is Digital Social Justice, feat. Tyron...",The podcaster and CEO of Onramp Invest discuss...,Coindesk,2021-01-01 19:15:02+00:00,0.0,0.3,0.0,0.5,0.7,0.0,"{'People': ['Tyrone Ross'], 'Organizations': [..."


In [None]:
df['ner'][25]

{'People': ['Michael Saylor'],
 'Organizations': ['MicroStrategy', 'Funky Crypto Podcast', 'The Daily Hodl'],
 'Cryptocurrencies': ['Bitcoin', 'BTC'],
 'Events': []}

In [None]:
entity_types = ['People', 'Organizations', 'Cryptocurrencies', 'Events']
for entity in entity_types:
  df[entity] = df['ner'].apply(lambda x: x.get(entity, []) if x else [])

df.drop(columns=['ner'], inplace=True)

In [None]:
df.head(3)

Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,People,Organizations,Cryptocurrencies,Events
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0,[],[],[Bitcoin],[]
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0,[Krüger],[Central banks],[Bitcoin],[]
2,"Bitcoin Is Digital Social Justice, feat. Tyron...",The podcaster and CEO of Onramp Invest discuss...,Coindesk,2021-01-01 19:15:02+00:00,0.0,0.3,0.0,0.5,0.7,0.0,[Tyrone Ross],[Onramp Invest],[Bitcoin],[]


In [None]:
df.to_parquet("../data/4b.cryptonews_ner.parquet")

In [None]:
##################   Analysis   ##################

In [None]:
people_freq = df['People'].explode().value_counts()
people_freq.head(20)

People
Michael Saylor          2467
Elon Musk               1670
Donald Trump            1615
Peter Schiff             969
Sally Ho                 940
Cathie Wood              928
Jack Dorsey              879
Nayib Bukele             861
Robert Kiyosaki          815
Satoshi Nakamoto         790
Arthur Hayes             594
Peter Brandt             553
Mike Novogratz           543
Jerome Powell            511
Gary Gensler             499
Anthony Scaramucci       455
Trump                    452
Michaël van de Poppe     377
Mike McGlone             371
Benjamin Cowen           306
Name: count, dtype: int64

In [None]:
people_freq = df['Organizations'].explode().value_counts()
people_freq.head(20)

Organizations
SEC                       4413
MicroStrategy             3785
BlackRock                 3301
Binance                   2194
Coinbase                  2174
Grayscale                 2114
The Daily Hodl            1823
Federal Reserve           1630
Tesla                     1622
CNBC                      1577
CryptoQuant               1432
Fed                       1403
CryptoSlate               1356
FTX                       1330
Glassnode                 1253
Bloomberg                 1211
El Salvador               1171
The Block                 1119
BeInCrypto                1068
The Currency Analytics     986
Name: count, dtype: int64

In [None]:
people_freq = df['Cryptocurrencies'].explode().value_counts()
people_freq.head(20)

Cryptocurrencies
Bitcoin      145001
BTC           51248
Ethereum      13063
ETH            5172
XRP            3227
bitcoin        2759
Dogecoin       2443
Solana         1763
Ether          1455
SOL            1392
DOGE           1291
ADA            1215
BNB            1214
Cardano         882
BTC/USD         859
Shiba Inu       628
SHIB            566
DOT             546
USDT            543
AVAX            422
Name: count, dtype: int64

In [21]:
event_freq = df['Events'].explode().value_counts()
event_freq.head(20)

Events
Bitcoin Halving                                        2980
Bitcoin halving                                         414
Bitcoin ETF approval                                    386
Bitcoin ETF Approval                                    325
Bitcoin ETF                                             316
2021 Berkshire Hathaway Annual Shareholders Meeting     264
FOMC meeting                                            193
crypto winter                                           177
Spot Bitcoin ETF Approval                               173
spot Bitcoin ETF approval                               164
Bitcoin ETFs                                            158
FTX collapse                                            154
Uptober                                                 109
U.S. presidential election                               95
Spot Bitcoin ETF approval                                92
The Merge                                                91
US election                      

In [None]:
df = pd.read_parquet("../data/4b.cryptonews_ner.parquet")
entity_types = ['Events']
top_entities = {}

for entity in entity_types:
    top_entities[entity] = df[entity].explode().value_counts().head(20).index.tolist()

# Step 2: Create columns for each top entity and populate with binary values
for entity, top_values in top_entities.items():
    for value in top_values:
        column_name = f"{entity}_{value.replace(' ', '_')}"  # Replace spaces with underscores for column names
        df[column_name] = df[entity].apply(lambda x: 1 if value in x else 0)

# Step 3: Verify the new columns
df.head()

Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,...,Events_Bitcoin_ETFs,Events_FTX_collapse,Events_Uptober,Events_U.S._presidential_election,Events_Spot_Bitcoin_ETF_approval,Events_The_Merge,Events_US_election,Events_approval_of_spot_Bitcoin_ETFs,Events_Bitcoin_2024_conference,Events_U.S._election
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0,...,0,0,0,0,0,0,0,0,0,0
2,"Bitcoin Is Digital Social Justice, feat. Tyron...",The podcaster and CEO of Onramp Invest discuss...,Coindesk,2021-01-01 19:15:02+00:00,0.0,0.3,0.0,0.5,0.7,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Bitcoin hits all-time high against gold as hav...,"BTC has hit another milestone, this time again...",Cointelegraph,2021-01-01 18:52:00+00:00,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,"The Last Time This Indicator Flashed, Bitcoin ...",Bitcoin has been facing some turbulence as of ...,Bitcoinist,2021-01-01 18:00:00+00:00,0.3,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


: 

In [None]:
# Convert the 'date' column to datetime format, remove timezone, and round to the nearest hour
df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None)  # Remove timezone
df['date'] = df['date'].dt.floor('h')  # Round down to the nearest hour

# Get the top 20 most frequent values in the 'Events' column
top_20_events = df['Events'].explode().value_counts().head(20).index.tolist()

# Filter the 'Events' column to only include the top 20 values
df['Filtered_Events'] = df['Events'].apply(lambda events: [event for event in events if event in top_20_events])

# Group by the 'date' column and count occurrences of the top 20 events
df_grouped = df.groupby('date').apply(
    lambda group: pd.Series({
        event: group['Filtered_Events'].explode().value_counts().get(event, 0) for event in top_20_events
    })
).reset_index()

# Set 'date' as the index and rename it to 'date_hour'
df_grouped.set_index('date', inplace=True)
df_grouped.index.name = 'date_hour'

# Display the grouped DataFrame
df_grouped.head()

# Optionally, save the grouped DataFrame to a file
df_grouped.to_parquet("../data/4d.cryptonews_top20_events_grouped.parquet")

  df['date'] = df['date'].dt.floor('H')  # Round down to the nearest hour
  df_grouped = df.groupby('date').apply(


In [19]:
df_grouped


Unnamed: 0_level_0,Bitcoin Halving,Bitcoin halving,Bitcoin ETF approval,Bitcoin ETF Approval,Bitcoin ETF,2021 Berkshire Hathaway Annual Shareholders Meeting,FOMC meeting,crypto winter,Spot Bitcoin ETF Approval,spot Bitcoin ETF approval,Bitcoin ETFs,FTX collapse,Uptober,U.S. presidential election,Spot Bitcoin ETF approval,The Merge,US election,approval of spot Bitcoin ETFs,Bitcoin 2024 conference,U.S. election
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-01-01 06:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2021-01-01 08:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2021-01-01 09:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2021-01-01 10:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2021-01-01 11:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-01 00:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-01 01:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-01 02:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-01 03:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
df.to_parquet("../data/4c.cryptonews_ner.parquet")