In [1]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter

In [2]:
# Load the three input tables (descriptions follow the file names):
# hourly BTC/USD price indicators, topic-tagged crypto news, Reddit sentiment.
df_price, df_news, df_reddit = (
  pd.read_parquet(path)
  for path in (
    "../../technical_indicators/btcusd_hourly_price_indicators.parquet",
    "../../crypto_news/data/6b.cryptonews_topic.parquet",
    "../../reddit/reddit_sentiment_processed.parquet.gzip",
  )
)

### preprocess df_news

In [3]:
# preprocess df_news

# remove the original title / body text columns
df_news = df_news.drop(columns=['title', 'text'])

# parse the publication timestamps as timezone-aware (UTC) datetimes
df_news['date'] = pd.to_datetime(df_news['date'], utc=True)

# one-hot encode the news source: the 15 most frequent sources keep their own
# name, every other source is pooled under the single label 'Other'
top_sources = df_news['source_name'].value_counts().nlargest(15).index
is_top_source = df_news['source_name'].isin(top_sources)
df_news['source_name'] = np.where(is_top_source, df_news['source_name'], 'Other')
df_news = pd.get_dummies(df_news, columns=['source_name'], prefix='source')

# preprocess CryptoBERT and FinBERT sentiment
def _encode_sentiment(labels, scores, positive_label, negative_label, min_score):
  """Collapse a (label, confidence score) column pair into a signed integer flag.

  Parameters
  ----------
  labels : pandas.Series
    Model label of each article.
  scores : pandas.Series
    Confidence score belonging to each label.
  positive_label, negative_label : str
    Label values that are mapped to +1 and -1 respectively.
  min_score : float
    A label only counts when its score is strictly greater than this value.

  Returns
  -------
  numpy.ndarray
    Integer array aligned with `labels`: 1 for a confident positive label,
    -1 for a confident negative label and 0 for everything else
    (low-confidence rows, neutral rows, missing or unexpected labels).
  """
  confident = scores > min_score
  return np.select(
    [(labels == positive_label) & confident, (labels == negative_label) & confident],
    [1, -1],
    default=0,
  )


# CryptoBERT: "LABEL_1" -> 1, "LABEL_0" -> -1, both only above a score of 0.9
df_news['sentiment_CryptoBERT'] = _encode_sentiment(
  df_news['sentiment_CryptoBERT'], df_news['score_CryptoBERT'],
  positive_label="LABEL_1", negative_label="LABEL_0", min_score=0.9,
)

# FinBERT: "positive" -> 1, "negative" -> -1, both only above a score of 0.75;
# "neutral" -> 0. Building the column in one pass yields a plain integer column
# instead of writing integers into the original string column slice by slice,
# and any label outside the three expected values becomes 0 rather than
# surviving as a raw string.
df_news['sentiment_finbert'] = _encode_sentiment(
  df_news['sentiment_finbert'], df_news['score_finbert'],
  positive_label="positive", negative_label="negative", min_score=0.75,
)

# the raw confidence scores are no longer needed once the flags are built
df_news = df_news.drop(columns=['score_CryptoBERT', 'score_finbert'])

# preprocess topic
# topic assignments with a confidence score below 0.4 are relabelled 'others'
low_confidence = df_news['topic_confidence_score'] < 0.4
df_news['topic'] = np.where(low_confidence, 'others', df_news['topic'])

# keep the 5 most frequent labels (counted after the relabelling above, so
# 'others' competes like any other label), pool the remaining labels under
# 'Other', then one-hot encode and discard the confidence score
top_topics = df_news['topic'].value_counts().nlargest(5).index
df_news['topic'] = df_news['topic'].where(df_news['topic'].isin(top_topics), 'Other')
df_news = (
  pd.get_dummies(df_news, columns=['topic'], prefix='topic')
  .drop(columns=['topic_confidence_score'])
)

# preprocess NER
stopword_set = set(stopwords.words('english'))
# English stopwords plus a few extra generic terms are excluded from the entities
ignored_entities = stopword_set | {"first", "one", "daily", "today", "2024", "two", "this week"}
punctuation = re.compile(r'[^\w\s]')


def _clean_entities(entities):
  """Return the normalised entity names of one article.

  `entities` is expected to be a list / array whose items carry the entity
  text at position 0; any other value (e.g. a missing cell) gives an empty
  list. Each name has punctuation removed and is lower-cased; names of one
  character or less and names in `ignored_entities` are dropped.
  """
  if not isinstance(entities, (list, np.ndarray)):
    return []
  names = (punctuation.sub('', entity[0]).lower() for entity in entities)
  return [name for name in names if len(name) > 1 and name not in ignored_entities]


df_news['NER'] = df_news['NER'].apply(_clean_entities)

# restrict every article to the 10 most frequently mentioned entities overall
entity_counts = Counter(name for names in df_news['NER'] for name in names)
top_entities = entity_counts.most_common(10)
top_entities_tokens = [entity for entity, _ in top_entities]
df_news['NER'] = df_news['NER'].apply(
  lambda names: [name for name in names if name in top_entities_tokens]
)

# multi-hot encode the remaining entities into NER_<entity> indicator columns
mlb = MultiLabelBinarizer()
ner_matrix = mlb.fit_transform(df_news['NER'])
ner_encoded = pd.DataFrame(
  ner_matrix,
  index=df_news.index,
  columns=[f'NER_{ent}' for ent in mlb.classes_],
)
df_news = pd.concat([df_news, ner_encoded], axis=1)

print(df_news.shape)
df_news.head()

(159486, 42)


Unnamed: 0,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,NER,sentiment_CryptoBERT,sentiment_finbert,...,NER_binance,NER_blackrock,NER_coinbase,NER_el salvador,NER_fed,NER_grayscale,NER_michael saylor,NER_microstrategy,NER_sec,NER_us
0,2021-01-01 16:07:58+00:00,0.0,0.0,0.3,0.0,0.7,0.0,[sec],1,0,...,0,0,0,0,0,0,0,0,1,0
1,2021-01-01 14:51:10+00:00,0.9,0.0,0.0,0.0,0.5,0.0,[],1,1,...,0,0,0,0,0,0,0,0,0,0
2,2021-01-03 23:46:20+00:00,0.8,0.0,0.0,0.0,0.0,0.0,[],1,1,...,0,0,0,0,0,0,0,0,0,0
3,2021-01-05 01:29:35+00:00,0.8,0.0,0.0,0.0,0.0,0.0,[],1,0,...,0,0,0,0,0,0,0,0,0,0
4,2021-01-04 14:01:49+00:00,-0.8,0.0,0.0,0.0,0.0,0.0,[],-1,-1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# group the per-article features into hourly buckets


def _hour_has_flag(flags):
  """Return 1 if any article in the hourly bucket has the flag set, else 0.

  Empty buckets (hours without any article) also give 0, so the flag columns
  contain no NaN after aggregation.
  """
  return int(flags.eq(1).any())


# ABSA scores are averaged over the articles of each hour (NaN for empty hours)
absa_cols = [
  'tanalysis_absa',
  'economy_absa',
  'regulation_absa',
  'technology_absa',
  'adoption_absa',
  'cybersecurity_absa',
]

# The one-hot columns are discovered by prefix instead of being listed by name:
# which sources / topics / entities exist depends on the "top N" selections made
# during preprocessing, so a hard-coded list breaks as soon as the data changes.
# Column order follows the frame, i.e. sources, then topics, then entities.
source_cols = [col for col in df_news.columns if str(col).startswith('source_')]
bool_cols = [col for col in df_news.columns if str(col).startswith(('topic_', 'NER_'))]

# NOTE(review): sentiment_CryptoBERT / sentiment_finbert are not aggregated here
# and are therefore absent from the hourly frame -- confirm this is intended.
combined_agg = {
  'date': 'count',  # number of articles in the hour (renamed to news_count below)
  **{col: 'mean' for col in absa_cols},
  **{col: 'sum' for col in source_cols},       # articles per source
  **{col: _hour_has_flag for col in bool_cols},  # did the topic/entity occur at all
}

# The grouping key becomes the (UTC, hourly) index named 'date'; hours without
# news are kept as rows with news_count == 0.
df_news = (
  df_news
  .groupby(pd.Grouper(key='date', freq='h'))
  .agg(combined_agg)
  .rename(columns={'date': 'news_count'})
)

df_news

Unnamed: 0_level_0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,source_AMBCrypto,source_BeInCrypto,source_Benzinga,...,NER_binance,NER_blackrock,NER_coinbase,NER_el salvador,NER_fed,NER_grayscale,NER_michael saylor,NER_microstrategy,NER_sec,NER_us
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 06:00:00+00:00,2,0.850000,0.000,0.00,0.0,0.150,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2021-01-01 07:00:00+00:00,0,,,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2021-01-01 08:00:00+00:00,2,0.400000,0.000,0.00,0.0,0.500,0.0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
2021-01-01 09:00:00+00:00,2,-0.050000,0.000,0.00,0.0,0.400,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2021-01-01 10:00:00+00:00,3,0.766667,0.000,0.00,0.0,0.300,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-01 00:00:00+00:00,4,0.025000,0.175,0.30,0.0,0.325,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2025-01-01 01:00:00+00:00,1,0.800000,0.000,0.00,0.0,0.000,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2025-01-01 02:00:00+00:00,2,0.050000,0.000,0.00,0.0,0.250,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2025-01-01 03:00:00+00:00,2,0.500000,-0.100,0.00,0.0,0.000,0.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
# timestamp formatting: remove the timezone from the hourly index while keeping
# the wall-clock values, giving naive timestamps
df_news = df_news.tz_localize(None)
df_news

Unnamed: 0_level_0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,source_AMBCrypto,source_BeInCrypto,source_Benzinga,...,NER_binance,NER_blackrock,NER_coinbase,NER_el salvador,NER_fed,NER_grayscale,NER_michael saylor,NER_microstrategy,NER_sec,NER_us
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 06:00:00,2,0.850000,0.000,0.00,0.0,0.150,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2021-01-01 07:00:00,0,,,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2021-01-01 08:00:00,2,0.400000,0.000,0.00,0.0,0.500,0.0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
2021-01-01 09:00:00,2,-0.050000,0.000,0.00,0.0,0.400,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2021-01-01 10:00:00,3,0.766667,0.000,0.00,0.0,0.300,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-01 00:00:00,4,0.025000,0.175,0.30,0.0,0.325,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2025-01-01 01:00:00,1,0.800000,0.000,0.00,0.0,0.000,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2025-01-01 02:00:00,2,0.050000,0.000,0.00,0.0,0.250,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2025-01-01 03:00:00,2,0.500000,-0.100,0.00,0.0,0.000,0.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


### df_price

In [6]:
# Display the price / technical-indicator frame as loaded; this notebook applies
# no preprocessing to it before the merge.
df_price

Unnamed: 0_level_0,open,high,low,close,volume,return_forward,return,obv,sma_5,sma_20,...,rsi_7,rsi_14,atr_7,atr_14,upper_band_10,middle_band_10,lower_band_10,upper_band_20,middle_band_20,lower_band_20
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 00:00:00,28912.47,29052.03,28776.17,28949.03,365.064315,0.008612,0.000970,147510.239086,28736.930,28854.8905,...,57.159365,55.986099,422.886222,417.797020,29223.387286,28741.153,28258.918714,29307.198761,28854.8905,28402.582239
2021-01-01 01:00:00,28949.71,29200.00,28921.64,29198.34,520.772208,-0.007061,0.008612,148031.011293,28813.352,28858.0050,...,64.203215,59.942816,402.239619,407.837232,29240.874492,28744.982,28249.089508,29318.792740,28858.0050,28397.217260
2021-01-01 02:00:00,29200.00,29219.01,28947.70,28992.18,394.441884,0.005256,-0.007061,147636.569409,28932.070,28869.1935,...,55.413387,55.499724,383.535388,398.085287,29256.754515,28750.826,28244.897485,29331.599971,28869.1935,28406.787029
2021-01-01 03:00:00,28987.60,29191.60,28945.26,29144.56,220.722444,-0.005207,0.005256,147857.291854,29041.018,28889.6280,...,60.121363,57.978989,363.936047,387.246338,29351.781840,28798.405,28245.028160,29362.660536,28889.6280,28416.595464
2021-01-01 04:00:00,29135.25,29161.46,28843.74,28992.79,500.353472,0.002213,-0.005207,147356.938382,29055.380,28887.0185,...,53.550828,54.709601,357.333754,382.280171,29380.790455,28842.084,28303.377545,29357.161215,28887.0185,28416.875785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 19:00:00,93760.00,94158.00,93600.00,94008.00,7.863550,-0.006308,0.002666,175364.391806,94342.200,94002.9000,...,44.656824,49.600999,787.401340,594.021005,95801.067876,94516.700,93232.332124,95433.425204,94002.9000,92572.374796
2024-12-31 20:00:00,94025.00,94059.00,93300.00,93415.00,4.565475,0.002858,-0.006308,175359.826331,93968.200,93998.4000,...,34.760191,42.325843,783.344006,605.805219,95876.395307,94461.600,93046.804693,95435.711991,93998.4000,92561.088009
2024-12-31 21:00:00,93386.00,93777.00,93336.00,93682.00,28.484950,-0.004142,0.002858,175388.311280,93768.800,94007.2500,...,41.563036,46.155291,734.437720,594.033418,95898.271810,94421.000,92943.728190,95434.444994,94007.2500,92580.055006
2024-12-31 22:00:00,93703.00,93703.00,93227.00,93294.00,67.859345,0.000933,-0.004142,175320.451936,93631.400,93996.7000,...,35.319145,41.810713,697.518045,585.602459,95932.335823,94307.800,92683.264177,95441.599457,93996.7000,92551.800543


### preprocess df_reddit

In [7]:
# Display the Reddit sentiment frame as loaded ('date' is still a regular column here).
df_reddit

Unnamed: 0,date,total_sentiment,average_sentiment
0,2021-01-01 00:00:00,0.2966,0.015611
1,2021-01-01 01:00:00,1.6673,0.049038
2,2021-01-01 02:00:00,0.4588,0.021848
3,2021-01-01 03:00:00,-0.4329,-0.020614
4,2021-01-01 04:00:00,0.0000,0.000000
...,...,...,...
34857,2024-12-31 19:00:00,0.0000,0.000000
34858,2024-12-31 20:00:00,0.0000,0.000000
34859,2024-12-31 21:00:00,0.0000,0.000000
34860,2024-12-31 22:00:00,0.6312,0.028691


In [8]:
# Parse the timestamps, prefix the sentiment columns with 'reddit_' so their
# origin stays clear after merging, and index the frame by 'date'.
df_reddit = (
  df_reddit
  .assign(date=pd.to_datetime(df_reddit['date']))
  .rename(columns={
    'total_sentiment': 'reddit_total_sentiment',
    'average_sentiment': 'reddit_average_sentiment',
  })
  .set_index('date')
)
df_reddit

Unnamed: 0_level_0,reddit_total_sentiment,reddit_average_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01 00:00:00,0.2966,0.015611
2021-01-01 01:00:00,1.6673,0.049038
2021-01-01 02:00:00,0.4588,0.021848
2021-01-01 03:00:00,-0.4329,-0.020614
2021-01-01 04:00:00,0.0000,0.000000
...,...,...
2024-12-31 19:00:00,0.0000,0.000000
2024-12-31 20:00:00,0.0000,0.000000
2024-12-31 21:00:00,0.0000,0.000000
2024-12-31 22:00:00,0.6312,0.028691


### Merge all dataframes

In [9]:
# merge df_news and df_price and df_reddit on their hourly DatetimeIndex.
# Outer joins keep every hour that occurs in any of the three frames; columns of
# a frame that has no row for an hour are NaN there.
df = (
  df_news
  .merge(df_price, left_index=True, right_index=True, how='outer')
  .merge(df_reddit, left_index=True, right_index=True, how='outer')
)

# Start the dataset at the first hour covered by the news frame. A label-based
# slice is used instead of a fixed row offset so the cut stays correct when the
# start dates of the inputs change or an hour is missing from one of them.
first_news_hour = df_news.index.min()
df = df.loc[first_news_hour:]

In [10]:
# Display the merged hourly frame (news + price indicators + Reddit sentiment).
df

Unnamed: 0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,source_AMBCrypto,source_BeInCrypto,source_Benzinga,...,atr_7,atr_14,upper_band_10,middle_band_10,lower_band_10,upper_band_20,middle_band_20,lower_band_20,reddit_total_sentiment,reddit_average_sentiment
2021-01-01 06:00:00,2.0,0.850000,0.000,0.00,0.0,0.150,0.0,0.0,0.0,0.0,...,378.472758,389.244944,29544.781706,28973.235,28401.688294,29450.949582,28913.7180,28376.486418,-0.891,-0.111375
2021-01-01 07:00:00,0.0,,,,,,,0.0,0.0,0.0,...,373.920936,386.199590,29501.163293,29058.309,28615.454707,29488.063359,28931.4545,28374.845641,0.000,0.000000
2021-01-01 08:00:00,2.0,0.400000,0.000,0.00,0.0,0.500,0.0,0.0,1.0,0.0,...,348.770802,372.747477,29483.328440,29132.890,28782.451560,29515.716093,28939.8210,28363.925907,0.000,0.000000
2021-01-01 09:00:00,2.0,-0.050000,0.000,0.00,0.0,0.400,0.0,0.0,0.0,0.0,...,347.013545,370.156228,29500.447220,29169.830,28839.212780,29553.465096,28957.6910,28361.916904,0.000,0.000000
2021-01-01 10:00:00,3.0,0.766667,0.000,0.00,0.0,0.300,0.0,1.0,0.0,0.0,...,317.624467,353.808641,29496.559713,29199.137,28901.714287,29578.630359,28970.1450,28361.659641,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-01 00:00:00,4.0,0.025000,0.175,0.30,0.0,0.325,0.0,1.0,0.0,0.0,...,,,,,,,,,,
2025-01-01 01:00:00,1.0,0.800000,0.000,0.00,0.0,0.000,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2025-01-01 02:00:00,2.0,0.050000,0.000,0.00,0.0,0.250,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2025-01-01 03:00:00,2.0,0.500000,-0.100,0.00,0.0,0.000,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [11]:
# Write the merged frame to a parquet file in the current working directory;
# index=True stores the datetime index together with the columns.
df.to_parquet("final_merged_dataset.parquet", index=True)