In [1]:
import pandas as pd
import numpy as np
import time
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
# Load the merged dataset, add a `year` column taken from the index,
# and keep only the rows whose year is one of 2021-2024.
# NOTE(review): `df.index.year` requires a datetime-like index — the parquet
# file is assumed to have been written with one.
df = pd.read_parquet("final_merged_dataset.parquet")
df = df.assign(year=df.index.year)
in_selected_years = df['year'].isin([2021, 2022, 2023, 2024])
df = df[in_selected_years]

In [3]:
def replace_inf(X):
  """Return a copy of ``X`` with +inf, -inf and NaN entries set to 0.

  Infinite values are first converted to NaN, then every NaN (whether it
  came from an infinity or was already missing) is filled with 0. The
  input object is not modified.
  """
  infinities = [np.inf, -np.inf]
  without_inf = X.replace(infinities, np.nan)
  return without_inf.fillna(0)
# Clean the whole frame: +/-inf and NaN entries all become 0.
df = replace_inf(df)

In [4]:
# Sanity check: print the (rows, columns) shape, then display the first rows.
print(df.shape)
df.head()

(35058, 120)


Unnamed: 0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,source_AMBCrypto,source_BeInCrypto,source_Benzinga,...,upper_band_20,middle_band_20,lower_band_20,10_band_width,pct_b_10,20_band_width,pct_b_20,reddit_total_sentiment,reddit_average_sentiment,year
2021-01-01 06:00:00,2.0,0.85,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,...,29450.949582,28913.718,28376.486418,0.039453,0.942558,0.037161,1.026218,-0.891,-0.111375,2021
2021-01-01 07:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,29488.063359,28931.4545,28374.845641,0.03048,0.71567,0.038478,0.785547,0.0,0.0,2021
2021-01-01 08:00:00,2.0,0.4,0.0,0.0,0.0,0.5,0.0,0.0,1.0,0.0,...,29515.716093,28939.821,28363.925907,0.024058,0.803534,0.039799,0.852329,0.0,0.0,2021
2021-01-01 09:00:00,2.0,-0.05,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,...,29553.465096,28957.691,28361.916904,0.022668,0.682311,0.041148,0.779207,0.0,0.0,2021
2021-01-01 10:00:00,3.0,0.766667,0.0,0.0,0.0,0.3,0.0,1.0,0.0,0.0,...,29578.630359,28970.145,28361.659641,0.020372,0.572225,0.042008,0.723469,0.0,0.0,2021


### Remove non-stationary feature columns

In [5]:
# Print every column name, five names per output line, so the full
# feature list can be scanned without pandas truncating it.
columns = df.columns.tolist()
names_per_line = 5
for start in range(0, len(columns), names_per_line):
  chunk = columns[start:start + names_per_line]
  print(chunk)

['news_count', 'tanalysis_absa', 'economy_absa', 'regulation_absa', 'technology_absa']
['adoption_absa', 'cybersecurity_absa', 'source_AMBCrypto', 'source_BeInCrypto', 'source_Benzinga']
['source_Bitcoin', 'source_Bitcoin Magazine', 'source_CoinPedia', 'source_Coindesk', 'source_Coingape']
['source_Cointelegraph', 'source_CryptoPotato', 'source_Cryptopolitan', 'source_NewsBTC', 'source_Other']
['source_The Currency Analytics', 'source_The Daily Hodl', 'source_UToday', 'topic_Other', 'topic_exchange traded funds, ETF']
['topic_institutional investments', 'topic_market sentiment', 'topic_others', 'topic_price action, price movement, trading', 'NER_binance']
['NER_blackrock', 'NER_coinbase', 'NER_el salvador', 'NER_fed', 'NER_grayscale']
['NER_michael saylor', 'NER_microstrategy', 'NER_sec', 'NER_us', 'return_forward']
['open', 'high', 'low', 'close', 'volume']
['return', 'sma_5', 'sma_20', 'sma_50', 'sma_200']
['ema_5', 'ema_20', 'ema_50', 'ema_200', 'sma_5_20_diff']
['sma_20_50_diff', '

In [6]:
# Drop the raw price/volume/return columns and the SMA/EMA level columns.
# The frame is modified in place, so re-running this cell on the same
# kernel raises KeyError because the columns are already gone.
price_volume_cols = ['open', 'high', 'low', 'close', 'volume', 'return']
sma_level_cols = ['sma_5', 'sma_20', 'sma_50', 'sma_200']
ema_level_cols = ['ema_5', 'ema_20', 'ema_50', 'ema_200']
df.drop(columns=price_volume_cols + sma_level_cols + ema_level_cols, inplace=True)

### Create 3-class prediction target

In [7]:
# Build the 3-class target from `return_forward`:
#   2 -> return_forward >  +0.001
#   0 -> return_forward <  -0.001
#   1 -> everything else (including values exactly on either threshold)
threshold = 0.001
forward_return = df['return_forward']
df['target'] = np.select(
  [forward_return > threshold, forward_return < -threshold],
  [2, 0],
  default=1,
)

print("Target class distribution:")
print(df['target'].value_counts())

Target class distribution:
target
2    13105
0    12641
1     9312
Name: count, dtype: int64


### Feature engineering on news-related columns

In [8]:
absa_cols = [
  'tanalysis_absa', 'economy_absa', 'regulation_absa',
  'technology_absa', 'adoption_absa', 'cybersecurity_absa',
]

# For each ABSA column add two exponentially weighted means (adjust=False),
# one with span 12 and one with span 70, named `<col>_ewm_12h` / `<col>_ewm_70h`.
# The original columns are kept.
for col in absa_cols:
  values = df[col]
  for span in (12, 70):
    df[f'{col}_ewm_{span}h'] = values.ewm(span=span, adjust=False).mean()
df.head()

Unnamed: 0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,source_AMBCrypto,source_BeInCrypto,source_Benzinga,...,economy_absa_ewm_12h,economy_absa_ewm_70h,regulation_absa_ewm_12h,regulation_absa_ewm_70h,technology_absa_ewm_12h,technology_absa_ewm_70h,adoption_absa_ewm_12h,adoption_absa_ewm_70h,cybersecurity_absa_ewm_12h,cybersecurity_absa_ewm_70h
2021-01-01 06:00:00,2.0,0.85,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.15,0.0,0.0
2021-01-01 07:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.126923,0.145775,0.0,0.0
2021-01-01 08:00:00,2.0,0.4,0.0,0.0,0.0,0.5,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.18432,0.155753,0.0,0.0
2021-01-01 09:00:00,2.0,-0.05,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.217501,0.162633,0.0,0.0
2021-01-01 10:00:00,3.0,0.766667,0.0,0.0,0.0,0.3,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.230193,0.166503,0.0,0.0


In [9]:
columns_to_engineer = [
  'source_AMBCrypto', 'source_BeInCrypto', 'source_Benzinga',
  'source_Bitcoin', 'source_Bitcoin Magazine', 'source_CoinPedia', 'source_Coindesk', 'source_Coingape',
  'source_Cointelegraph', 'source_CryptoPotato', 'source_Cryptopolitan', 'source_NewsBTC', 'source_Other',
  'source_The Currency Analytics', 'source_The Daily Hodl', 'source_UToday',
  'topic_Other', 'topic_exchange traded funds, ETF',
  'topic_institutional investments', 'topic_market sentiment', 'topic_others', 'topic_price action, price movement, trading', 'NER_binance',
  'NER_blackrock', 'NER_coinbase', 'NER_el salvador', 'NER_fed', 'NER_grayscale',
  'NER_michael saylor', 'NER_microstrategy', 'NER_sec', 'NER_us'
]

# Replace each listed column with two trailing rolling maxima, cast to int.
# min_periods=1 lets the first rows use whatever history is available.
# If a column only holds 0/1 flags, the result is 1 when any row in the
# window was 1 and 0 otherwise.
# NOTE(review): the suffixes say "last3"/"last24" but the windows are 5 and
# 48 rows — confirm which one is intended before renaming or re-tuning.
window_by_suffix = {"_last3": 5, "_last24": 48}
for col in columns_to_engineer:
  for suffix, window in window_by_suffix.items():
    df[col + suffix] = df[col].rolling(window=window, min_periods=1).max().astype(int)

# The raw columns are no longer needed once their rolling versions exist.
df.drop(columns=columns_to_engineer, inplace=True)
df.head()

Unnamed: 0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,return_forward,sma_5_20_diff,sma_20_50_diff,...,NER_grayscale_last3,NER_grayscale_last24,NER_michael saylor_last3,NER_michael saylor_last24,NER_microstrategy_last3,NER_microstrategy_last24,NER_sec_last3,NER_sec_last24,NER_us_last3,NER_us_last24
2021-01-01 06:00:00,2.0,0.85,0.0,0.0,0.0,0.15,0.0,-0.007795,219.4,351.4194,...,1,1,0,0,0,0,0,0,0,0
2021-01-01 07:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003292,253.0935,336.2693,...,1,1,0,0,0,0,0,0,0,0
2021-01-01 08:00:00,2.0,0.4,0.0,0.0,0.0,0.5,0.0,-0.001883,284.941,309.6284,...,1,1,1,1,1,1,0,0,0,0
2021-01-01 09:00:00,2.0,-0.05,0.0,0.0,0.0,0.4,0.0,-0.001648,326.589,301.568,...,1,1,1,1,1,1,0,0,0,0
2021-01-01 10:00:00,3.0,0.766667,0.0,0.0,0.0,0.3,0.0,0.000118,351.167,289.6874,...,1,1,1,1,1,1,0,0,0,0


In [10]:
source_cols = [
  'source_Benzinga_last3', 'source_Benzinga_last24', 'source_Bitcoin_last3', 'source_Bitcoin_last24', 'source_Bitcoin Magazine_last3',
  'source_Bitcoin Magazine_last24', 'source_CoinPedia_last3', 'source_CoinPedia_last24', 'source_Coindesk_last3', 'source_Coindesk_last24',
  'source_Coingape_last3', 'source_Coingape_last24', 'source_Cointelegraph_last3', 'source_Cointelegraph_last24', 'source_CryptoPotato_last3',
  'source_CryptoPotato_last24', 'source_Cryptopolitan_last3', 'source_Cryptopolitan_last24', 'source_NewsBTC_last3', 'source_NewsBTC_last24',
  'source_Other_last3', 'source_Other_last24', 'source_The Currency Analytics_last3', 'source_The Currency Analytics_last24', 'source_The Daily Hodl_last3',
  'source_The Daily Hodl_last24', 'source_UToday_last3', 'source_UToday_last24'
]

# Binarize the rolling source features: 1 where the value is > 0, else 0.
# A vectorized comparison replaces the per-element Python lambda; it gives
# the same values (NaN compares False and therefore maps to 0) without
# calling a Python function once per row.
# NOTE(review): the `source_AMBCrypto_*` and `source_BeInCrypto_*` columns
# created alongside these are not in this list — confirm the omission is
# intentional.
for col in source_cols:
  df[col] = df[col].gt(0).astype(int)

In [11]:
# Engineered columns that are excluded from the final feature set.
col_to_remove = [
  'cybersecurity_absa_ewm_12h',
  'cybersecurity_absa_ewm_70h',
  'NER_us_last24',
  'topic_others_last24',
  'topic_price action, price movement, trading_last24',
  'topic_Other_last24',
]
# In-place drop: re-running this cell on the same kernel raises KeyError.
df.drop(labels=col_to_remove, axis='columns', inplace=True)

### Feature engineering on Reddit columns

In [12]:
reddit_col = ['reddit_total_sentiment', 'reddit_average_sentiment']

# Add exponentially weighted means (adjust=False) with spans 5 and 24 for
# each Reddit column, named `<col>_ewm_5h` / `<col>_ewm_24h`.
for col in reddit_col:
  raw_values = df[col]
  for span in (5, 24):
    df[f'{col}_ewm_{span}h'] = raw_values.ewm(span=span, adjust=False).mean()

# Only the smoothed versions are kept; the raw columns are removed in place.
df.drop(columns=reddit_col, inplace=True)

### Save dataset with feature engineering

In [13]:
# Persist the engineered frame, keeping the index in the file.
output_path = "full_dataset_feature_engineering_v2.parquet"
df.to_parquet(output_path, index=True)

In [14]:
# Report the (rows, columns) shape of the frame after feature engineering.
print(df.shape)

(35058, 147)


In [15]:
# Display the first rows of the engineered frame.
df.head()

Unnamed: 0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,return_forward,sma_5_20_diff,sma_20_50_diff,...,NER_michael saylor_last24,NER_microstrategy_last3,NER_microstrategy_last24,NER_sec_last3,NER_sec_last24,NER_us_last3,reddit_total_sentiment_ewm_5h,reddit_total_sentiment_ewm_24h,reddit_average_sentiment_ewm_5h,reddit_average_sentiment_ewm_24h
2021-01-01 06:00:00,2.0,0.85,0.0,0.0,0.0,0.15,0.0,-0.007795,219.4,351.4194,...,0,0,0,0,0,0,-0.891,-0.891,-0.111375,-0.111375
2021-01-01 07:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003292,253.0935,336.2693,...,0,0,0,0,0,0,-0.594,-0.81972,-0.07425,-0.102465
2021-01-01 08:00:00,2.0,0.4,0.0,0.0,0.0,0.5,0.0,-0.001883,284.941,309.6284,...,1,1,1,0,0,0,-0.396,-0.754142,-0.0495,-0.094268
2021-01-01 09:00:00,2.0,-0.05,0.0,0.0,0.0,0.4,0.0,-0.001648,326.589,301.568,...,1,1,1,0,0,0,-0.264,-0.693811,-0.033,-0.086726
2021-01-01 10:00:00,3.0,0.766667,0.0,0.0,0.0,0.3,0.0,0.000118,351.167,289.6874,...,1,1,1,0,0,0,-0.176,-0.638306,-0.022,-0.079788


In [16]:
# Display the last rows of the engineered frame.
df.tail()

Unnamed: 0,news_count,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,return_forward,sma_5_20_diff,sma_20_50_diff,...,NER_michael saylor_last24,NER_microstrategy_last3,NER_microstrategy_last24,NER_sec_last3,NER_sec_last24,NER_us_last3,reddit_total_sentiment_ewm_5h,reddit_total_sentiment_ewm_24h,reddit_average_sentiment_ewm_5h,reddit_average_sentiment_ewm_24h
2024-12-31 19:00:00,7.0,0.342857,0.157143,0.114286,0.014286,0.471429,0.0,-0.006308,339.3,312.86,...,1,1,1,1,1,1,0.369066,0.208662,0.026292,0.015598
2024-12-31 20:00:00,6.0,0.216667,0.016667,0.083333,0.0,0.483333,0.0,0.002858,-30.2,309.9,...,1,0,1,1,1,1,0.246044,0.191969,0.017528,0.01435
2024-12-31 21:00:00,3.0,0.1,0.233333,0.2,0.0,0.3,0.0,-0.004142,-238.45,313.73,...,1,0,1,1,1,1,0.164029,0.176611,0.011685,0.013202
2024-12-31 22:00:00,4.0,0.2,0.2,0.475,0.0,0.925,0.0,0.000933,-365.3,310.76,...,1,0,1,0,1,1,0.319753,0.212978,0.017354,0.014441
2024-12-31 23:00:00,4.0,0.1,0.15,0.2,0.0,0.4,0.0,0.00015,-434.5,301.66,...,1,0,1,0,1,1,0.213169,0.19594,0.011569,0.013286
