In [43]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense

In [None]:
df = pd.read_csv("bitcoin_sentiments_21_24.csv")
print(df.head)
print(df.columns)

<bound method NDFrame.head of                       Date                                  Short Description
0      2021-11-05 04:42:00  Bitcoin price is consolidating near the USD 62...
1      2021-11-05 08:15:00  Congress could finally approve or reject the m...
2      2021-11-05 10:24:00  Bitcoin increasingly becoming a political inst...
3      2021-11-05 16:58:00  There is still potential for the price of bitc...
4      2021-11-05 21:00:00  'Several companies' are looking to Latin Ameri...
...                    ...                                                ...
11290  2024-09-12 00:00:00  The government of El Salvador has been buying ...
11291  2024-09-12 00:00:00  According to data from mempool.space, transact...
11292  2024-09-12 00:00:00  Time Magazine reporter Vera Bergengruen believ...
11293  2024-09-12 00:00:00   in bitcoin is concentrated at around $58,500,...
11294  2024-09-12 00:00:00  Solv has integrated Chainlink's Cross-Chain In...

[11295 rows x 2 columns]>
Index([

In [None]:
df.drop('Accurate Sentiments', axis=1, inplace=True)
df.to_csv("bitcoin_sentiments_21_24.csv", index=False)

print(df.head())

In [None]:
headlines = df['Short Description'].astype(str).tolist()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)

def sentiment_score(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probability = softmax(outputs.logits, dim=1)
        #score = positive sentiment probability - negative sentimnet probability
        score = probability[:, 0] - probability[:, 1]
    return score.cpu().numpy()

batch_size = 32
scores = []

for i in tqdm(range(0, len(headlines), batch_size)):
    batch = headlines[i:i+batch_size]
    batch_scores = sentiment_score(batch)
    scores.extend(batch_scores)

df['Sentiment'] = scores
df.to_csv('bitcoin_finbert_processed.csv', index=False)

print(df.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  3%|▎         | 10/353 [00:00<00:22, 15.40it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 353/353 [00:18<00:00, 19.40it/s]


                  Date                                  Short Description  \
0  2021-11-05 04:42:00  Bitcoin price is consolidating near the USD 62...   
1  2021-11-05 08:15:00  Congress could finally approve or reject the m...   
2  2021-11-05 10:24:00  Bitcoin increasingly becoming a political inst...   
3  2021-11-05 16:58:00  There is still potential for the price of bitc...   
4  2021-11-05 21:00:00  'Several companies' are looking to Latin Ameri...   

   Sentiment  
0  -0.934449  
1  -0.632842  
2   0.177122  
3   0.928608  
4   0.451377  


In [None]:
df = pd.read_csv('bitcoin_finbert_processed.csv')

df['Date'] = pd.to_datetime(df['Date'])
all_days = pd.date_range(start=df['Date'].min().normalize(),
                         end=df['Date'].max().normalize(),
                         freq='D')

existing_days = df['Date'].dt.normalize().unique()
missing_days = all_days.difference(existing_days)

missing_df = pd.DataFrame({
    'Date': missing_days,
    'Short Description': 'None',
    'Sentiment': 0.0
})

final_df = pd.concat([df, missing_df], ignore_index=True)
final_df = final_df.sort_values(by='Date')

final_df.to_csv('bitcoin_dataset1.csv', index=False)

In [None]:
df = pd.read_csv("bitcoin_dataset1.csv")
df["Date"] = pd.to_datetime(df["Date"]).dt.date

df_avg = df.groupby("Date")["Sentiment"].mean().reset_index()
df_avg.to_csv("bitcoin_average_sentiment.csv", index=False)

print(df_avg.head())

         Date  Sentiment
0  2021-11-05  -0.002037
1  2021-11-06   0.112744
2  2021-11-07   0.000000
3  2021-11-08  -0.362709
4  2021-11-09   0.253761


In [None]:
df = pd.read_csv("bitcoin_average_sentiment.csv")
df["Date"] = pd.to_datetime(df["Date"], errors = "coerce")
print("start date : ", df["Date"].min())
print("end date : ", df["Date"].max())

start date :  2021-11-05 00:00:00
end date :  2024-09-12 00:00:00


In [None]:
import yfinance as yf
start_date = "2021-11-05"
end_date = "2024-09-13" #it will go till 2024-09-12

df_btc = yf.download("BTC-USD", start=start_date, end=end_date)

df_btc.to_csv("bitcoin_price_dataset.csv")
print(df_btc.head())
print(df_btc.tail())

  df_btc = yf.download("BTC-USD", start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed

Price              Close          High           Low          Open  \
Ticker           BTC-USD       BTC-USD       BTC-USD       BTC-USD   
Date                                                                 
2021-11-05  61125.675781  62541.468750  60844.609375  61460.078125   
2021-11-06  61527.480469  61590.683594  60163.781250  61068.875000   
2021-11-07  63326.988281  63326.988281  61432.488281  61554.921875   
2021-11-08  67566.828125  67673.742188  63344.066406  63344.066406   
2021-11-09  66971.828125  68530.335938  66382.062500  67549.734375   

Price            Volume  
Ticker          BTC-USD  
Date                     
2021-11-05  30605102446  
2021-11-06  29094934221  
2021-11-07  24726754302  
2021-11-08  41125608330  
2021-11-09  42357991721  
Price              Close          High           Low          Open  \
Ticker           BTC-USD       BTC-USD       BTC-USD       BTC-USD   
Date                                                                 
2024-09-08  54841.566




In [None]:
df_btc = pd.read_csv("bitcoin_price_dataset.csv")
print(df_btc.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045 entries, 0 to 1044
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   1045 non-null   object
 1   Close   1044 non-null   object
 2   High    1044 non-null   object
 3   Low     1044 non-null   object
 4   Open    1044 non-null   object
 5   Volume  1044 non-null   object
dtypes: object(6)
memory usage: 49.1+ KB
None


In [None]:
df_btc = df_btc.iloc[2:].reset_index(drop=True)
df_btc = df_btc.rename(columns={"Price": "Date"})

cols = ["Close", "High", "Low", "Open", "Volume"]
for col in cols:
    df_btc[col] = pd.to_numeric(df_btc[col], errors='coerce')

df_btc["Date"] = pd.to_datetime(df_btc["Date"]).dt.date

df_btc.to_csv("bitcoin_price_dataset.csv", index=False)
df_btc.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2021-11-05,61125.675781,62541.46875,60844.609375,61460.078125,30605102446
1,2021-11-06,61527.480469,61590.683594,60163.78125,61068.875,29094934221
2,2021-11-07,63326.988281,63326.988281,61432.488281,61554.921875,24726754302
3,2021-11-08,67566.828125,67673.742188,63344.066406,63344.066406,41125608330
4,2021-11-09,66971.828125,68530.335938,66382.0625,67549.734375,42357991721


In [None]:
df_btc = pd.read_csv("bitcoin_price_dataset.csv")
print(df_btc.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1043 non-null   object 
 1   Close   1043 non-null   float64
 2   High    1043 non-null   float64
 3   Low     1043 non-null   float64
 4   Open    1043 non-null   float64
 5   Volume  1043 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 49.0+ KB
None


In [None]:
df_btc["Return"] = df_btc["Close"].pct_change()
df_btc = df_btc.dropna()

df_btc.to_csv("bitcoin_price_dataset_returns.csv", index=False)
print(df_btc.head())

         Date         Close          High           Low          Open  \
3  2021-11-08  67566.828125  67673.742188  63344.066406  63344.066406   
4  2021-11-09  66971.828125  68530.335938  66382.062500  67549.734375   
5  2021-11-10  64995.230469  68789.625000  63208.113281  66953.335938   
6  2021-11-11  64949.960938  65579.015625  64180.488281  64978.890625   
7  2021-11-12  64155.941406  65460.816406  62333.914062  64863.980469   

        Volume    Return  
3  41125608330  0.066952  
4  42357991721 -0.008806  
5  48730828378 -0.029514  
6  35880633236 -0.000697  
7  36084893887 -0.012225  


In [None]:
df_btcr = pd.read_csv("bitcoin_price_dataset_returns.csv")
print(df_btcr.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1040 non-null   object 
 1   Close   1040 non-null   float64
 2   High    1040 non-null   float64
 3   Low     1040 non-null   float64
 4   Open    1040 non-null   float64
 5   Volume  1040 non-null   int64  
 6   Return  1040 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 57.0+ KB
None


In [None]:
df_sentiment = pd.read_csv("bitcoin_average_sentiment.csv")
df_sentiment["Date"] = pd.to_datetime(df_sentiment["Date"]).dt.date

In [None]:
df_btc["Date"] = pd.to_datetime(df_btc["Date"]).dt.date

df_final = df_btc.merge(df_sentiment, on="Date", how="inner")[["Date", "Sentiment", "Return"]]
df_final.to_csv("final_dataset.csv", index=False)
df_final.head()

Unnamed: 0,Date,Sentiment,Return
0,2021-11-08,-0.362709,0.066952
1,2021-11-09,0.253761,-0.008806
2,2021-11-10,-0.217932,-0.029514
3,2021-11-11,-0.275028,-0.000697
4,2021-11-12,0.101275,-0.012225


In [None]:
df = pd.read_csv("final_dataset.csv")
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)
split_ratio = 0.8
split_index = int(len(df) * split_ratio)

df_train = df.iloc[:split_index]
df_test = df.iloc[split_index:]

df_train.to_csv("train_dataset.csv", index=False)
df_test.to_csv("test_dataset.csv", index=False)

In [None]:
print(df_train.head())

        Date  Sentiment    Return
0 2021-11-08  -0.362709  0.066952
1 2021-11-09   0.253761 -0.008806
2 2021-11-10  -0.217932 -0.029514
3 2021-11-11  -0.275028 -0.000697
4 2021-11-12   0.101275 -0.012225


In [None]:
print(df_test.head())

          Date  Sentiment    Return
832 2024-02-18   0.000000  0.008895
833 2024-02-19   0.793511 -0.006588
834 2024-02-20   0.000000  0.009767
835 2024-02-21  -0.957304 -0.008524
836 2024-02-22   0.000000 -0.010305


In [3]:
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset.csv')

print(train.head())
print(test.head())

         Date  Sentiment    Return
0  2021-11-08  -0.362709  0.066952
1  2021-11-09   0.253761 -0.008806
2  2021-11-10  -0.217932 -0.029514
3  2021-11-11  -0.275028 -0.000697
4  2021-11-12   0.101275 -0.012225
         Date  Sentiment    Return
0  2024-02-18   0.000000  0.008895
1  2024-02-19   0.793511 -0.006588
2  2024-02-20   0.000000  0.009767
3  2024-02-21  -0.957304 -0.008524
4  2024-02-22   0.000000 -0.010305


In [34]:
window = 4
features = ['Sentiment', 'Return']
target_col = 1

train['Date'] = pd.to_datetime(train['Date'])
train = train.sort_values('Date')
test['Date'] = pd.to_datetime(test['Date'])
test = test.sort_values('Date')

train['LogReturn'] = np.log1p(train['Return'].clip(-0.999999, None))
test['LogReturn']  = np.log1p(test['Return'].clip(-0.999999, None))

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

train_scaled_X = scaler_X.fit_transform(train[features])
test_scaled_X  = scaler_X.transform(test[features])

train_scaled_y = scaler_y.fit_transform(train[['LogReturn']])
test_scaled_y  = scaler_y.transform(test[['LogReturn']])

def create_sequences(X, y, window):
    Xs, ys = [], []
    if len(X) <= window:
        return np.array(Xs), np.array(ys)
    for i in range(window, len(X)):
        Xs.append(X[i-window:i])
        ys.append(y[i])
    return np.array(Xs), np.array(ys)

combined_test_X = np.vstack([train_scaled_X[-window:], test_scaled_X])
combined_test_y = np.vstack([train_scaled_y[-window:], test_scaled_y])

X_train, y_train = create_sequences(train_scaled_X, train_scaled_y, window)
X_test, y_test   = create_sequences(combined_test_X, combined_test_y, window)

print("X_train shape:", X_train.shape)

X_train shape: (828, 4, 2)


In [35]:
model = Sequential([
    LSTM(
        50,
        return_sequences=False,
        input_shape=(window, len(features)),
        dropout=0.2,
        recurrent_dropout=0.2,
        kernel_regularizer=l2(1e-4)
    ),
    Dropout(0.2),
    Dense(
        1,
        kernel_regularizer=l2(1e-4)
    )
])

optimizer = AdamW(
    learning_rate=0.001,
    weight_decay=1e-4,
    clipnorm=1.0
)

model.compile(optimizer=optimizer, loss='mean_squared_error')
print(model.summary())

  super().__init__(**kwargs)


None


In [36]:
split_idx = int(len(X_train) * 0.8)
X_train_real, X_val = X_train[:split_idx], X_train[split_idx:]
y_train_real, y_val = y_train[:split_idx], y_train[split_idx:]

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)
]

In [37]:
history = model.fit(
    X_train_real, y_train_real,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1
)

Epoch 1/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - loss: 0.2013 - val_loss: 0.0106 - learning_rate: 0.0010
Epoch 2/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0354 - val_loss: 0.0067 - learning_rate: 0.0010
Epoch 3/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - loss: 0.0325 - val_loss: 0.0068 - learning_rate: 0.0010
Epoch 4/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0279 - val_loss: 0.0065 - learning_rate: 0.0010
Epoch 5/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0237 - val_loss: 0.0071 - learning_rate: 0.0010
Epoch 6/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0230 - val_loss: 0.0061 - learning_rate: 0.0010
Epoch 7/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0188 - val_loss: 0.0066 - learning_rate:

In [38]:
pred_scaled = model.predict(X_test)



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 79ms/step


In [39]:
pred_logr = scaler_y.inverse_transform(pred_scaled)
actual_logr = scaler_y.inverse_transform(y_test)

pred_real = np.expm1(pred_logr)
actuals_real = np.expm1(actual_logr)

In [46]:
rmse = np.sqrt(mean_squared_error(actuals_real, pred_real))
mae = mean_absolute_error(actuals_real, pred_real)

print(f"RMSE:{rmse}")
print(f"MAE:{mae}")

RMSE:0.030002672735107098
MAE:0.02203442553514423
