In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense

In [None]:
# Load the source CSV and preview its first rows and column names.
df = pd.read_csv("bitcoin_sentiments_21_24.csv")
# `head` has to be called: printing the bare attribute shows the bound-method
# repr (which embeds the whole frame) instead of the first five rows.
print(df.head())
print(df.columns)

In [None]:
# Remove the 'Accurate Sentiments' column. The guard keeps this cell re-runnable:
# the CSV is overwritten just below, so on a later run the column is already
# gone and an unguarded drop would raise KeyError.
if 'Accurate Sentiments' in df.columns:
    df = df.drop(columns=['Accurate Sentiments'])

# NOTE(review): this overwrites the file the notebook loads its input from.
df.to_csv("bitcoin_sentiments_21_24.csv", index=False)

print(df.head())

In [None]:
# Inference runs on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and sequence-classification model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model = model.to(device)

# Headlines as a plain Python list of strings (every cell is stringified first).
headlines = list(df['Short Description'].astype(str))

def sentiment_score(texts):
    """Score each text as P(positive) - P(negative) using the loaded classifier.

    Args:
        texts: list of strings to classify in one forward pass.

    Returns:
        1-D NumPy array with one score per input text. Each score is the
        difference of two softmax probabilities, so it lies in [-1, 1].
        An empty list/tuple yields an empty array without calling the model.
    """
    if isinstance(texts, (list, tuple)) and not texts:
        # Nothing to score: skip tokenization and the forward pass entirely.
        return np.empty(0, dtype=np.float32)

    # Read the class indices from the checkpoint's own label map instead of
    # hard-coding them; fall back to 0 / 1, the indices used originally.
    label2id = {str(label).lower(): idx for label, idx in model.config.label2id.items()}
    pos_idx = label2id.get("positive", 0)
    neg_idx = label2id.get("negative", 1)

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probability = softmax(outputs.logits, dim=1)
        # score = positive sentiment probability - negative sentiment probability
        score = probability[:, pos_idx] - probability[:, neg_idx]
    return score.cpu().numpy()

# Score the headlines in fixed-size chunks, one forward pass per chunk.
batch_size = 32
scores = []

for start in tqdm(range(0, len(headlines), batch_size)):
    stop = start + batch_size
    scores.extend(sentiment_score(headlines[start:stop]))

# Attach one score per row and persist the enriched frame.
df['Sentiment'] = scores
df.to_csv('bitcoin_finbert_processed.csv', index=False)

print(df.head())

In [None]:
# Reload the scored headlines and parse their timestamps.
df = pd.read_csv('bitcoin_finbert_processed.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Daily calendar spanning the first to the last headline (midnight-aligned).
first_day = df['Date'].min().normalize()
last_day = df['Date'].max().normalize()
all_days = pd.date_range(start=first_day, end=last_day, freq='D')

# Calendar days on which no headline exists.
existing_days = df['Date'].dt.normalize().unique()
missing_days = all_days.difference(existing_days)

# One placeholder row per missing day: the text 'None' and a 0.0 sentiment.
missing_df = pd.DataFrame({
    'Date': missing_days,
    'Short Description': 'None',
    'Sentiment': 0.0
})

# Append the placeholders, order everything by date and save.
final_df = pd.concat([df, missing_df], ignore_index=True).sort_values(by='Date')
final_df.to_csv('bitcoin_dataset1.csv', index=False)

In [None]:
# Reduce the headline-level data to one row per calendar date.
df = pd.read_csv("bitcoin_dataset1.csv")
df["Date"] = pd.to_datetime(df["Date"]).dt.date

# Mean sentiment of all rows sharing a date, with the date back as a column.
daily_mean = df.groupby("Date")["Sentiment"].mean()
df_avg = daily_mean.reset_index()
df_avg.to_csv("bitcoin_average_sentiment.csv", index=False)

print(df_avg.head())

In [None]:
# Report the first and last date covered by the daily sentiment file.
df = pd.read_csv("bitcoin_average_sentiment.csv")
# Unparseable dates become NaT rather than raising.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

date_column = df["Date"]
print("start date : ", date_column.min())
print("end date : ", date_column.max())

In [None]:
import yfinance as yf
# Download window for the BTC-USD price history.
start_date = "2021-11-05"
end_date = "2024-09-13"  # passed as `end`; per the original note, data runs through 2024-09-12

df_btc = yf.download("BTC-USD", start=start_date, end=end_date)
df_btc.to_csv("bitcoin_price_dataset.csv")

# Show both ends of the downloaded range.
for preview in (df_btc.head(), df_btc.tail()):
    print(preview)

In [None]:
# Reload the saved price CSV and inspect its columns and dtypes.
df_btc = pd.read_csv("bitcoin_price_dataset.csv")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
df_btc.info()

In [None]:
# The raw CSV labels its first column "Price" and has two non-data rows at the
# top (presumably the extra header rows written for yfinance's multi-level
# columns; TODO confirm). Strip them only while that raw layout is present, so
# re-running this cell on the already-cleaned frame does not discard two real
# rows of price data.
if "Price" in df_btc.columns:
    df_btc = df_btc.iloc[2:].reset_index(drop=True)
    df_btc = df_btc.rename(columns={"Price": "Date"})

# Force the price/volume columns to numeric; unparseable cells become NaN.
cols = ["Close", "High", "Low", "Open", "Volume"]
for col in cols:
    df_btc[col] = pd.to_numeric(df_btc[col], errors='coerce')

# Keep only the calendar date part of the timestamp.
df_btc["Date"] = pd.to_datetime(df_btc["Date"]).dt.date

# Overwrite the price file with the cleaned layout.
df_btc.to_csv("bitcoin_price_dataset.csv", index=False)
df_btc.head()

In [None]:
# Reload the price CSV from disk and check the resulting dtypes.
df_btc = pd.read_csv("bitcoin_price_dataset.csv")
# info() prints its own report and returns None; print() around it would only
# add a stray "None" line to the output.
df_btc.info()

In [None]:
# Day-over-day fractional change of the close, then drop every row containing
# a NaN (the first row always has one, since it has no previous close).
close_change = df_btc["Close"].pct_change()
df_btc = df_btc.assign(Return=close_change).dropna()

df_btc.to_csv("bitcoin_price_dataset_returns.csv", index=False)
print(df_btc.head())

In [None]:
# Reload the returns file to verify what was written to disk.
df_btcr = pd.read_csv("bitcoin_price_dataset_returns.csv")
# info() already prints its report and returns None, so it is called directly
# instead of being wrapped in print() (which emitted an extra "None").
df_btcr.info()

In [None]:
# Daily sentiment with `Date` converted to plain `datetime.date` objects.
df_sentiment = pd.read_csv("bitcoin_average_sentiment.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]).dt.date
)

In [None]:
# Convert the price dates to `datetime.date`, the same type given to the
# sentiment dates, so the two frames can be joined on `Date`.
df_btc["Date"] = pd.to_datetime(df_btc["Date"]).dt.date

# Inner join keeps only dates present in both frames; keep three columns.
merged = pd.merge(df_btc, df_sentiment, on="Date", how="inner")
df_final = merged[["Date", "Sentiment", "Return"]]

df_final.to_csv("final_dataset.csv", index=False)
df_final.head()

In [None]:
# Chronological split: the earliest 80% of rows train, the remainder tests.
df = pd.read_csv("final_dataset.csv")
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

split_ratio = 0.8
split_index = int(len(df) * split_ratio)

df_train, df_test = df.iloc[:split_index], df.iloc[split_index:]

# Persist both partitions.
for frame, path in ((df_train, "train_dataset.csv"), (df_test, "test_dataset.csv")):
    frame.to_csv(path, index=False)

In [None]:
# First five rows of the training partition.
print(df_train.head(n=5))

In [None]:
# First five rows of the test partition.
print(df_test.head(n=5))

In [None]:
# Reload both partitions from disk and preview each.
train, test = (pd.read_csv(path) for path in ('train_dataset.csv', 'test_dataset.csv'))

for partition in (train, test):
    print(partition.head())

In [None]:
# Look-back length of each input sequence and the columns fed to the model.
window = 4
features = ['Sentiment', 'Return']
# presumably the position of 'Return' within `features`; not referenced in the
# visible cells (verify before removing).
target_col = 1


def _sorted_with_log_return(frame):
    """Parse 'Date', sort chronologically and add LogReturn = log(1 + Return)."""
    frame['Date'] = pd.to_datetime(frame['Date'])
    ordered = frame.sort_values('Date')
    # The lower clip keeps 1 + Return strictly positive so the log stays finite.
    ordered['LogReturn'] = np.log1p(ordered['Return'].clip(lower=-0.999999))
    return ordered


train = _sorted_with_log_return(train)
test = _sorted_with_log_return(test)

# Input scaler: fitted on the training rows only, then reused for the test rows.
scaler_X = MinMaxScaler()
train_scaled_X = scaler_X.fit_transform(train[features])
test_scaled_X = scaler_X.transform(test[features])

# Target scaler: same train-only fit, applied to the log-return column.
scaler_y = MinMaxScaler()
train_scaled_y = scaler_y.fit_transform(train[['LogReturn']])
test_scaled_y = scaler_y.transform(test[['LogReturn']])

def create_sequences(X, y, window):
    """Build sliding-window samples for a sequence model.

    Sample k pairs the `window` consecutive rows ``X[k : k + window]`` with the
    target ``y[k + window]``, so each target is paired only with the rows that
    come strictly before it.

    Args:
        X: array-like of inputs, indexed by time step along axis 0.
        y: array-like of targets aligned with ``X`` along axis 0.
        window: number of consecutive rows in each input sequence.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays with shapes
        ``(n, window, *X.shape[1:])`` and ``(n, *y.shape[1:])`` where
        ``n = len(X) - window``. When there are not enough rows for a single
        sample, both arrays are empty but keep those trailing dimensions
        (previously they collapsed to shape ``(0,)``).
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if len(X) - window <= 0:
        # Not enough rows for one full window: return correctly shaped empties
        # so downstream code still sees the (samples, window, features) layout.
        empty_X = np.empty((0, window) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    target_positions = range(window, len(X))
    Xs = np.array([X[i - window:i] for i in target_positions])
    ys = np.array([y[i] for i in target_positions])
    return Xs, ys

# Prepend the last `window` training rows to the test arrays so the first test
# target has a full look-back window available.
train_tail_X = train_scaled_X[-window:]
train_tail_y = train_scaled_y[-window:]
combined_test_X = np.concatenate((train_tail_X, test_scaled_X), axis=0)
combined_test_y = np.concatenate((train_tail_y, test_scaled_y), axis=0)

# Windowed samples for both partitions.
X_train, y_train = create_sequences(train_scaled_X, train_scaled_y, window)
X_test, y_test = create_sequences(combined_test_X, combined_test_y, window)

print("X_train shape:", X_train.shape)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# One LSTM layer over (window, n_features) sequences followed by a single
# linear output unit; dropout and L2 penalties are applied for regularisation.
# NOTE: this rebinds `model`, which an earlier cell uses for the FinBERT
# classifier, so `sentiment_score` cannot be re-run after this cell.
model = Sequential([
    LSTM(
        50,
        return_sequences=False,
        input_shape=(window, len(features)),
        dropout=0.2,
        recurrent_dropout=0.2,
        kernel_regularizer=l2(1e-4)
    ),
    Dropout(0.2),
    Dense(
        1,
        kernel_regularizer=l2(1e-4)
    )
])

# AdamW with decoupled weight decay; gradients are clipped to unit norm.
optimizer = AdamW(
    learning_rate=0.001,
    weight_decay=1e-4,
    clipnorm=1.0
)

model.compile(optimizer=optimizer, loss='mean_squared_error')
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [None]:
# Hold out the last 20% of the training sequences for validation.
split_idx = int(len(X_train) * 0.8)
X_train_real = X_train[:split_idx]
X_val = X_train[split_idx:]
y_train_real = y_train[:split_idx]
y_val = y_train[split_idx:]

# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
# Halve the learning rate after 5 epochs without val_loss improvement.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

# NOTE: this name shadows the `callbacks` module imported from tensorflow.keras.
callbacks = [early_stop, lr_on_plateau]

In [None]:
# Training configuration gathered in one place, then passed to fit().
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train_real, y_train_real, **fit_options)

In [None]:
# Predictions for the test sequences, still in the scaled target space.
pred_scaled = model.predict(x=X_test)

In [None]:
# Undo the min-max scaling to get back to log-returns ...
pred_logr, actual_logr = (
    scaler_y.inverse_transform(scaled) for scaled in (pred_scaled, y_test)
)

# ... then undo log1p to recover simple returns.
pred_real, actuals_real = np.expm1(pred_logr), np.expm1(actual_logr)

In [None]:
# Error metrics on simple returns (after the inverse transforms).
mse = mean_squared_error(actuals_real, pred_real)
rmse = np.sqrt(mse)
mae = mean_absolute_error(actuals_real, pred_real)

print(f"RMSE:{rmse}")
print(f"MAE:{mae}")