In [2]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import BertTokenizer, BertModel, AdamW
import torch
from torch.utils.data import Dataset, DataLoader

# Load data from SQLite database
conn = sqlite3.connect("twitterTesla.db")
query = """
SELECT t.id_str, t.created_at, t.full_text, s.date, s.open, s.close
FROM tweets t
JOIN tesla s ON t.created_at = s.date
"""
df = pd.read_sql_query(query, conn)

# Preprocess Data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class StockDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.float32)
        }

# Prepare Data
dataset = StockDataset(df['full_text'].tolist(), df['close'].values)
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Fine-Tune BERT Model for Regression
class StockRegressionModel(torch.nn.Module):
    def __init__(self):
        super(StockRegressionModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state[:, 0, :]
        logits = self.linear(last_hidden_states)
        return logits

model = StockRegressionModel()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training
for epoch in range(3):  # You may need to adjust the number of epochs
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = torch.nn.functional.mse_loss(outputs.squeeze(), batch['labels'])
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')
# Evaluation
model.eval()
with torch.no_grad():
    all_preds, all_labels = [], []
    for batch in test_loader:
        outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'])
        predictions = outputs.squeeze().cpu().numpy()
        labels = batch['labels'].cpu().numpy()
        all_preds.extend(predictions)
        all_labels.extend(labels)


# Save the trained model
torch.save(model.state_dict(), 'stock_prediction_model.pth')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 1, Loss: 84724.0859375
Epoch: 2, Loss: 87428.9765625
Epoch: 3, Loss: 78396.9765625


In [4]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

# Load the saved model
model = StockRegressionModel()
model.load_state_dict(torch.load('stock_prediction_model.pth'))
model.eval()

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# New tweet text
new_tweet_text = "tesla is bad."  # replace with the actual tweet text

# Preprocess the new tweet
encoding = tokenizer(
    new_tweet_text,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    return_tensors='pt',
    truncation=True
)

# Model prediction
with torch.no_grad():
    new_prediction = model(encoding['input_ids'], attention_mask=encoding['attention_mask']).squeeze().item()

print(f'Predicted Stock Price: {new_prediction}')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted Stock Price: 28.940671920776367
