
# Install and import required libraries
We install open-source Python packages (Hugging Face Transformers, Datasets, Plotly, etc.)
and import them. These tools allow us to:
- Load and process text data
- Fine-tune pre-trained AI models
- Visualize results with interactive charts

In [None]:
!pip install -q datasets transformers torch accelerate evaluate scikit-learn plotly kaggle

import pandas as pd, numpy as np, re, warnings
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from transformers import pipeline

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Device: {device}")

# Download financial and Bitcoin tweet datasets
We authenticate with Kaggle and download two open-source datasets:
1. **Financial Sentiment Dataset** (~5.8k labeled sentences: Positive/Negative/Neutral)  
   → Used to train our sentiment classifier
2. **Bitcoin Twitter Dataset** (~1M tweets with timestamps)  
   → Used to analyze real trader sentiment over time

We draw a fixed-seed random sample of 10,000 tweets to keep inference fast; the printed date range (2021) lets us confirm the sample still spans the full period.

In [None]:
from google.colab import files
uploaded = files.upload()                     # upload kaggle.json
!mkdir -p ~/.kaggle
!cp {list(uploaded.keys())[0]} ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

# 1. Labeled financial data
!kaggle datasets download -d sbhatti/financial-sentiment-analysis -p /content --unzip
df_labeled = pd.read_csv('/content/data.csv')
df_labeled = df_labeled[['Sentence','Sentiment']].rename(columns={'Sentence':'text','Sentiment':'label'})
df_labeled['label'] = df_labeled['label'].str.lower().map({'positive':0, 'negative':1, 'neutral':2})
df_labeled = df_labeled.dropna()
print("Labeled shape:", df_labeled.shape)
print(df_labeled['label'].value_counts())

# 2. Bitcoin tweets
!kaggle datasets download -d gautamchettiar/bitcoin-sentiment-analysis-twitter-data -p /content --unzip
df_btc = pd.read_csv('/content/bitcoin_tweets1000000.csv', encoding='latin-1')
df_btc = df_btc[['date','text']].dropna()
df_btc = df_btc.sample(n=10000, random_state=42)
df_btc['date'] = pd.to_datetime(df_btc['date'])
print("BTC sample shape:", df_btc.shape)
print("Date range:", df_btc['date'].min(), "→", df_btc['date'].max())

# Clean text by removing noise
We remove URLs, mentions (@user), hashtags, and every non-letter character (digits and punctuation included), lowercase the text, and drop entries with fewer than three words.  
This ensures:
- The model focuses on meaningful words
- No junk affects model performance
- Consistent input format for tokenization

In [None]:
# Compiled once at import time; `clean` is applied to every row of both datasets.
_URL_RE = re.compile(r'http\S+|www\S+')      # "http..." also covers "https..."
_TAG_RE = re.compile(r'@\w+|#\w+')           # @mentions and #hashtags
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')  # digits, punctuation, emoji, ...


def clean(txt):
    """Normalize one piece of text for tokenization.

    Strips URLs, @mentions, #hashtags and every character that is not an
    ASCII letter or whitespace, then lowercases and trims the result.

    Args:
        txt: Raw text. Non-string values (``None``, NaN, numbers) are treated
            as empty instead of raising.

    Returns:
        The cleaned text, or ``''`` when the input is not a string or fewer
        than three words remain after cleaning.
    """
    if not isinstance(txt, str):
        return ''
    txt = _URL_RE.sub('', txt)
    txt = _TAG_RE.sub('', txt)
    txt = _NON_LETTER_RE.sub('', txt)
    txt = txt.lower().strip()
    # Texts of one or two words carry too little signal to classify; the caller
    # filters out the empty strings returned here.
    return txt if len(txt.split()) > 2 else ''

def _scrub_text_column(frame):
    """Clean ``frame['text']`` in place and return only the rows left non-empty."""
    frame['text'] = frame['text'].apply(clean)
    return frame[frame['text'] != '']


# Same treatment for both corpora: clean first, then drop rows emptied by cleaning.
df_labeled = _scrub_text_column(df_labeled)
df_btc = _scrub_text_column(df_btc)
print("Cleaned – labeled:", len(df_labeled), "btc:", len(df_btc))

### HYPERPARAMETER SEARCH – 6 PROFESSIONAL-GRADE EXPERIMENTS


We don't just train one model — we run a full ablation study:

• 2 state-of-the-art architectures:
       → FinBERT (finance-specialized, ProsusAI)
       → RoBERTa-base (general-purpose SOTA)

• 3 carefully designed training configurations:
       A. Conservative  → low LR, small batch, 5 epochs
       B. Balanced      → optimal speed/accuracy
       C. Aggressive    → high LR, fast convergence

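 Total: 6 independent training runs on the ~5.8k labeled dataset (80% for training, 20% held out for validation)
 Metrics: Weighted F1 Score (primary, used to pick the winner) + Accuracy, both measured on the validation split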
 Goal: Find the single best model for real-world Bitcoin sentiment

 The winner is automatically selected and used in the final dashboard.


In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from datasets import Dataset
from itertools import product

# Re-detect the compute device so this cell does not depend on an earlier one.
_gpu_available = torch.cuda.is_available()
device = 'cuda' if _gpu_available else 'cpu'
print(f"Using device: {device}")

def compute_metrics(p):
    """Score one evaluation pass.

    Reads ``p.predictions`` (per-class scores, reduced to a class id with
    ``argmax`` over the last axis) and ``p.label_ids`` (the true class ids).

    Returns:
        A dict with ``"accuracy"`` and the class-frequency-weighted ``"f1"``.
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }

# Define 3 different training configurations.
# Each row lists the values in the same order as the field names below; the
# training loop reads them back by key.
_CONFIG_FIELDS = ("name", "lr", "batch_size", "epochs", "warmup", "weight_decay")
_CONFIG_ROWS = (
    # name                      lr      batch  epochs  warmup  weight_decay
    ("Config A: Conservative",  1e-5,   16,    5,      200,    0.01),
    ("Config B: Balanced ",     1.5e-5, 32,    4,      100,    0.01),
    ("Config C: Aggressive",    3e-5,   32,    3,      50,     0.05),
)
configs = [dict(zip(_CONFIG_FIELDS, row)) for row in _CONFIG_ROWS]

import gc

models_to_test = ["ProsusAI/finbert", "roberta-base"]
results = []

# fp16 mixed precision needs a GPU. The notebook falls back to 'cpu' when none
# is available, so only request fp16 when CUDA is actually present.
use_fp16 = torch.cuda.is_available()

# The 80/20 split depends on neither the model nor the config (fixed seed), so
# build it once; every run is validated on the same held-out rows.
train_df = df_labeled.sample(frac=0.8, random_state=42)
val_df = df_labeled.drop(train_df.index)

# Position in `results` of the best run so far. Only that run keeps its Trainer
# (model + optimizer state); every other entry has "Trainer" set to None so the
# memory of losing runs can be reclaimed. On equal (rounded) F1 the earliest
# run is kept.
best_idx = None

print("Starting 6-model hyperparameter search...\n")

for model_name in models_to_test:
    short_name = "FinBERT" if "finbert" in model_name else "RoBERTa"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(batch):
        return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

    # Tokenization depends only on the model's tokenizer, not on the training
    # config, so do it once per model and reuse it for all of its runs.
    train_ds = Dataset.from_pandas(train_df).map(tokenize, batched=True)
    val_ds = Dataset.from_pandas(val_df).map(tokenize, batched=True)
    for ds in (train_ds, val_ds):
        ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    for config in configs:
        print(f"Training {short_name} → {config['name']}...")

        # Fresh weights for every run so configurations do not influence each other.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
        model.to(device)

        args = TrainingArguments(
            output_dir=f"./{short_name}_{config['name'].replace(' ', '_')}",
            eval_strategy='epoch',
            save_strategy='epoch',
            learning_rate=config['lr'],
            per_device_train_batch_size=config['batch_size'],
            per_device_eval_batch_size=config['batch_size'],
            num_train_epochs=config['epochs'],
            weight_decay=config['weight_decay'],
            # Reload the checkpoint with the best validation F1 once training ends.
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            warmup_steps=config['warmup'],
            lr_scheduler_type='linear',
            fp16=use_fp16,
            report_to=[],
            logging_steps=10,
        )

        # NOTE(review): newer transformers releases deprecate `tokenizer=` in
        # favour of `processing_class=` - confirm against the installed version.
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        eval_result = trainer.evaluate()

        f1 = eval_result['eval_f1']
        acc = eval_result['eval_accuracy']

        results.append({
            "Model": short_name,
            "Config": config['name'],
            "Learning Rate": config['lr'],
            "Batch Size": config['batch_size'],
            "Epochs": config['epochs'],
            "F1 Score": round(f1, 4),
            "Accuracy": round(acc, 4),
            "Trainer": trainer
        })

        # Keep the Trainer of the leading run only.
        this_idx = len(results) - 1
        if best_idx is None or results[this_idx]["F1 Score"] > results[best_idx]["F1 Score"]:
            if best_idx is not None:
                results[best_idx]["Trainer"] = None
            best_idx = this_idx
        else:
            results[this_idx]["Trainer"] = None

        # Drop the loop-local references too, then hand freed memory back
        # before the next model is loaded.
        trainer = model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Done → F1: {f1:.4f}\n")


#  FINAL COMPARISON TABLE

# Everything except the Trainer objects goes into the printable table.
results_df = pd.DataFrame(results).drop(columns="Trainer")
# Stable sort: runs with equal F1 keep their training order, so the table is
# deterministic.
results_df = results_df.sort_values("F1 Score", ascending=False, kind="stable").reset_index(drop=True)
print("\n" + "="*80)
print("FINAL HYPERPARAMETER SEARCH RESULTS (6 Models)")
print("="*80)
print(results_df.to_string(index=False))

# Select the BEST model
# Take the winner straight from `results` so that its trainer, display name and
# metrics all come from the same run. Looking the trainer up by F1 value alone
# can pair one run's name with another run's trainer when two runs tie after
# rounding. max() returns the first maximum, i.e. the earliest run on a tie.
best_result = max(results, key=lambda r: r["F1 Score"])
best_trainer = best_result["Trainer"]
best_name = f"{best_result['Model']} ({best_result['Config']})"
# Table row of that same run (Model + Config identify a run uniquely).
best_row = results_df[
    (results_df["Model"] == best_result["Model"])
    & (results_df["Config"] == best_result["Config"])
].iloc[0]

print(f"\nBEST MODEL SELECTED: {best_name}")
print(f"→ F1 Score: {best_row['F1 Score']:.4f} | Accuracy: {best_row['Accuracy']:.4f}")

# Classify Bitcoin tweets using the best model
We:
- Tokenize tweets using the best model’s tokenizer
- Run inference on GPU
- Assign labels: **Positive / Negative / Neutral**
- Compute **total counts and percentages**
- Display **one real example from each class**

This shows both **overall trader mood** and **concrete evidence**.

In [None]:


# Clean tweets
# Keep only rows whose text is a non-blank string; copy so the new columns
# added below do not write into a view of df_btc.
df_btc_clean = df_btc[df_btc['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)].copy()
print(f"Clean BTC tweets: {len(df_btc_clean)}")

# Reuse the tokenizer the best model was trained with. Newer transformers
# releases expose it as `processing_class` and deprecate `Trainer.tokenizer`;
# older releases only have `.tokenizer`, so try the new attribute first.
tokenizer = getattr(best_trainer, 'processing_class', None)
if tokenizer is None:
    tokenizer = best_trainer.tokenizer

def tokenize_batch(batch):
    """Tokenize a batch of tweets with the same settings used for training."""
    return tokenizer(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors=None
    )

print("\nTokenizing Bitcoin tweets...")
btc_ds = Dataset.from_pandas(df_btc_clean[['text']]).map(tokenize_batch, batched=True)
btc_ds.set_format('torch', columns=['input_ids', 'attention_mask'])

# Predict
print(f"Predicting with BEST MODEL: {best_name}...")
preds = best_trainer.predict(btc_ds)
# Class ids follow the training labels: 0 = positive, 1 = negative, 2 = neutral.
df_btc_clean['pred'] = preds.predictions.argmax(-1)
df_btc_clean['sentiment'] = df_btc_clean['pred'].map({0: 'Positive', 1: 'Negative', 2: 'Neutral'})
# Signed score used for the daily net-sentiment chart.
df_btc_clean['score'] = df_btc_clean['pred'].map({0: 1, 1: -1, 2: 0})

# TOTAL COUNTS
total = len(df_btc_clean)
counts = df_btc_clean['sentiment'].value_counts()
print(f"\nTOTAL CLASSIFICATION RESULTS ({best_name}):")
for label in ['Positive', 'Negative', 'Neutral']:
    n = counts.get(label, 0)
    pct = (n / total) * 100
    print(f"  {label:8}: {n:4} tweets ({pct:5.1f}%)")

# 1 SAMPLE FROM EACH CLASS
print(f"\nBALANCED SAMPLE (1 of each sentiment):")
samples = []
for label in ['Positive', 'Negative', 'Neutral']:
    candidates = df_btc_clean[df_btc_clean['sentiment'] == label]
    # The model may never predict a given class; sampling from an empty frame
    # raises, so report the gap and move on.
    if candidates.empty:
        print(f"  (no tweets were classified as {label})")
        continue
    row = candidates.sample(1, random_state=42).iloc[0]
    samples.append({
        'date': row['date'].strftime('%Y-%m-%d %H:%M:%S'),
        'text': row['text'],
        'sentiment': row['sentiment']
    })

# Explicit columns keep the table well-formed even if no class produced a sample.
samples_df = pd.DataFrame(samples, columns=['date', 'text', 'sentiment'])
print(samples_df.to_string(index=False))

# Visualize results with interactive charts
We build a two-panel dashboard using Plotly:
1. **Pie Chart** – % of Positive, Negative, Neutral tweets
2. **Line Chart** – Daily net sentiment over time: the mean of the per-tweet scores for each day  
   (Positive = +1, Negative = -1, Neutral = 0)



In [None]:
# Share of each sentiment class in percent. value_counts() orders the classes
# by frequency, so the order of `dist.index` changes with the data.
dist = df_btc_clean['sentiment'].value_counts(normalize=True) * 100

# Mean score per calendar day (+1 positive, -1 negative, 0 neutral).
df_btc_clean['day'] = df_btc_clean['date'].dt.date
daily = df_btc_clean.groupby('day')['score'].mean().reset_index()
daily['day'] = pd.to_datetime(daily['day'])

# Bind each colour to its sentiment by name. A positional colour list would
# follow the frequency order above and could, for example, paint the Neutral
# slice green.
sentiment_colors = {'Positive': '#2ca02c', 'Negative': '#d62728', 'Neutral': '#1f77b4'}
slice_colors = [sentiment_colors.get(label, '#7f7f7f') for label in dist.index]

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(f'Sentiment % ({best_name})', 'Daily Net Sentiment'),
    specs=[[{"type":"pie"}, {"type":"scatter"}]]
)
fig.add_trace(go.Pie(labels=dist.index, values=dist.values,
                     marker_colors=slice_colors, hole=0.4), row=1, col=1)
fig.add_trace(go.Scatter(x=daily['day'], y=daily['score'],
                         mode='lines+markers', line=dict(color='purple')), row=1, col=2)
fig.update_layout(title_text=f"Bitcoin Trader Sentiment – Best Model: {best_name}", height=500)
fig.show()