In [None]:
!pip install -q pandas scikit-learn transformers textblob

In [None]:
import pandas as pd
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
import os
import json

# 1. Load the dataset
# Read the tweet export with the python parsing engine; lines flagged as bad
# are skipped (on_bad_lines='skip') instead of aborting the load.
csv_path = "crypto_10k_tweets_(2021_2022Nov).csv"
df = pd.read_csv(
    csv_path,
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
)
# Heading and the first rows, each on its own line.
print("Sample data:", df.head(), sep="\n")

# 2. Auto-label sentiment using TextBlob
def get_sentiment(text, threshold=0.1):
    """Label ``text`` as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    Args:
        text: Value to score. It is converted with ``str()`` before being
            handed to TextBlob, so non-string inputs are accepted.
        threshold: Polarity must be strictly greater than ``threshold`` for
            'Positive' and strictly less than ``-threshold`` for 'Negative'.
            Defaults to 0.1.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'. 'Neutral' is
        also returned when scoring raises an ordinary exception.
    """
    try:
        polarity = TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best-effort labelling: a tweet that cannot be scored is treated as
        # neutral. Catching Exception (not a bare `except:`) lets
        # KeyboardInterrupt / SystemExit propagate, so a long `.apply` over the
        # dataset can still be interrupted.
        return 'Neutral'

    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

# 3. Clean data
# Cleaning runs before labelling so that TextBlob is never applied to rows that
# are about to be discarded, and so the preview below only shows tweets that
# actually reach training. The isinstance check also rejects NaN/None, which
# makes a separate notnull() pass unnecessary.
is_usable_text = df['Content'].apply(lambda x: isinstance(x, str) and x.strip() != '')
df = df[is_usable_text].reset_index(drop=True)

# Label the cleaned tweets with get_sentiment.
df['Sentiment'] = df['Content'].apply(get_sentiment)
print(df[['Content', 'Sentiment']].head())

# 4. Map sentiment to labels
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df['label'] = df['Sentiment'].map(label_map)
# .map() produces NaN for any sentiment string that is missing from label_map;
# fail here with a clear message instead of later inside the split or the loss.
assert df['label'].notna().all(), "found a Sentiment value that is not in label_map"

# 5. Train/test split
# Stratified on the label so both splits keep the same class proportions.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Content'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42, stratify=df['label']
)

# 6. Tokenization
# Checkpoint identifier. It is used for the tokenizer here and again when the
# classification model is loaded, so both come from the same checkpoint.
model_name = "distilbert-base-uncased"
# Module-level tokenizer; TweetDataset.__init__ reads this global directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)

class TweetDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenized tweet texts with their integer labels.

    All texts are tokenized once in the constructor using the module-level
    ``tokenizer`` (truncation and padding enabled, ``max_length=128``). Items
    are returned as dicts of tensors with an extra ``'labels'`` entry.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` up front and keep ``labels`` alongside them."""
        self.encodings = tokenizer(
            texts,
            max_length=128,
            padding=True,
            truncation=True,
        )
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors including ``'labels'``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split in a TweetDataset; tokenization happens inside the constructor.
train_dataset = TweetDataset(texts=train_texts, labels=train_labels)
test_dataset = TweetDataset(texts=test_texts, labels=test_labels)

# 7. Model
# Sequence-classification model loaded from the same checkpoint as the
# tokenizer, configured for three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)


Sample data:
  Unnamed: 0                       Date         Username  \
0          0  2022-11-30 11:53:21+00:00   0xEthereumYoda   
1          1  2022-11-30 11:53:21+00:00  Lawrenc32984128   
2          2  2022-11-30 11:53:21+00:00  NITESHP55784410   
3          3  2022-11-30 11:53:20+00:00          817coin   
4          4  2022-11-30 11:53:18+00:00        slamtoken   

                                             Content  \
0  #Ethereum price update: \n\n#ETH $1269.23 USD\...   
1  @mtmalinen @ecb Do well to understand that eve...   
2  Kayla #Ethereum Harold #世界杯 Egbert #百家乐 Lavern...   
3  #Bitcoin https://t.co/2koLlCvCri https://t.co/...   
4  Yesterday we made a 3,000 SLAM buyback and loc...   

                                                 URL  \
0  https://twitter.com/0xEthereumYoda/status/1597...   
1  https://twitter.com/Lawrenc32984128/status/159...   
2  https://twitter.com/NITESHP55784410/status/159...   
3  https://twitter.com/817coin/status/15979217174...   
4  https:

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.7572
100,0.741
150,0.532
200,0.5432
250,0.5225
300,0.4627
350,0.4269
400,0.4052
450,0.4038
500,0.4318


Evaluation Results: {'eval_loss': 0.3063485026359558, 'eval_accuracy': 0.9175, 'eval_f1': 0.9171481521318946, 'eval_runtime': 6.8059, 'eval_samples_per_second': 293.862, 'eval_steps_per_second': 36.733, 'epoch': 2.0}
{
  "Negative": {
    "precision": 0.7096774193548387,
    "recall": 0.6534653465346535,
    "f1-score": 0.6804123711340206,
    "support": 101.0
  },
  "Neutral": {
    "precision": 0.9468390804597702,
    "recall": 0.9441260744985673,
    "f1-score": 0.945480631276901,
    "support": 1396.0
  },
  "Positive": {
    "precision": 0.8757281553398059,
    "recall": 0.8966202783300199,
    "f1-score": 0.8860510805500982,
    "support": 503.0
  },
  "accuracy": 0.9175,
  "macro avg": {
    "precision": 0.8440815517181383,
    "recall": 0.8314038997877469,
    "f1-score": 0.83731469432034,
    "support": 2000.0
  },
  "weighted avg": {
    "precision": 0.9169780189063002,
    "recall": 0.9175,
    "f1-score": 0.9171481521318946,
    "support": 2000.0
  }
}
Confusion Matrix:
 [[

In [None]:

# 8. Training setup
# Only the options that are in effect are listed. Earlier drafts of this cell
# also tried per-epoch evaluation/checkpointing and load_best_model_at_end;
# those variants are not used.
training_args = TrainingArguments(
    # output locations
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=1,
    # schedule
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # reporting
    logging_steps=50,
    report_to=[],  # disables wandb/tensorboard
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'`` (weighted average).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(gold, predicted)
    metrics['f1'] = f1_score(gold, predicted, average='weighted')
    return metrics

# Start every run of this cell from the pretrained checkpoint. Trainer updates
# `model` in place, so without this reload a second execution of the cell keeps
# fine-tuning already fine-tuned weights and reports metrics that a fresh
# "Restart & Run All" cannot reproduce.
# id2label/label2id store the class names in the model config, so the saved
# model reports 'Negative'/'Neutral'/'Positive' instead of generic label ids.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label={idx: name for name, idx in label_map.items()},
    label2id=label_map,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# 9. Train
trainer.train()

# 10. Evaluate
results = trainer.evaluate()
print("Evaluation Results:", results)

# 11. Detailed stats
# Class names ordered by their integer id so they line up with the report rows.
class_names = sorted(label_map, key=label_map.get)
preds = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
report = classification_report(test_labels, preds, target_names=class_names, output_dict=True)
# Computed once and reused for both the printout and the saved .npy file.
conf_matrix = confusion_matrix(test_labels, preds)
print(json.dumps(report, indent=2))
print("Confusion Matrix:\n", conf_matrix)

# 12. Save model and metrics
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_model")
with open("classification_report.json", "w") as f:
    json.dump(report, f, indent=2)
np.save("confusion_matrix.npy", conf_matrix)
print("Model and metrics saved in the ML folder.")

Step,Training Loss
50,0.2837
100,0.2647
150,0.2198
200,0.1936
250,0.2376
300,0.1511
350,0.1629
400,0.1775
450,0.1594
500,0.2112


Evaluation Results: {'eval_loss': 0.336852490901947, 'eval_accuracy': 0.932, 'eval_f1': 0.9313223399221568, 'eval_runtime': 6.8485, 'eval_samples_per_second': 292.033, 'eval_steps_per_second': 36.504, 'epoch': 2.0}
{
  "Negative": {
    "precision": 0.7816091954022989,
    "recall": 0.6732673267326733,
    "f1-score": 0.723404255319149,
    "support": 101.0
  },
  "Neutral": {
    "precision": 0.954253037884203,
    "recall": 0.9563037249283668,
    "f1-score": 0.9552772808586762,
    "support": 1396.0
  },
  "Positive": {
    "precision": 0.896887159533074,
    "recall": 0.9165009940357853,
    "f1-score": 0.9065880039331367,
    "support": 503.0
  },
  "accuracy": 0.932,
  "macro avg": {
    "precision": 0.8775831309398586,
    "recall": 0.8486906818989418,
    "f1-score": 0.8617565133703207,
    "support": 2000.0
  },
  "weighted avg": {
    "precision": 0.9311070054335577,
    "recall": 0.932,
    "f1-score": 0.9313223399221568,
    "support": 2000.0
  }
}
Confusion Matrix:
 [[  68