<a href="https://colab.research.google.com/github/amritanshu009/DSAexperiments/blob/main/ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets



In [None]:
# Colab-only: open the browser file picker so the dataset CSV can be uploaded to the runtime.
from google.colab import files
# Keep whatever files.upload() returns for the selected file(s) in `uploaded`.
uploaded = files.upload()

Saving Twitter_Data.csv.csv to Twitter_Data.csv (2).csv


In [None]:
# ✅ Imports for the whole pipeline, kept in one place so a fresh kernel can Run All
import io
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

# ✅ Disable WandB logs in Colab
os.environ["WANDB_DISABLED"] = "true"

# ✅ Load and clean dataset
# files.upload() stores a re-uploaded file under a new name (e.g. "Twitter_Data.csv (2).csv"),
# so a fixed path can silently point at a stale copy. Prefer the bytes uploaded in this
# session and fall back to the fixed Colab path only when nothing was uploaded.
DATA_PATH = "/content/Twitter_Data.csv.csv"
if uploaded:
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    df = pd.read_csv(DATA_PATH)
print("Available columns:", df.columns.tolist())

# Adjust the source column names below to match your dataset
# (e.g. the label column may be called 'category', 'sentiment' or 'label').
# Keep only the text/label columns, drop incomplete rows, and rename both columns once
# to the names the rest of the notebook uses.
df = (
    df[['text', 'sentiment']]
    .dropna()
    .rename(columns={'text': 'clean_comment', 'sentiment': 'category'})
)

# ✅ Train/test split for classic ML (20% of the rows held out, fixed seed)
comment_texts = df['clean_comment']
comment_labels = df['category']
X_train, X_test, y_train, y_test = train_test_split(
    comment_texts,
    comment_labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, vocabulary capped at 6000 terms.
# The vocabulary is fitted on the training texts only and then applied to the test texts.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=6000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ✅ Logistic Regression on the TF-IDF features
# fit() returns the estimator itself, so construction and training are chained.
logistic_model = LogisticRegression(
    solver='liblinear',
    C=5.0,
    max_iter=1000,
).fit(X_train_vec, y_train)
y_pred = logistic_model.predict(X_test_vec)
print(
    "\nLogistic Regression Performance:",
    classification_report(y_test, y_pred),
    sep="\n",
)

# ✅ Random Forest Classifier on the same TF-IDF features
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=50,
).fit(X_train_vec, y_train)
y_pred_rf = rf_model.predict(X_test_vec)
print(
    "\nRandom Forest Performance:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

# ✅ Inference using Logistic Regression
def predict_sentiment_logistic(text):
    """Return the logistic-regression prediction for a single raw text.

    The text is wrapped in a one-element list, transformed with the fitted
    ``vectorizer``, and the single predicted label is unwrapped and returned.
    """
    features = vectorizer.transform([text])
    predictions = logistic_model.predict(features)
    return predictions[0]

demo_text = "Buddhism is very peaceful and calming"
print("Predicted Sentiment (Logistic):", predict_sentiment_logistic(demo_text))

# ✅ Label Mapping for RoBERTa
# The model is built with num_labels=3, so the -1/0/1 categories are shifted to class ids 0/1/2.
label_map = {-1: 0, 0: 1, 1: 2}
# Derived from label_map so the two directions can never drift apart.
reverse_map = {class_id: category for category, class_id in label_map.items()}
df['label'] = df['category'].map(label_map)

# Series.map leaves NaN for any category missing from label_map; fail here with a clear
# message instead of feeding NaN labels into training.
unmapped_categories = df.loc[df['label'].isna(), 'category'].unique()
assert len(unmapped_categories) == 0, (
    f"Categories missing from label_map: {unmapped_categories.tolist()}"
)
df['label'] = df['label'].astype(int)

# ✅ Convert to HuggingFace Dataset
# preserve_index=False keeps the pandas index (non-contiguous after dropna) out of the dataset.
hf_dataset = Dataset.from_pandas(df[['clean_comment', 'label']], preserve_index=False)
# Fixed seed so every run trains and evaluates on the same rows.
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

# ✅ Tokenization for RoBERTa
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize(example):
    """Tokenize the ``clean_comment`` field of a batch.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["clean_comment"], **tokenizer_options)

# batched=True hands `tokenize` a batch of rows per call rather than one row.
tokenized_dataset = hf_dataset.map(tokenize, batched=True)

# ✅ Load RoBERTa model and set device
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Three output classes, matching the 0/1/2 ids produced by label_map.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
model.to(device)

# ✅ Metrics for Trainer
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and weighted precision/recall/F1.

    The predicted class is the argmax over axis 1 of the logits. Precision,
    recall and F1 use weighted averaging, with ``zero_division=0``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    weighted = precision_recall_fscore_support(
        references, predicted, average='weighted', zero_division=0
    )
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": weighted[2],
        "precision": weighted[0],
        "recall": weighted[1],
    }

# ✅ Training Arguments
# With ~1300 training rows, batch size 16 and 6 epochs, the run has only about 490
# optimizer steps. The previous warmup_steps=500 therefore kept the learning rate in
# warmup for the entire run (it never reached 2e-5), and the default logging interval
# was never hit, so no training loss was reported. Both are now sized to the run:
# 50 steps is roughly 10% of training.
training_args = TrainingArguments(
    output_dir="./roberta_results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    learning_rate=2e-5,
    warmup_steps=50,
    logging_steps=50,
    logging_dir="./logs",
    # Supported replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# ✅ Trainer Setup
# Wire together the model, the training arguments, the tokenized train/test splits
# and the metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],  # split later scored by trainer.evaluate()
    compute_metrics=compute_metrics
)

# ✅ Train the RoBERTa model
trainer.train()

# ✅ Evaluate the RoBERTa model
metrics = trainer.evaluate()
print("📊 RoBERTa Evaluation Metrics:")

# Keep only the entries whose key mentions one of the quality scores.
score_tags = ('accuracy', 'f1', 'precision', 'recall')
roberta_results = {
    name: value
    for name, value in metrics.items()
    if any(tag in name for tag in score_tags)
}
for name, value in roberta_results.items():
    print(f"{name.capitalize()}: {value:.4f}")

# ✅ Summary comparison table
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def _weighted_summary(y_true, y_hat):
    """Return ``(accuracy, precision, recall, f1)`` using weighted averaging."""
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_hat, average='weighted', zero_division=0
    )
    return accuracy_score(y_true, y_hat), precision, recall, f1

# Same four scores for both classic models, computed on the shared held-out split.
log_acc, log_prec, log_rec, log_f1 = _weighted_summary(y_test, y_pred)
rf_acc, rf_prec, rf_rec, rf_f1 = _weighted_summary(y_test, y_pred_rf)

print("📋 Accuracy Comparison (Weighted Avg):")
print(f"Logistic Regression -> Accuracy: {log_acc:.4f}, Precision: {log_prec:.4f}, Recall: {log_rec:.4f}, F1: {log_f1:.4f}")
print(f"Random Forest       -> Accuracy: {rf_acc:.4f}, Precision: {rf_prec:.4f}, Recall: {rf_rec:.4f}, F1: {rf_f1:.4f}")
print(f"RoBERTa             -> Accuracy: {roberta_results['eval_accuracy']:.4f}, Precision: {roberta_results['eval_precision']:.4f}, Recall: {roberta_results['eval_recall']:.4f}, F1: {roberta_results['eval_f1']:.4f}")

# ✅ RoBERTa Inference Function
def predict_sentiment_roberta(text):
    """Predict the sentiment category of ``text`` with the RoBERTa model.

    The text is tokenized (truncated to at most 128 tokens), moved to ``device``,
    and run through the model in eval mode without gradient tracking. The argmax
    class id is translated back to the original category through ``reverse_map``.
    """
    model.eval()
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**on_device).logits
    class_id = torch.argmax(logits, dim=1).item()
    return reverse_map[class_id]

# ✅ Test RoBERTa Prediction
sample = "I love the peaceful teachings of Buddhism."
roberta_prediction = predict_sentiment_roberta(sample)
print("Predicted Sentiment (RoBERTa):", roberta_prediction)


Available columns: ['text', 'sentiment']

Logistic Regression Performance:
              precision    recall  f1-score   support

          -1       0.85      0.89      0.87       137
           0       0.85      0.89      0.87       165
           1       0.55      0.25      0.34        24

    accuracy                           0.84       326
   macro avg       0.75      0.68      0.70       326
weighted avg       0.83      0.84      0.83       326


Random Forest Performance:
              precision    recall  f1-score   support

          -1       0.80      0.88      0.84       137
           0       0.85      0.86      0.86       165
           1       0.57      0.17      0.26        24

    accuracy                           0.82       326
   macro avg       0.74      0.64      0.65       326
weighted avg       0.81      0.82      0.80       326

Predicted Sentiment (Logistic): -1


Map:   0%|          | 0/1304 [00:00<?, ? examples/s]

Map:   0%|          | 0/326 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


📊 RoBERTa Evaluation Metrics:
Eval_accuracy: 0.8957
Eval_f1: 0.8956
Eval_precision: 0.8961
Eval_recall: 0.8957
📋 Accuracy Comparison (Weighted Avg):
Logistic Regression -> Accuracy: 0.8436, Precision: 0.8313, Recall: 0.8436, F1: 0.8330
Random Forest       -> Accuracy: 0.8190, Precision: 0.8070, Recall: 0.8190, F1: 0.8039
RoBERTa             -> Accuracy: 0.8957, Precision: 0.8961, Recall: 0.8957, F1: 0.8956
Predicted Sentiment (RoBERTa): 1
