<a href="https://colab.research.google.com/github/anms5519/3D-Virtual-Tour/blob/main/HealthSentimentProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependencies

In [None]:
!pip install -q --upgrade transformers datasets evaluate gradio accelerate scikit-learn tensorboard matplotlib seaborn

# Mounting Google Drive

In [None]:
# Attach Google Drive under /content/drive. force_remount=True asks Colab to
# mount again even when the path is already mounted from an earlier run.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.2/54.2 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.1/323.1 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Defining the project folder in my personal Google Drive

In [None]:
import os

# Root folder on the mounted Drive; the dataset, checkpoints, logs and reports
# used by the cells below are all resolved relative to it.
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/HealthSentimentProject"

# exist_ok=True makes this cell safe to re-run once the folder exists.
os.makedirs(name=DRIVE_PROJECT_PATH, exist_ok=True)

# Importing libraries and defining globals

In [None]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    pipeline
)
import evaluate
import gradio as gr

# Loading my dataset from Google Drive

In [None]:
# Read the raw reviews CSV from the Drive project folder into a DataFrame
# and report how many rows were loaded.
csv_path = os.path.join(DRIVE_PROJECT_PATH, "DrugReviews.csv")
df = pd.read_csv(filepath_or_buffer=csv_path)
print(
    f"Loaded {len(df)} rows from {csv_path}"
)

Loaded 392510 rows from /content/drive/MyDrive/HealthSentimentProject/DrugReviews.csv


# Feature Engineering & Labeling

# Parsing dates/times

In [None]:
# Review date -> Year / Month features.
# errors='coerce' turns unparseable values into NaT; those rows get the
# sentinel value 0 in the derived integer columns.
review_ts = pd.to_datetime(df['ReviewDate'], errors='coerce')
df['ReviewDate'] = review_ts
df['Year'] = review_ts.dt.year.fillna(0).astype(int)
df['Month'] = review_ts.dt.month.fillna(0).astype(int)

# Intake time is parsed strictly as HH:MM; anything else becomes NaT and
# therefore Hour 0.
intake_ts = pd.to_datetime(df['IntakeTime'], format="%H:%M", errors='coerce')
df['IntakeTime'] = intake_ts
df['Hour'] = intake_ts.dt.hour.fillna(0).astype(int)

  df['ReviewDate']  = pd.to_datetime(df['ReviewDate'], errors='coerce')


Saved cleaned data to /content/drive/MyDrive/HealthSentimentProject/clean_drugreviews.csv


# Binarize ratings...........

In [None]:
# Turn the numeric rating into a binary sentiment label:
#   Rating >= 7 -> 1 (positive)
#   Rating <= 4 -> 0 (negative)
# Rows with a missing rating, or a rating strictly between 4 and 7, are dropped.
POSITIVE_MIN_RATING = 7
NEGATIVE_MAX_RATING = 4

is_positive = df["Rating"] >= POSITIVE_MIN_RATING
is_negative = df["Rating"] <= NEGATIVE_MAX_RATING

# Comparisons against NaN evaluate to False, so unrated rows fall out of the
# mask as well. reset_index() returns a new frame, which means the column
# assignment below does not write into a slice of the original DataFrame
# (this avoids pandas' SettingWithCopyWarning).
df = df.loc[is_positive | is_negative].reset_index(drop=True)

# Vectorised labelling: every remaining row is either positive or negative.
df["label"] = (df["Rating"] >= POSITIVE_MIN_RATING).astype(int)

# Save cleaned subset to Drive

In [None]:
# Write the labelled DataFrame to the Drive project folder, omitting the
# pandas index column.
clean_path = os.path.join(DRIVE_PROJECT_PATH, "clean_drugreviews.csv")
df.to_csv(path_or_buf=clean_path, index=False)
print(
    f"Saved cleaned data to {clean_path}"
)

# Stratified Train/Val/Test Split...........

In [None]:
# Two-stage stratified split: 80% train, then the remaining 20% is halved
# into validation and test. Both stages stratify on the label and use the
# same fixed random_state.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42,
)
print(f"Splits: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")

Splits: train=286365, val=35796, test=35796


# Preparing HuggingFace Dataset & Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_fn(batch):
    """Tokenize the "Reviews" entries of a batch.

    Every review is truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call produces for the batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["Reviews"], **encode_options)
def to_hf_dataset(frame):
    """Convert one split DataFrame into a tokenized Hugging Face Dataset.

    Only the "Reviews" and "label" columns are kept. preserve_index=False
    stops the (shuffled, non-contiguous) pandas index of the split from being
    carried into the Dataset as an extra column.
    """
    dataset = Dataset.from_pandas(frame[["Reviews", "label"]], preserve_index=False)
    return dataset.map(tokenize_fn, batched=True)


# One helper call per split instead of three copies of the same expression.
hf_train = to_hf_dataset(train_df)
hf_val = to_hf_dataset(val_df)
hf_test = to_hf_dataset(test_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/286365 [00:00<?, ? examples/s]

Map:   0%|          | 0/35796 [00:00<?, ? examples/s]

Map:   0%|          | 0/35796 [00:00<?, ? examples/s]

# Metrics & Model Setup............

In [None]:
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    is the argmax over the last axis of the logits. Returns a dict with the
    keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    accuracy_result = metric_acc.compute(predictions=predicted, references=labels)
    f1_result = metric_f1.compute(predictions=predicted, references=labels)
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }
# Load the pretrained DistilBERT checkpoint with a two-class
# sequence-classification head (one output per sentiment label).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# TrainingArguments with TensorBoard & Drive Output............


In [None]:
# Checkpoints and TensorBoard event files both live in the Drive project folder.
ckpt_dir = os.path.join(DRIVE_PROJECT_PATH, "model_ckpt")
tb_log_dir = os.path.join(DRIVE_PROJECT_PATH, "logs")

training_args = TrainingArguments(
    # --- where things are written ---
    output_dir=ckpt_dir,
    logging_dir=tb_log_dir,
    report_to="tensorboard",
    run_name="HealthSentimentDistilBERT",
    # --- evaluation / checkpoint cadence ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # --- logging cadence ---
    logging_strategy="steps",
    logging_steps=200,
    # --- optimisation ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # 16 x 2 = 32 samples per optimizer step per device
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
)

# Trainer & Early Stopping...........

In [None]:
# Wire the model, the train/validation splits and the metric function into a
# Trainer.
# The tokenizer is passed via `processing_class`: the older `tokenizer=`
# keyword is deprecated in recent transformers releases and emits a
# FutureWarning when used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    # Early stopping is configured with a patience of 2.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


# Train & Log to TensorBoard.................

In [None]:
# Run fine-tuning with the Trainer configured above, then point the reader
# at the TensorBoard log directory.
trainer.train()
print("✓ Training complete. Check TensorBoard in:", tb_log_dir)

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2767,0.248715,0.894737,0.917067
2,0.1831,0.217188,0.914711,0.932716
3,0.138,0.233682,0.924852,0.940926
4,0.1061,0.251366,0.929713,0.944166


✓ Training complete. Check TensorBoard in: /content/drive/MyDrive/HealthSentimentProject/logs


# Evaluation on Test Set & Save Reports............

In [None]:
# Score the trained model on the held-out test split and print the resulting
# metrics dict.
metrics = trainer.evaluate(hf_test)
print("Test Metrics:", metrics)

Test Metrics: {'eval_loss': 0.24872921407222748, 'eval_accuracy': 0.9304112191306291, 'eval_f1': 0.944773306728744, 'eval_runtime': 114.5182, 'eval_samples_per_second': 312.579, 'eval_steps_per_second': 9.771, 'epoch': 4.0}
Saved classification report and confusion matrix at /content/drive/MyDrive/HealthSentimentProject


# Classification report & confusion matrix...................

In [None]:
# Gold labels come from the test DataFrame; predicted labels are the argmax
# over the last axis of the model outputs for the same split.
test_output = trainer.predict(hf_test)
y_true = test_df["label"].tolist()
y_pred = test_output.predictions.argmax(axis=-1).tolist()

# Save classification report..............

In [None]:
# Build the per-class precision/recall/F1 text report and store it in the
# Drive project folder.
report_path = os.path.join(DRIVE_PROJECT_PATH, "classification_report.txt")
clf_report = classification_report(
    y_true,
    y_pred,
    target_names=["Neg","Pos"],
)
with open(report_path, "w") as f:
    f.write(clf_report)

# Plot & save confusion matrix.................

In [None]:
# Render the confusion matrix as an annotated heatmap and save it as a PNG
# in the Drive project folder. The figure is closed after saving.
cm = confusion_matrix(y_true, y_pred)
class_names = ["Neg","Pos"]

fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")

fig_path = os.path.join(DRIVE_PROJECT_PATH, "confusion_matrix.png")
fig.savefig(fig_path, bbox_inches="tight")
plt.close(fig)
print(f"Saved classification report and confusion matrix at {DRIVE_PROJECT_PATH}")

# Export Final Model to Drive...........

In [None]:
# Folder inside the Drive project directory that receives the exported model
# and tokenizer, and from which the inference pipeline later loads them.
best_model_path = os.path.join(DRIVE_PROJECT_PATH, "best_model")

In [None]:
# Save the model first, then the tokenizer, into the same folder so that the
# folder can be handed to pipeline() as both `model` and `tokenizer`.
for artifact in (model, tokenizer):
    artifact.save_pretrained(best_model_path)
print(f"Best model & tokenizer saved to {best_model_path}")

# Gradio Interface (live in Colab)..........

In [None]:
# Device index handed to pipeline(): 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Inference pipeline loaded from the exported model/tokenizer folder.
sent_pipe = pipeline(
    task="sentiment-analysis",
    model=best_model_path,
    tokenizer=best_model_path,
    device=device,
)
def predict_sentiment(review: str):
    """Classify one review and return ``{display_label: score}``.

    The review is truncated to 128 tokens, the same length used when the
    training data was tokenized. Without truncation, a review longer than the
    model's maximum sequence length makes the pipeline call fail.

    The pipeline's raw label ("LABEL_1" or "POSITIVE") is mapped to
    "Positive"; any other label is reported as "Negative".
    """
    out = sent_pipe(review, truncation=True, max_length=128)[0]
    label = "Positive" if out["label"] in ["LABEL_1","POSITIVE"] else "Negative"
    return {label: float(out["score"])}
# UI components and example inputs for the demo.
review_box = gr.Textbox(lines=4, label="Enter a drug review…")
sentiment_label = gr.Label(num_top_classes=2, label="Sentiment")
example_reviews = [
    ["This medicine eased my pain within hours."],
    ["Worst side effects ever; felt sick all day."],
]

iface = gr.Interface(
    fn=predict_sentiment,
    inputs=review_box,
    outputs=sentiment_label,
    title="Health Review Sentiment Analyzer",
    description="Fine-tuned DistilBERT with TensorBoard logging and Drive persistence.",
    examples=example_reviews,
)

# NOTE(review): share=True publishes the app on a public gradio.live URL;
# anyone with the link can send text to the model while the session is alive.
iface.launch(share=True)

Device set to use cpu


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://16abb1ccc0fa7a270f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


