In [None]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the labelled comments and make sure both required columns are present.
df = pd.read_csv("/content/git_final_vadar.csv")

required_columns = ("cleaned_comments", "sentiment_category")
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise ValueError("Dataset must contain 'cleaned_comments' and 'sentiment_category' columns.")

# Rows missing either the text or its label cannot be used for training.
df = df.dropna(subset=list(required_columns))
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Features are the comment texts; targets are the integer-encoded categories.
X = df["cleaned_comments"].tolist()
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["sentiment_category"])

# 80/20 split with a fixed seed, stratified on the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Both splits are tokenized with the same settings: padded, truncated to at
# most 128 tokens, and returned as TensorFlow tensors.
tokenizer_kwargs = {
    "padding": True,
    "truncation": True,
    "max_length": 128,
    "return_tensors": "tf",
}
X_train_encoded = tokenizer(X_train, **tokenizer_kwargs)
X_test_encoded = tokenizer(X_test, **tokenizer_kwargs)

# Pretrained BERT with a classification head sized to the number of distinct
# encoded labels.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(y)))

batch_size = 16
epochs = 10

# Keras also runs the final, partially filled batch of every epoch, so the
# number of optimizer steps per epoch is the ceiling of
# len(X_train) / batch_size. Floor division would make the schedule shorter
# than the actual training run, leaving the last steps at the schedule's
# end learning rate.
steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# The first 10% of all steps are used for learning-rate warmup.
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    weight_decay_rate=0.01,
)

# The loss is configured with from_logits=True, i.e. it expects the model to
# output unnormalized scores and integer class ids as targets.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])

# Inputs are passed positionally as [input_ids, attention_mask].
train_inputs = [X_train_encoded["input_ids"], X_train_encoded["attention_mask"]]
test_inputs = [X_test_encoded["input_ids"], X_test_encoded["attention_mask"]]

# NOTE(review): the same split is used for validation during training and for
# the final evaluation below, so the printed accuracy is not measured on data
# unseen during model monitoring. Consider a separate validation split.
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=epochs,
    batch_size=batch_size
)

loss, accuracy = model.evaluate(test_inputs, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

def plot_training_history(history):
    """Plot per-epoch accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch sequences of values (e.g. the return value of
            ``model.fit``). It must contain ``"accuracy"`` and ``"loss"``.
            ``"val_accuracy"`` and ``"val_loss"`` are plotted when present,
            so a run trained without validation data can still be plotted.

    Raises:
        KeyError: If ``"accuracy"`` or ``"loss"`` is missing.
    """
    # (history key, human-readable label) for each panel, left to right.
    panels = (("accuracy", "Accuracy"), ("loss", "Loss"))
    metrics = history.history

    fig, axs = plt.subplots(1, len(panels), figsize=(12, 5))

    for ax, (key, label) in zip(axs, panels):
        ax.plot(metrics[key], label=f"Train {label}", marker="o")

        # Validation curves only exist when validation data was supplied.
        val_key = f"val_{key}"
        if val_key in metrics:
            ax.plot(metrics[val_key], label=f"Validation {label}", marker="o")

        ax.set_title(f"Model {label}")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(label)
        ax.legend()
        ax.grid()

    plt.show()

# Show the accuracy/loss curves recorded during training.
plot_training_history(history)

# Share of each sentiment category in the (NaN-filtered) dataset, in percent.
sentiment_counts = df["sentiment_category"].value_counts(normalize=True).mul(100)

# Bar chart of the class distribution. Colors are assigned by bar position,
# i.e. in the order value_counts returns the categories.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(
    sentiment_counts.index,
    sentiment_counts.values,
    color=["red", "blue", "green"],
)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Percentage")
ax.set_title("Sentiment Distribution in Dataset")
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
