In [2]:
import pandas as pd
import re
import nltk
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import time

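# One-time setup: uncomment on first run to fetch the NLTK data used below
# (tokenizer models for word_tokenize and the stopwords corpus).
# nltk.download('punkt')
# nltk.download('stopwords')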

# Stylesheet for the notebook UI: header/subheader banners plus the
# positive/negative result classes.
css_style = """
<style>
.sentiment-header {
    color: yellow;
    font-family: 'Segoe UI';
    font-size: 24px;
    font-weight: bold;
    text-align: center;
}
.sentiment-subheader {
    color: gray;
    text-align: center;
    margin-bottom: 30px;
}
.result-positive { color: #27ae60; font-weight: bold; }
.result-negative { color: #e74c3c; font-weight: bold; }
</style>
"""

# Render the stylesheet first, then the title and subtitle banners, in order.
for _html_fragment in (
    css_style,
    '<div class="sentiment-header">Urdu Sentiment Analysis</div>',
    '<div class="sentiment-subheader">Using Support Vector Machine (SVM)</div>',
):
    display(HTML(_html_fragment))

# --- Widgets -----------------------------------------------------------------
# Output area for status/progress messages printed by the click handlers.
output = widgets.Output()

# Free-text box where the user types the Urdu sentence to classify.
text_input = widgets.Textarea(
    description="Urdu Text:",
    layout=widgets.Layout(width='90%', height='100px'),
)

# Action buttons; their click handlers are attached further down.
train_button = widgets.Button(description="Train Model", button_style='success')
test_button = widgets.Button(description="Analyze Sentiment", button_style='primary')
clear_button = widgets.Button(description="Clear", button_style='warning')

# Progress indicator driven manually by the handlers (0-100).
progress_bar = widgets.FloatProgress(
    min=0,
    max=100,
    description='Progress:',
    layout=widgets.Layout(width='90%'),
)

# Output area for evaluation metrics and prediction results.
result_display = widgets.Output()

# Lay the three buttons out side by side.
button_box = widgets.HBox([train_button, test_button, clear_button])

# Urdu Preprocessor
# Compiled once at import time; used on every call to urdu_preprocessor.
# NOTE(review): the class includes 'ء' (hamza), which is a letter rather than
# punctuation - confirm that stripping it is intentional.
_URDU_PUNCT_RE = re.compile(r'[۔،؛؟!٭ء]')
_WHITESPACE_RE = re.compile(r'\s+')

# Lazily populated by _get_urdu_stop_words(); None means "not loaded yet".
_urdu_stop_words_cache = None


def _get_urdu_stop_words():
    """Return the Urdu stop-word set, loading it from NLTK at most once.

    Returns an empty set when the installed stopwords corpus has no 'urdu'
    file. Errors raised by NLTK propagate to the caller and are not cached,
    so a later call retries the lookup.
    """
    global _urdu_stop_words_cache
    if _urdu_stop_words_cache is None:
        if 'urdu' in stopwords.fileids():
            _urdu_stop_words_cache = frozenset(stopwords.words('urdu'))
        else:
            _urdu_stop_words_cache = frozenset()
    return _urdu_stop_words_cache


def urdu_preprocessor(text):
    """Normalize Urdu text for vectorization.

    Replaces the listed Urdu punctuation marks with spaces, collapses runs of
    whitespace, tokenizes with NLTK and drops Urdu stop words when available.

    Args:
        text: Input value; it is converted with ``str()`` before processing.

    Returns:
        The space-joined remaining tokens. If tokenization or the stop-word
        lookup fails, the punctuation-stripped, whitespace-normalized text is
        returned instead.
    """
    text = _URDU_PUNCT_RE.sub(' ', str(text))
    text = _WHITESPACE_RE.sub(' ', text).strip()
    try:
        tokens = word_tokenize(text)
        stop_words = _get_urdu_stop_words()
        return ' '.join(t for t in tokens if t not in stop_words)
    except Exception:
        # Deliberate best-effort fallback (e.g. NLTK data not downloaded), but
        # unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        return text

# Train Model
def train_model(b):
    """Train the TF-IDF + LinearSVC sentiment model and save it to disk.

    Click handler for the "Train Model" button. Reads the Excel dataset,
    cleans the sentences with ``urdu_preprocessor``, holds out 20% of the rows
    for evaluation, fits the vectorizer and classifier, shows accuracy, the
    classification report and a confusion-matrix heatmap in
    ``result_display``, and pickles the model and vectorizer to
    ``urdu_svm_model.pkl`` / ``urdu_vectorizer.pkl`` in the working directory.

    Args:
        b: The button instance passed by ``on_click`` (unused).

    Any exception raised along the way propagates (so its traceback is still
    shown), but the progress bar is always reset to 0 afterwards.
    """
    with output:
        clear_output()
        try:
            progress_bar.value = 10
            print("Loading dataset...")
            df = pd.read_excel("/kaggle/input/dataset/Dataset_Final.xlsx")
            # Header cells may carry stray whitespace; normalize before lookup.
            df.columns = df.columns.str.strip()
            df.dropna(subset=['Sentence_text', 'Sentiment(P/N)'], inplace=True)

            progress_bar.value = 30
            df['cleaned_text'] = df['Sentence_text'].apply(urdu_preprocessor)

            # NOTE(review): test_model compares predictions against 'P', so the
            # label column presumably holds exactly 'P'/'N' - confirm there are
            # no case or whitespace variants in the sheet.
            X_train, X_test, y_train, y_test = train_test_split(
                df['cleaned_text'], df['Sentiment(P/N)'], test_size=0.2, random_state=42)

            progress_bar.value = 50
            # Unigrams and bigrams; drop terms in >90% of documents or in fewer than 5.
            vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)
            X_train_vec = vectorizer.fit_transform(X_train)
            X_test_vec = vectorizer.transform(X_test)

            progress_bar.value = 70
            model = LinearSVC()
            model.fit(X_train_vec, y_train)

            y_pred = model.predict(X_test_vec)
            acc = accuracy_score(y_test, y_pred)

            with result_display:
                clear_output()
                print(f"Model Accuracy: {acc:.2%}\n")
                print(classification_report(y_test, y_pred))
                sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
                plt.show()

            # Persist both artifacts; test_model needs the fitted vectorizer too.
            with open("urdu_svm_model.pkl", "wb") as f:
                pickle.dump(model, f)
            with open("urdu_vectorizer.pkl", "wb") as f:
                pickle.dump(vectorizer, f)

            progress_bar.value = 100
            # Leave the full bar visible briefly before it is reset.
            time.sleep(1)
        finally:
            # Reset even when a step above raised, so the bar never stays
            # stuck at an intermediate value.
            progress_bar.value = 0

# Test Model
def test_model(b):
    """Classify the text in ``text_input`` with the saved model.

    Click handler for the "Analyze Sentiment" button. Loads the pickled model
    and vectorizer, validates the input (non-empty, no Latin letters),
    preprocesses it with ``urdu_preprocessor`` and prints the predicted label
    in ``result_display``.

    Args:
        b: The button instance passed by ``on_click`` (unused).

    The progress bar is reset to 0 on every exit path, including the early
    returns for a missing model or invalid input.
    """
    with output:
        clear_output()
        try:
            progress_bar.value = 30
            try:
                # SECURITY NOTE: pickle.load executes code embedded in the
                # file; only load artifacts written by train_model.
                with open("urdu_svm_model.pkl", "rb") as f:
                    model = pickle.load(f)
                with open("urdu_vectorizer.pkl", "rb") as f:
                    vectorizer = pickle.load(f)
            except Exception:
                # Missing or unreadable artifacts: ask the user to train first.
                print("Error loading model. Please train first.")
                return

            input_text = text_input.value.strip()
            if not input_text:
                print("Please enter some Urdu text.")
                return

            # Any ASCII letter is treated as non-Urdu input and rejected.
            if re.search('[a-zA-Z]', input_text):
                print("❌ Please enter Urdu text only.")
                return

            processed_input = urdu_preprocessor(input_text)
            input_vec = vectorizer.transform([processed_input])
            prediction = model.predict(input_vec)[0]

            with result_display:
                clear_output()
                # Any label other than 'P' is reported as negative.
                label = '✅ Positive' if prediction == 'P' else '❌ Negative'
                print(f"Prediction: {label}")
                print(f"Input: {input_text}")

            progress_bar.value = 100
            # Leave the full bar visible briefly before it is reset.
            time.sleep(1)
        finally:
            # Runs for the early returns above as well, so the bar is never
            # left stuck at 30 after a validation failure.
            progress_bar.value = 0

# Clear Function
def clear_all(b):
    """Reset the UI: empty the text box, clear both output areas, zero the bar.

    Args:
        b: The button instance passed by ``on_click`` (unused).
    """
    text_input.value = ""
    # Wipe the status area first, then the results area.
    for pane in (output, result_display):
        with pane:
            clear_output()
    progress_bar.value = 0

# Attach each button to its click handler.
for _button, _handler in (
    (train_button, train_model),
    (test_button, test_model),
    (clear_button, clear_all),
):
    _button.on_click(_handler)

# Display UI: input box, buttons, progress bar, then the two output areas,
# stacked top to bottom.
display(
    widgets.VBox(
        children=[text_input, button_box, progress_bar, output, result_display]
    )
)


VBox(children=(Textarea(value='', description='Urdu Text:', layout=Layout(height='100px', width='90%')), HBox(…