## Loading the Required Libraries and Models

In [1]:
import gradio as gr
import pandas as pd
import os
import io
from PIL import Image
from wordcloud import WordCloud
import joblib
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import json
from typing import List, Tuple, Dict, Set
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import class_weight
import joblib

# Colab-only setup: mount Google Drive so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Add the project folder on Drive to the import path so that text_helpers
# (imported on the next line) can be resolved from there.
import sys
sys.path.append('/content/drive/MyDrive/AAI-590/')
from text_helpers import build_text, safe_get

# Confirmation message; only reached if the import above succeeded.
print("build_text and safe_get functions imported from text_helpers.py")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
build_text and safe_get functions imported from text_helpers.py


In [2]:
# Locations of the serialized models on the mounted Drive.
attr_model_path = "/content/drive/MyDrive/AAI-590/models_attributes.pkl"
sentiment_model_path = "/content/drive/MyDrive/AAI-590/models_sentiment.pkl"

# Every handle starts out unset; process_uploaded_csv refuses to run while
# any of them is still None, so a failed load degrades gracefully.
loaded_attr_clf = loaded_mlb = loaded_sentiment_clf = None

# Attribute model: the file holds a dict with the fitted pipeline under
# "pipeline" and its label binarizer under "mlb".
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
try:
    loaded_attr_model_data = joblib.load(attr_model_path)
    loaded_attr_clf = loaded_attr_model_data["pipeline"]
    loaded_mlb = loaded_attr_model_data["mlb"]
    print("Attribute model and MultiLabelBinarizer loaded successfully.")
except Exception as attr_load_error:
    print(f"Error loading attribute model: {attr_load_error}")

# Sentiment model: the file holds the classifier object itself.
try:
    loaded_sentiment_clf = joblib.load(sentiment_model_path)
    print("Sentiment model loaded successfully.")
except Exception as sentiment_load_error:
    print(f"Error loading sentiment model: {sentiment_load_error}")

Attribute model and MultiLabelBinarizer loaded successfully.
Sentiment model loaded successfully.


In [3]:


def _error_result(column, message):
    """Build the 4-tuple returned when processing cannot proceed.

    The click handler is wired to four output components, so every return
    path must yield four values; the three non-table outputs are cleared.
    """
    return pd.DataFrame({column: [message]}), None, None, None


def _attribute_probabilities(texts):
    """Return per-label probabilities from the attribute pipeline."""
    try:
        return loaded_attr_clf.predict_proba(texts)
    except Exception:
        # Fallback: per-class sigmoid over decision scores. The pipeline's own
        # decision_function vectorizes the raw text itself, so it must be
        # given the raw text rather than an already-transformed matrix.
        from scipy.special import expit
        return expit(loaded_attr_clf.decision_function(texts))


def _sentiment_probabilities(texts, index):
    """Return a DataFrame of class probabilities, one ML_Sentiment_Prob_* column per class."""
    try:
        proba = loaded_sentiment_clf.predict_proba(texts)
        classes = loaded_sentiment_clf.named_steps["logreg"].classes_
    except Exception:
        from scipy.special import softmax
        scores = loaded_sentiment_clf.decision_function(texts)
        if scores.ndim == 1:
            # Binary case: build a two-column score matrix.
            # NOTE(review): assumes a positive score means the positive class - confirm.
            scores = np.vstack([scores, -scores]).T
            classes = np.array(["Positive", "Negative"])
        else:
            classes = loaded_sentiment_clf.named_steps["logreg"].classes_
        proba = softmax(scores, axis=1)
    return pd.DataFrame(
        proba,
        columns=[f"ML_Sentiment_Prob_{c}" for c in classes],
        index=index,
    )


def _class_probability(prob_df, suffix):
    """Return the probability column whose name ends with `suffix` (case-insensitive).

    Falls back to a zero Series so the caller can always do Series arithmetic.
    """
    matches = [c for c in prob_df.columns if str(c).lower().endswith(suffix)]
    if matches:
        return prob_df[matches[0]]
    return pd.Series(0.0, index=prob_df.index)


def _sentiment_bar_chart(labels):
    """Render the label distribution as a bar chart and return it as a PIL image."""
    counts = labels.value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        sns.barplot(x=counts.index, y=counts.values, palette='viridis', ax=ax)
        ax.set_title('Distribution of ML Sentiment Labels')
        ax.set_xlabel('Sentiment Label')
        ax.set_ylabel('Count')
        fig.tight_layout()
        png_buffer = io.BytesIO()
        fig.savefig(png_buffer, format='png')
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    png_buffer.seek(0)
    return Image.open(png_buffer)


def _attribute_word_cloud(attribute_strings):
    """Return a word-cloud PIL image of the extracted attributes, or None if there are none."""
    tokens = [
        attr
        for joined in attribute_strings
        for attr in joined.split('; ')
        if attr.strip()
    ]
    if not tokens:
        return None
    try:
        cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(tokens))
    except ValueError:
        # WordCloud raises ValueError when no plottable word remains
        # (for example, when every token is filtered out as a stopword).
        return None
    return cloud.to_image()


def process_uploaded_csv(file):
    """Run attribute extraction and sentiment analysis on an uploaded CSV.

    Args:
        file: The value Gradio passes for the upload - either a filesystem
            path or an object exposing the path as ``.name`` - or None when
            nothing has been uploaded.

    Returns:
        A 4-tuple ``(table, sentiment_plot, wordcloud, csv_path)``:
        the processed reviews as a DataFrame, a PIL bar chart of the
        sentiment labels, a PIL word cloud of the attributes (None when no
        attribute was extracted) and the path of a CSV copy of the table.
        On any precondition failure the table carries a one-row 'Error' or
        'Message' column and the other three values are None.
    """
    if loaded_attr_clf is None or loaded_mlb is None or loaded_sentiment_clf is None:
        return _error_result('Error', 'ML models not loaded. Please ensure the model files exist and are accessible.')

    if file is None:
        return _error_result('Message', 'Please upload a CSV file.')

    try:
        # Gradio hands over either a plain path or a tempfile-like wrapper.
        csv_path = file if isinstance(file, (str, os.PathLike)) else file.name
        df_uploaded = pd.read_csv(csv_path)
    except Exception as e:
        return _error_result('Error', f'Failed to read CSV: {e}')

    if df_uploaded.empty:
        return _error_result('Message', 'The uploaded CSV contains no rows.')

    # Make sure both text columns exist so the row-wise builder can read them.
    if 'Title' not in df_uploaded.columns:
        df_uploaded['Title'] = ''
    if 'ReviewText' not in df_uploaded.columns:
        df_uploaded['ReviewText'] = ''

    # Combine title and review body into the single text field the models consume.
    df_uploaded["__text__"] = df_uploaded.apply(lambda row: build_text(
        safe_get(row, "Title"),
        safe_get(row, "ReviewText")
    ), axis=1)
    texts = df_uploaded["__text__"].values

    # --- ML Attribute Extraction ---
    attr_proba = _attribute_probabilities(texts)
    attr_pred_bin = (attr_proba >= 0.5).astype(int)
    ml_attr_tags = loaded_mlb.inverse_transform(attr_pred_bin)
    df_uploaded['Extracted_Attributes'] = ["; ".join(tags) for tags in ml_attr_tags]

    # --- ML Sentiment Analysis ---
    prob_df_senti = _sentiment_probabilities(texts, df_uploaded.index)
    df_uploaded["ML_Sentiment_Label"] = loaded_sentiment_clf.predict(texts)

    # Signed score: P(positive) - P(negative).
    pos_prob = _class_probability(prob_df_senti, "positive")
    neg_prob = _class_probability(prob_df_senti, "negative")
    df_uploaded["ML_Sentiment_Score"] = (pos_prob - neg_prob).fillna(0.0)

    # Select relevant columns for display
    output_df = df_uploaded[['Title', 'ReviewText', 'Extracted_Attributes', 'ML_Sentiment_Label', 'ML_Sentiment_Score']]

    sentiment_plot_pil = _sentiment_bar_chart(df_uploaded['ML_Sentiment_Label'])
    wordcloud_pil = _attribute_word_cloud(df_uploaded['Extracted_Attributes'].tolist())

    # Write the download copy into a fresh temp directory so concurrent
    # requests do not overwrite each other's file.
    import tempfile
    temp_csv_path = os.path.join(tempfile.mkdtemp(prefix="processed_reviews_"), "processed_reviews.csv")
    output_df.to_csv(temp_csv_path, index=False)

    return output_df, sentiment_plot_pil, wordcloud_pil, temp_csv_path

# Define custom CSS and head content
# custom_css is handed to gr.Blocks(css=...) below. It sets the page font and
# background, the container card, label/title/description text, button colours,
# and a rule intended to let DataFrame cell text wrap.
custom_css = """
body { font-family: 'Arial', sans-serif; background-color: #f0f2f5; }
.gradio-container { max-width: 1200px; margin: auto; padding: 20px; box-shadow: 0 4px 8px rgba(0,0,0,0.1); border-radius: 8px; background-color: #bbc8f2; }
.gradio-input label, .gradio-output label { font-weight: bold; color: #333; }
.gradio-title { color: #7faaeb; text-align: center; margin-bottom: 20px; }
.gradio-description { text-align: center; color: #555; margin-bottom: 30px; }
.gr-button { background-color: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 5px; cursor: pointer; }
.gr-button:hover { background-color: #0056b3; }
/* CSS for text wrapping in DataFrame cells */
.gradio-output--grid table td { white-space: normal !important; }
"""

# Extra markup handed to gr.Blocks(head=...) below.
# NOTE(review): gr.Blocks is also given a separate title= argument - confirm which one should win.
custom_head = "<title>AAI590-Capstone Project</title>"

# Assemble the Gradio Blocks UI
with gr.Blocks(
    css=custom_css,
    head=custom_head,
    title="AAI590:ML-based Attribute and Sentiment Extraction and Visualization",
) as demo:

    # Header row: small logo column next to a wide heading column.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image("USD_Logo.png", width=50, elem_id="logo", show_download_button=False)
        with gr.Column(scale=20):
            gr.Markdown("# AAI590: ML-based Attribute and Sentiment Extraction and Visualization")

    gr.Markdown("### By Aditya, Deepak, Rajesh")
    gr.Markdown("Upload a CSV file containing 'Title' and 'ReviewText' columns to extract product attributes and sentiment, and visualize their distribution and common attributes.")

    # Input row: file picker beside the trigger button.
    with gr.Row():
        file_input = gr.File(label="Upload CSV File")
        process_button = gr.Button("Process CSV")

    # Plot row: the two generated images side by side.
    with gr.Row():
        sentiment_plot_component = gr.Image(label="ML Sentiment Distribution", type="pil")
        wordcloud_plot_component = gr.Image(label="Extracted Attributes Word Cloud", type="pil")

    # Results table in a row of its own.
    with gr.Row():
        with gr.Column():
            output_df_component = gr.DataFrame(label="Processed Reviews with Extracted Attributes and Sentiment")

    # Download link for the processed CSV, in a separate row below the table.
    with gr.Row():
        download_csv_component = gr.File(label="Download Processed Data (.csv)")

    # One click fills all four outputs, in the order process_uploaded_csv returns them.
    process_button.click(
        fn=process_uploaded_csv,
        inputs=[file_input],
        outputs=[
            output_df_component,
            sentiment_plot_component,
            wordcloud_plot_component,
            download_csv_component,
        ],
    )

# Launch the interface
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bc7a9b3bdde4f92c4b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


