## Loading the required libraries and models

In [4]:
import gradio as gr
import pandas as pd
import os
import io
from PIL import Image
from wordcloud import WordCloud
import joblib
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers, processors
import json
from typing import List, Tuple, Dict, Set
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import class_weight
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F # Added this import

# Mount Google Drive so the files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Put the project folder on the import path so the local text_helpers module can be imported.
import sys
sys.path.append('/content/drive/MyDrive/AAI-590/')
from text_helpers import build_text, safe_get

print("build_text and safe_get functions imported from text_helpers.py")

Mounted at /content/drive
build_text and safe_get functions imported from text_helpers.py


In [5]:
# Load the serialized tokenizer from the project folder on the mounted Drive.
# NOTE(review): inference() passes `encoded.ids` to the model unchanged, and the model's
# fc1 layer is sized for 256 tokens — presumably this tokenizer file has padding/truncation
# to that length enabled; confirm.
tokenizer = Tokenizer.from_file("/content/drive/MyDrive/AAI-590/tokenizer.json")

In [6]:
# Locations of the serialized models on the mounted Drive.
attr_model_path = "/content/drive/MyDrive/AAI-590/models_attributes.pkl"
sentiment_tfidf_model_path = "/content/drive/MyDrive/AAI-590/models_sentiment.pkl"
sentiment_model_path = "/content/drive/MyDrive/AAI-590/sentiment_analysis_model.pth"

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

vocab_size = 12000

# Hyper-parameters handed to SentimentAnalysisModel.
cfg = dict(
    vocab_size=vocab_size,
    emb_dim=128,
    hidden_dim=512,
    num_layers=5,
    bidirectional=True,
    dropout=0.15,
    seq_len=256,
)

def clean_text(text):
    """Normalize raw review text before tokenization.

    The text is lower-cased, then URLs, hashtags and HTML-like tags are removed,
    every character other than ``a``-``z`` and whitespace is dropped, and leading
    and trailing whitespace is trimmed.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    without_hashtags = re.sub(r'#\w+', '', without_urls)
    without_tags = re.sub(r'<.*?>', '', without_hashtags)
    letters_only = re.sub(r'[^a-z\s]', '', without_tags)
    return letters_only.strip()

class SentimentAnalysisModel(nn.Module):
    """Embedding -> LSTM -> Conv1d -> MaxPool1d -> two linear layers, emitting 3 logits.

    Args:
        cfg: Mapping with the keys ``vocab_size``, ``emb_dim``, ``hidden_dim``,
            ``num_layers``, ``bidirectional``, ``dropout`` and ``seq_len``.

    The first linear layer is sized from ``cfg['seq_len']``, so every batch passed
    to ``forward`` must contain sequences of exactly that length. For
    ``seq_len=256`` with a bidirectional LSTM the layer shapes are the same as the
    previously hard-coded ones (``fc1`` takes ``128 * 127`` features).
    """

    def __init__(self, cfg):
        super().__init__()
        conv_channels = 128
        conv_kernel = 3
        pool_kernel = 2
        # A bidirectional LSTM concatenates both directions on the feature axis.
        lstm_directions = 2 if cfg['bidirectional'] else 1

        self.embedding = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.lstm = nn.LSTM(input_size=cfg['emb_dim'],
                            hidden_size=cfg['hidden_dim'],
                            num_layers=cfg['num_layers'],
                            batch_first=True,
                            bidirectional=cfg['bidirectional'],
                            dropout=cfg['dropout'])
        self.conv1 = nn.Conv1d(in_channels=cfg['hidden_dim'] * lstm_directions,
                               out_channels=conv_channels,
                               kernel_size=conv_kernel)
        self.pool = nn.MaxPool1d(kernel_size=pool_kernel)

        # Length left after an unpadded convolution followed by non-overlapping pooling:
        # (seq_len - kernel_size + 1) // pool_kernel, e.g. (256 - 3 + 1) // 2 = 127.
        pooled_len = (cfg['seq_len'] - conv_kernel + 1) // pool_kernel
        self.fc1 = nn.Linear(conv_channels * pooled_len, 64)

        # Final layer for 3 sentiment classes (negative, neutral, positive).
        # No activation here: the output is raw logits.
        self.fc2 = nn.Linear(64, 3)

        self.dropout = nn.Dropout(cfg['dropout'])

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` holding token ids.

        Returns:
            Tensor of shape ``(batch, 3)`` with raw, un-normalized logits.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Conv1d expects (batch, channels, length); the LSTM yields (batch, length, channels).
        lstm_out = lstm_out.permute(0, 2, 1)
        pooled_out = self.pool(self.conv1(lstm_out))

        flattened = pooled_out.flatten(start_dim=1)

        hidden = self.dropout(torch.relu(self.fc1(flattened)))
        return self.fc2(hidden)

# Module-level handles to the loaded models. A handle stays None when its model
# could not be loaded, which is what process_uploaded_csv checks before running.
loaded_attr_clf = None
loaded_mlb = None
loaded_sentiment_clf = None
loaded_sentiment_tdfif_clf = None

# NOTE: joblib.load unpickles arbitrary Python objects, so these files must come
# from a trusted source.
try:
    loaded_attr_model_data = joblib.load(attr_model_path)
    loaded_attr_clf = loaded_attr_model_data["pipeline"]
    loaded_mlb = loaded_attr_model_data["mlb"]
    print("Attribute model and MultiLabelBinarizer loaded successfully.")
except Exception as e:
    # Never leave a half-loaded pair behind (e.g. pipeline present but "mlb" missing).
    loaded_attr_clf = None
    loaded_mlb = None
    print(f"Error loading attribute model: {e}")

try:
    loaded_sentiment_tdfif_clf = joblib.load(sentiment_tfidf_model_path)
    print("Sentiment TF-IDF model loaded successfully.")
except Exception as e:
    loaded_sentiment_tdfif_clf = None
    print(f"Error loading sentiment TF-IDF model: {e}")

try:
    loaded_sentiment_clf = SentimentAnalysisModel(cfg)
    loaded_sentiment_clf.load_state_dict(torch.load(sentiment_model_path, map_location=device))
    # load_state_dict copies weights into the existing (CPU) parameters, so the module
    # itself must be moved to the device that inference() places its inputs on.
    loaded_sentiment_clf.to(device)
    loaded_sentiment_clf.eval()
    print("Sentiment model loaded successfully.")
except Exception as e:
    # Reset the handle: otherwise a randomly initialised network would remain bound
    # to the name and be used for predictions after a failed weight load.
    loaded_sentiment_clf = None
    print(f"Error loading sentiment model: {e}")

Attribute model and MultiLabelBinarizer loaded successfully.
Sentiment TF-IDF model loaded successfully.
Sentiment model loaded successfully.


In [7]:
def inference(model, text, tokenizer, device, cfg):
    """Predict the sentiment of a single text with the neural sentiment model.

    Args:
        model: The torch module to run; it is moved to ``device`` and put in eval mode.
        text: Raw input string; it is normalized with ``clean_text`` before encoding.
        tokenizer: Object whose ``encode(str)`` result exposes the token ids as ``.ids``.
        device: torch device on which the model and the input tensor are placed.
        cfg: Model configuration. Not read by this function; kept so existing
            callers that pass it keep working.

    Returns:
        Tuple ``(sentiment, probability)``: the predicted label (``'negative'``,
        ``'neutral'`` or ``'positive'``) and the softmax probability of that label
        as a float.

    NOTE(review): the token ids are passed to the model exactly as the tokenizer
    returns them, so the tokenizer presumably pads/truncates to the sequence length
    the model was built for — confirm.
    """
    # Model and input must live on the same device; Module.to() is a no-op when the
    # model is already there.
    model.to(device)
    model.eval()

    cleaned_text = clean_text(text)
    encoded = tokenizer.encode(cleaned_text)
    # Embedding lookups need integer indices, so pin the dtype explicitly.
    input_ids = torch.tensor([encoded.ids], dtype=torch.long, device=device)

    with torch.no_grad():
        logits = model(input_ids)
        # Turn the raw logits into class probabilities.
        probabilities = F.softmax(logits, dim=1)

    # Highest-probability class and its probability.
    predicted_probability, predicted_index = torch.max(probabilities, dim=1)

    # Map the numerical class index back to its sentiment label.
    label_map_reverse = {0: 'negative', 1: 'neutral', 2: 'positive'}
    sentiment = label_map_reverse[predicted_index.item()]

    return sentiment, predicted_probability.item()

In [8]:
# Quick manual check: run the neural sentiment model on one example review and
# print the predicted label with its softmax probability.
text_to_analyze = "design looked good but the cloth is bad. Don't buy it"
sentiment, probability = inference(loaded_sentiment_clf, text_to_analyze, tokenizer, device, cfg)
print(f"Sentiment: {sentiment}")
print(f"Probability: {probability:.4f}")

Sentiment: negative
Probability: 0.9999


In [9]:

def _csv_failure(column, message):
    """Build the five-slot result for a run that produced no data (one slot per Gradio output)."""
    return pd.DataFrame({column: [message]}), None, None, None, None


def process_uploaded_csv(file):
    """Run attribute extraction and sentiment analysis over an uploaded review CSV.

    Args:
        file: The value delivered by the Gradio file input: ``None`` when nothing
            was uploaded, otherwise a path string or an object whose ``name``
            attribute is the path of the uploaded file.

    Returns:
        A 5-tuple matching the outputs wired to the "Process CSV" button:
        ``(output_df, sentiment_plot, wordcloud_image, csv_path, top_10_attributes)``.
        When the models are not loaded, no file is given, the CSV cannot be read,
        or no usable rows remain, the first element is a one-cell DataFrame holding
        the message and the remaining four elements are ``None``.
    """
    # Function-scope imports: these names are not imported at the top of the notebook.
    import tempfile
    from scipy.special import expit, softmax

    required_models = (loaded_attr_clf, loaded_mlb, loaded_sentiment_clf, loaded_sentiment_tdfif_clf)
    if any(m is None for m in required_models):
        return _csv_failure('Error', 'ML models not loaded. Please ensure the model files exist and are accessible.')

    if file is None:
        return _csv_failure('Message', 'Please upload a CSV file.')

    try:
        # Accept both a plain path string and a file-like wrapper exposing `.name`.
        df_uploaded = pd.read_csv(getattr(file, "name", file))
    except Exception as e:
        return _csv_failure('Error', f'Failed to read CSV: {e}')

    # Make sure both text columns exist *before* dropping empty rows, so a CSV that
    # lacks one of them is still processed instead of failing inside dropna.
    text_columns = ['Title', 'ReviewText']
    absent_columns = [c for c in text_columns if c not in df_uploaded.columns]
    for col in absent_columns:
        df_uploaded[col] = np.nan
    # Drop rows where Title and ReviewText are both null.
    df_uploaded = df_uploaded.dropna(subset=text_columns, how='all').reset_index(drop=True)
    for col in absent_columns:
        df_uploaded[col] = ''

    if df_uploaded.empty:
        return _csv_failure('Message', 'The uploaded CSV has no rows with a Title or ReviewText.')

    # Combine title and review body into the single text field fed to every model.
    df_uploaded["__text__"] = df_uploaded.apply(
        lambda row: build_text(safe_get(row, "Title"), safe_get(row, "ReviewText")),
        axis=1,
    )
    texts = df_uploaded["__text__"].values

    # --- ML Attribute Extraction ---
    try:
        attr_proba = loaded_attr_clf.predict_proba(texts)
    except Exception:
        # Fallback: per-class sigmoid over the pipeline's decision scores. The pipeline
        # applies its own vectorizer, so raw text is passed here as well.
        attr_proba = expit(loaded_attr_clf.decision_function(texts))

    attr_pred_bin = (attr_proba >= 0.5).astype(int)
    ml_attr_tags = loaded_mlb.inverse_transform(attr_pred_bin)
    df_uploaded['Extracted_Attributes'] = ["; ".join(tags) for tags in ml_attr_tags]

    # --- ML Sentiment Analysis (neural model, one text at a time) ---
    predictions = [
        inference(loaded_sentiment_clf, text_item, tokenizer, device, cfg)
        for text_item in texts
    ]
    df_uploaded["ML_Sentiment_Label"] = [label for label, _ in predictions]
    df_uploaded["ML_Sentiment_Score"] = [score for _, score in predictions]

    # --- TFIDF Sentiment Analysis ---
    try:
        proba_all_senti = loaded_sentiment_tdfif_clf.predict_proba(texts)
        senti_classes = loaded_sentiment_tdfif_clf.named_steps["logreg"].classes_
    except Exception:
        scores_senti = np.asarray(loaded_sentiment_tdfif_clf.decision_function(texts))
        senti_classes = loaded_sentiment_tdfif_clf.named_steps["logreg"].classes_
        if scores_senti.ndim == 1:
            # Binary case: a positive score favours classes_[1], so order the two
            # columns to line up with classes_.
            scores_senti = np.vstack([-scores_senti, scores_senti]).T
        proba_all_senti = softmax(scores_senti, axis=1)
    prob_df_senti = pd.DataFrame(
        proba_all_senti,
        columns=[f"ML_Sentiment_Prob_{c}" for c in senti_classes],
    )

    df_uploaded["TFIDF_Sentiment_Label"] = loaded_sentiment_tdfif_clf.predict(texts)

    def _class_probability(label_suffix):
        """Return the probability column for a class, or zeros when no such class exists."""
        matches = [c for c in prob_df_senti.columns if c.lower().endswith(label_suffix)]
        if matches:
            return prob_df_senti[matches[0]]
        return pd.Series(0.0, index=prob_df_senti.index)

    # Signed score: P(positive) - P(negative).
    df_uploaded["TFIDF_Sentiment_Score"] = (
        _class_probability("positive") - _class_probability("negative")
    ).fillna(0.0)

    # Select relevant columns for display
    output_df = df_uploaded[['Title', 'ReviewText', 'Extracted_Attributes', 'ML_Sentiment_Label', 'ML_Sentiment_Score', 'TFIDF_Sentiment_Label']]

    # --- Generate Bar Chart for Sentiment Distribution ---
    sentiment_counts = df_uploaded['ML_Sentiment_Label'].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='viridis', ax=ax)
    ax.set_title('Distribution of ML Sentiment Labels')
    ax.set_xlabel('Sentiment Label')
    ax.set_ylabel('Count')
    fig.tight_layout()
    sentiment_plot_bytes = io.BytesIO()
    fig.savefig(sentiment_plot_bytes, format='png')
    plt.close(fig)  # Close this figure to free memory
    sentiment_plot_bytes.seek(0)
    sentiment_plot_pil = Image.open(sentiment_plot_bytes)

    # --- Collect the individual attribute tags (empty entries removed) ---
    all_attributes_list = [
        attr.strip()
        for joined in df_uploaded['Extracted_Attributes'].tolist()
        for attr in joined.split('; ')
        if attr.strip()
    ]

    # --- Generate Word Cloud for Extracted Attributes ---
    # WordCloud raises ValueError when it has no words to draw; show no image instead.
    wordcloud_pil = None
    if all_attributes_list:
        try:
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(all_attributes_list))
            wordcloud_pil = wordcloud.to_image()
        except ValueError:
            wordcloud_pil = None

    # --- Save output_df to a per-request temporary CSV for download ---
    # A fresh directory per call keeps concurrent users from overwriting each other's file
    # while preserving the download's file name.
    temp_csv_path = os.path.join(tempfile.mkdtemp(prefix="processed_reviews_"), "processed_reviews.csv")
    output_df.to_csv(temp_csv_path, index=False)

    # --- Top 10 most frequent attributes ---
    attribute_counts = pd.Series(all_attributes_list, dtype="object").value_counts().head(10)
    top_10_attributes = attribute_counts.index.tolist()

    return output_df, sentiment_plot_pil, wordcloud_pil, temp_csv_path, top_10_attributes

In [15]:
# Custom CSS and extra <head> markup passed to gr.Blocks below.
# NOTE(review): the three #process_csv_button rules all set the same colour (#28a745),
# so the "no-file" and "enabled" states currently look identical even though the
# inline CSS comment describes the no-file state as grey — confirm the intended colour.
custom_css = """
body { font-family: 'Arial', sans-serif; background-color: #f0f2f5; }
.gradio-container { max-width: 1200px; margin: auto; padding: 20px; box-shadow: 0 4px 8px rgba(0,0,0,0.1); border-radius: 8px; background-color: #bbc8f2; }
.gradio-input label, .gradio-output label { font-weight: bold; color: #333; }
.gradio-title { color: #7faaeb; text-align: center; margin-bottom: 20px; }
.gradio-description { text-align: center; color: #555; margin-bottom: 30px; }
.gr-button { color: white; border: none; padding: 10px 20px; border-radius: 5px; cursor: pointer; }
.gr-button:hover { background-color: #28a745; }
/* CSS for text wrapping in DataFrame cells */
.gradio-output--grid table td { white-space: normal !important; }

/* Custom styles for process button states using an ID for higher specificity */
#process_csv_button.no-file-button {
    background-color: #28a745 !important; /* Grey when no file is uploaded */
}
#process_csv_button.enabled-button {
    background-color: #28a745 !important; /* Green when active */
}
#process_csv_button.enabled-button:hover {
    background-color: #28a745 !important;
}
"""

custom_head = "<title>AAI590-Capstone Project</title>"

# Build the Gradio Blocks interface: header, upload + process button, two plots,
# the results table, the CSV download and the top-attributes box.
with gr.Blocks(css=custom_css, head=custom_head, title="AAI590:ML-based Attribute and Sentiment Extraction and Visualization") as demo:

    with gr.Row():
        with gr.Column(scale=20):
            gr.Markdown("# AAI590: ML-based Attribute and Sentiment Extraction and Visualization")
        with gr.Column(scale=1):
            gr.Image("/content/drive/MyDrive/AAI-590/USD_Logo.png", width=50, elem_id="logo", show_download_button=False)

    gr.Markdown("### By Aditya, Deepak, Rajesh")
    gr.Markdown("Upload a CSV file containing 'Title' and 'ReviewText' columns to extract product attributes and sentiment, and visualize their distribution and common attributes.")

    with gr.Row():
        file_input = gr.File(label="Upload CSV File")
        # The button is always clickable; only its CSS class reflects whether a file is present.
        process_button = gr.Button(
            "Process CSV",
            elem_id="process_csv_button",  # unique ID targeted by the custom CSS
            elem_classes=["no-file-button"],
            interactive=True
        )

    with gr.Row():  # Plots side by side
        sentiment_plot_component = gr.Image(label="ML Sentiment Distribution", type="pil")
        wordcloud_plot_component = gr.Image(label="Extracted Attributes Word Cloud", type="pil")

    with gr.Row():  # Results table on its own row
        with gr.Column():
            output_df_component = gr.DataFrame(label="Processed Reviews with Extracted Attributes and Sentiment")
    with gr.Row():  # Download link on its own row
        download_csv_component = gr.File(label="Download Processed Data (.csv)")
    with gr.Row():
        top_10_attributes_component = gr.Textbox(label="Top 10 Most Used Attributes")

    def update_button_state(file):
        """Switch the process button's CSS class depending on whether a file is uploaded.

        Uses ``gr.update`` instead of the per-component ``gr.Button.update`` class
        method, which newer Gradio releases no longer provide.
        """
        css_class = "enabled-button" if file is not None else "no-file-button"
        return gr.update(interactive=True, elem_classes=[css_class])

    # Re-style the button whenever the uploaded file changes.
    file_input.change(
        fn=update_button_state,
        inputs=[file_input],
        outputs=[process_button],
        queue=False  # Skip the queue so the button reacts immediately
    )

    # One output component per element of process_uploaded_csv's return tuple.
    process_button.click(
        fn=process_uploaded_csv,
        inputs=[file_input],
        outputs=[output_df_component, sentiment_plot_component, wordcloud_plot_component, download_csv_component, top_10_attributes_component]
    )

# Launch the interface
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0d4152435936b8718f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


