In [None]:
# Excel Sentiment Analyzer for Google Colab
# Upload this notebook to Google Colab and run each cell

# Cell 1: Install required packages
!pip install transformers torch pandas openpyxl langdetect arabic-reshaper python-bidi

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12

In [None]:
# Cell 2: Import libraries and setup
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import re
import warnings
from langdetect import detect
from google.colab import files
import io
import zipfile
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from IPython.display import FileLink
import os

warnings.filterwarnings("ignore")

class ColabSentimentAnalyzer:
    """Sentiment analyzer for Arabic and English comments stored in an Excel sheet.

    Intended for Google Colab: the Excel file is uploaded through
    ``google.colab.files`` and the results are downloaded the same way. Each
    comment is routed to an Arabic or an English Hugging Face pipeline
    according to ``detect_language``.

    Typical flow: ``load_models`` -> ``upload_excel`` -> ``process_excel`` ->
    ``save_results``. ``analyze_single_comment`` scores one ad-hoc string.
    """

    # Canonical sentiment names keyed by the UPPER-CASED model label. The
    # lookup is case-insensitive because checkpoints disagree on casing (the
    # default English model emits 'positive', others emit 'POSITIVE').
    # NOTE(review): the LABEL_n entries assume a 3-class
    # negative/neutral/positive head - verify against the id2label config of
    # any model that actually emits them.
    _LABEL_MAPPING = {
        'POSITIVE': 'Positive',
        'NEGATIVE': 'Negative',
        'NEUTRAL': 'Neutral',
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive'
    }

    # Longer inputs are truncated instead of failing inside the model.
    # NOTE(review): 512 is assumed to fit every checkpoint listed in
    # ``self.models`` - confirm before adding other models.
    _MAX_TOKENS = 512

    def __init__(self):
        self.arabic_pipeline = None
        self.english_pipeline = None
        self.df = None
        # Message of the most recent inference failure (None when there was none).
        self.last_error = None

        # Model configurations: display name -> Hugging Face checkpoint id.
        # NOTE(review): "aubmindlab/bert-base-arabertv2" is loaded with a newly
        # initialised classifier head (transformers warns that it should be
        # trained first), so its sentiment predictions are not meaningful.
        # Presumably the same applies to the plain CAMeLBERT checkpoint -
        # verify, and prefer a checkpoint fine-tuned for sentiment.
        self.models = {
            "Arabic": {
                "AraBERT v2": "aubmindlab/bert-base-arabertv2",
                "XLM-RoBERTa": "cardiffnlp/twitter-xlm-roberta-base-sentiment",
                "CAMeLBERT": "CAMeL-Lab/bert-base-arabic-camelbert-mix"
            },
            "English": {
                "RoBERTa": "cardiffnlp/twitter-roberta-base-sentiment-latest",
                "DistilBERT": "distilbert-base-uncased-finetuned-sst-2-english",
                "XLM-RoBERTa": "cardiffnlp/twitter-xlm-roberta-base-sentiment"
            }
        }

    def _build_pipeline(self, model_name):
        """Create a sentiment-analysis pipeline for the checkpoint ``model_name``."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        return pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=tokenizer,
            # First GPU when CUDA is available, otherwise CPU (-1).
            device=0 if torch.cuda.is_available() else -1
        )

    def load_models(self, arabic_model="AraBERT v2", english_model="RoBERTa"):
        """Load both Arabic and English models.

        Args:
            arabic_model: Key of ``self.models["Arabic"]`` to load.
            english_model: Key of ``self.models["English"]`` to load.

        Raises:
            Exception: Whatever the lookup or the model download raised
                (e.g. ``KeyError`` for an unknown model name); the error is
                printed and then re-raised unchanged.
        """
        print("🔄 Loading models...")

        try:
            # Load Arabic model
            arabic_model_name = self.models["Arabic"][arabic_model]
            print(f"Loading Arabic model: {arabic_model}")
            self.arabic_pipeline = self._build_pipeline(arabic_model_name)
            print("✅ Arabic model loaded successfully")

            # Load English model
            english_model_name = self.models["English"][english_model]
            print(f"Loading English model: {english_model}")
            self.english_pipeline = self._build_pipeline(english_model_name)
            print("✅ English model loaded successfully")
            print("🎉 All models ready!")

        except Exception as e:
            print(f"❌ Error loading models: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def detect_language(self, text):
        """Detect if text is Arabic or English.

        Returns 'unknown' for empty/blank input, otherwise 'arabic' or
        'english' depending on which script has more characters. On a tie
        ``langdetect`` decides; any failure falls back to 'english'.
        """
        try:
            if not text or not str(text).strip():
                return 'unknown'

            text = str(text)

            # Count Arabic and English characters
            arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
            english_chars = len(re.findall(r'[a-zA-Z]', text))

            if arabic_chars > english_chars:
                return 'arabic'
            if english_chars > arabic_chars:
                return 'english'

            # Tie (e.g. digits/emoji only): use langdetect as fallback.
            try:
                detected = detect(text)
                return 'arabic' if detected == 'ar' else 'english'
            except Exception:
                # Was a bare `except:`, which also swallowed KeyboardInterrupt.
                return 'english'

        except Exception:
            return 'english'

    def clean_text(self, text):
        """Clean and preprocess text.

        Removes URLs, @mentions and #hashtags, then collapses whitespace.
        Returns "" for missing (NaN/None) or falsy input.
        """
        if pd.isna(text) or not text:
            return ""

        text = str(text)

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def analyze_sentiment(self, text, language):
        """Analyze sentiment based on detected language.

        Args:
            text: Comment text to classify.
            language: 'arabic' or 'english'; anything else is not analyzed.

        Returns:
            ``(sentiment, confidence)``. ``("Unknown", 0.0)`` when no pipeline
            is available for ``language``; ``("Error", 0.0)`` when inference
            fails, in which case the message is kept in ``self.last_error``.
        """
        try:
            if language == 'arabic' and self.arabic_pipeline:
                active_pipeline = self.arabic_pipeline
            elif language == 'english' and self.english_pipeline:
                active_pipeline = self.english_pipeline
            else:
                return "Unknown", 0.0

            # Truncate over-long comments instead of letting the model raise.
            result = active_pipeline(
                text, truncation=True, max_length=self._MAX_TOKENS
            )[0]

            # Standardize labels (case-insensitively; unknown labels pass through).
            raw_label = result['label']
            sentiment = self._LABEL_MAPPING.get(str(raw_label).strip().upper(), raw_label)
            confidence = result['score']

            return sentiment, confidence

        except Exception as e:
            # Keep the reason so callers can report it instead of losing it.
            self.last_error = str(e)
            return "Error", 0.0

    def upload_excel(self):
        """Upload Excel file in Colab.

        Returns the uploaded file name after loading it into ``self.df``, or
        None when nothing was uploaded or the file could not be read.
        """
        print("📁 Please upload your Excel file:")
        uploaded = files.upload()

        if uploaded:
            filename = list(uploaded.keys())[0]
            print(f"✅ File uploaded: {filename}")

            try:
                self.df = pd.read_excel(filename)
                print(f"📊 Excel loaded: {len(self.df)} rows, {len(self.df.columns)} columns")
                print(f"📋 Columns: {list(self.df.columns)}")
                return filename
            except Exception as e:
                print(f"❌ Error reading Excel file: {str(e)}")
                return None

        return None

    def show_preview(self, num_rows=5):
        """Show preview of the data (first ``num_rows`` rows of ``self.df``)."""
        if self.df is not None:
            print("📋 Data Preview:")
            display(self.df.head(num_rows))
        else:
            print("❌ No data loaded. Please upload an Excel file first.")

    def process_excel(self, comments_column, add_language=True, add_confidence=True,
                     clean_text=True, skip_empty=True):
        """Process the Excel file and add sentiment labels.

        Args:
            comments_column: Name of the column in ``self.df`` holding the comments.
            add_language: Add a 'Detected_Language' column.
            add_confidence: Add a 'Confidence' column.
            clean_text: Run ``self.clean_text`` on each comment first.
            skip_empty: Leave rows with missing/blank comments unlabelled.

        Returns:
            A copy of ``self.df`` with the result columns added, or None when
            the data, the column or the models are missing. Rows that were not
            analyzed keep an empty 'Sentiment'.
        """

        if self.df is None:
            print("❌ No Excel file loaded. Please upload a file first.")
            return None

        if comments_column not in self.df.columns:
            print(f"❌ Column '{comments_column}' not found. Available columns: {list(self.df.columns)}")
            return None

        if not self.arabic_pipeline and not self.english_pipeline:
            print("❌ No models loaded. Please load models first.")
            return None

        print("🚀 Starting sentiment analysis...")

        # Work on a copy so the uploaded data stays untouched.
        df_result = self.df.copy()

        # Initialize new columns
        df_result['Sentiment'] = ''
        if add_confidence:
            df_result['Confidence'] = 0.0
        if add_language:
            df_result['Detected_Language'] = ''

        total_rows = len(df_result)
        processed = 0
        self.last_error = None

        # Process each row
        for index, comment in df_result[comments_column].items():
            # Skip empty rows if requested
            if skip_empty and (pd.isna(comment) or not str(comment).strip()):
                processed += 1
                if processed % 100 == 0:
                    print(f"⏳ Processed {processed}/{total_rows} rows...")
                continue

            # Clean text if requested
            if clean_text:
                cleaned_comment = self.clean_text(comment)
            else:
                cleaned_comment = str(comment) if not pd.isna(comment) else ""

            # Nothing left after cleaning (e.g. the comment was only a URL).
            if not cleaned_comment:
                processed += 1
                continue

            # Detect language
            language = self.detect_language(cleaned_comment)

            # Analyze sentiment
            sentiment, confidence = self.analyze_sentiment(cleaned_comment, language)

            # Update dataframe
            df_result.at[index, 'Sentiment'] = sentiment
            if add_confidence:
                df_result.at[index, 'Confidence'] = confidence
            if add_language:
                df_result.at[index, 'Detected_Language'] = language.title()

            processed += 1

            # Progress update
            if processed % 100 == 0:
                print(f"⏳ Processed {processed}/{total_rows} rows...")

        print(f"✅ Processing complete! Processed {processed}/{total_rows} rows.")

        # Surface inference failures that analyze_sentiment turned into 'Error'.
        error_rows = int((df_result['Sentiment'] == 'Error').sum())
        if error_rows:
            print(f"⚠️ {error_rows} row(s) could not be analyzed. Last error: {self.last_error}")

        # Show summary
        self.show_summary(df_result)

        return df_result

    def show_summary(self, df_result):
        """Show processing summary (sentiment and language distribution)."""
        # 'Sentiment' starts out as '' for every row, so notna() alone would
        # count skipped rows as processed; only non-empty labels count.
        labels = df_result['Sentiment'].fillna('').astype(str).str.strip()
        total_processed = int((labels != '').sum())

        if total_processed > 0:
            positive_count = (df_result['Sentiment'] == 'Positive').sum()
            negative_count = (df_result['Sentiment'] == 'Negative').sum()
            neutral_count = (df_result['Sentiment'] == 'Neutral').sum()

            print("\n📊 SENTIMENT ANALYSIS SUMMARY")
            print("=" * 40)
            print(f"Total Comments: {len(df_result)}")
            print(f"Processed: {total_processed}")
            print(f"Positive: {positive_count} ({positive_count/total_processed*100:.1f}%)")
            print(f"Negative: {negative_count} ({negative_count/total_processed*100:.1f}%)")
            print(f"Neutral: {neutral_count} ({neutral_count/total_processed*100:.1f}%)")

            if 'Detected_Language' in df_result.columns:
                arabic_count = (df_result['Detected_Language'] == 'Arabic').sum()
                english_count = (df_result['Detected_Language'] == 'English').sum()
                print(f"\nLanguage Distribution:")
                print(f"Arabic: {arabic_count} ({arabic_count/total_processed*100:.1f}%)")
                print(f"English: {english_count} ({english_count/total_processed*100:.1f}%)")

        print("=" * 40)

    def save_results(self, df_result, filename="sentiment_results.xlsx"):
        """Save results to Excel file and start a browser download in Colab."""
        if df_result is not None:
            df_result.to_excel(filename, index=False)
            print(f"💾 Results saved to: {filename}")

            # Provide download link
            files.download(filename)
            print("📥 File download started...")
        else:
            print("❌ No results to save.")

    def analyze_single_comment(self, comment):
        """Analyze a single comment and print language, sentiment and confidence."""
        if not self.arabic_pipeline and not self.english_pipeline:
            print("❌ No models loaded. Please load models first.")
            return

        self.last_error = None
        cleaned_comment = self.clean_text(comment)
        language = self.detect_language(cleaned_comment)
        sentiment, confidence = self.analyze_sentiment(cleaned_comment, language)

        print(f"\n📝 Comment: {comment}")
        print(f"🔍 Cleaned: {cleaned_comment}")
        print(f"🌐 Language: {language.title()}")
        print(f"😊 Sentiment: {sentiment}")
        print(f"📊 Confidence: {confidence:.2%}")
        if sentiment == "Error" and self.last_error:
            print(f"⚠️ Reason: {self.last_error}")

# Cell 3: Initialize the analyzer
analyzer = ColabSentimentAnalyzer()

# Cell 4: Load models (this will take a few minutes)
print("Select models to load:")
print("Arabic models: AraBERT v2 (recommended), XLM-RoBERTa, CAMeLBERT")
print("English models: RoBERTa (recommended), DistilBERT, XLM-RoBERTa")

# Load default models (you can change these)
analyzer.load_models(arabic_model="AraBERT v2", english_model="RoBERTa")

# Cell 5: Upload Excel file
filename = analyzer.upload_excel()

# Cell 6: Preview data
analyzer.show_preview(10)

# Cell 7: Process Excel file
# Defined up front so Cell 8 can test it directly even when no file was loaded.
results = None
if analyzer.df is not None:
    # Show available columns
    print("Available columns:", list(analyzer.df.columns))

    # YOU NEED TO SPECIFY THE COLUMN NAME HERE
    comments_column = "comments"  # CHANGE THIS to your actual column name

    # Spreadsheet headers often differ from the configured name only by case
    # or surrounding spaces (e.g. 'Comments'); fall back to such a column when
    # exactly one matches, instead of aborting the whole run.
    if comments_column not in analyzer.df.columns:
        wanted_name = comments_column.strip().lower()
        close_matches = [
            column for column in analyzer.df.columns
            if str(column).strip().lower() == wanted_name
        ]
        if len(close_matches) == 1:
            print(f"ℹ️ Column '{comments_column}' not found; using '{close_matches[0]}' instead.")
            comments_column = close_matches[0]

    # Process the data
    results = analyzer.process_excel(
        comments_column=comments_column,  # Column containing comments
        add_language=True,               # Add language detection column
        add_confidence=True,             # Add confidence score column
        clean_text=True,                 # Clean text (remove URLs, etc.)
        skip_empty=True                  # Skip empty rows
    )

    # Show sample results
    if results is not None:
        print("\n📋 Sample Results:")
        display(results[['Sentiment', 'Confidence', 'Detected_Language']].head(10))

# Cell 8: Save results
if results is not None:
    analyzer.save_results(results, "sentiment_analysis_results.xlsx")

# Cell 9: Test single comment analysis
print("🧪 Testing single comment analysis:")
test_comment_arabic = "هذا المنتج رائع جداً وأنصح الجميع بشرائه"
test_comment_english = "This product is amazing and I highly recommend it!"

analyzer.analyze_single_comment(test_comment_arabic)
analyzer.analyze_single_comment(test_comment_english)

# Cell 10: Interactive analysis (optional)
def interactive_analysis():
    """Show a small widget UI (text area + button) for analyzing one comment.

    Clicking "Analyze" runs ``analyzer.analyze_single_comment`` on the text
    area's content and shows the printed result in the output area below it.
    """
    comment_input = widgets.Textarea(
        # Start empty: a pre-filled value hides the placeholder and would be
        # analyzed as if it were a real comment when the button is clicked.
        value='',
        placeholder='Type a comment in Arabic or English',
        description='Comment:',
        layout=widgets.Layout(width='70%', height='100px')
    )

    analyze_button = widgets.Button(description="Analyze", button_style='primary')
    output_area = widgets.Output()

    def on_analyze_click(b):
        # `b` is the clicked button (passed by ipywidgets); it is not needed here.
        with output_area:
            clear_output()
            if comment_input.value.strip():
                analyzer.analyze_single_comment(comment_input.value)
            else:
                # Give feedback instead of silently doing nothing.
                print("⚠️ Please enter a comment first.")

    analyze_button.on_click(on_analyze_click)

    display(widgets.VBox([comment_input, analyze_button, output_area]))

# Uncomment the line below to enable interactive analysis
# interactive_analysis()

# Closing banner: one line per capability the notebook now offers.
closing_lines = (
    "\n🎉 Setup complete! You can now:",
    "1. Upload Excel files and process them",
    "2. Analyze individual comments",
    "3. Download results with sentiment labels",
)
for closing_line in closing_lines:
    print(closing_line)

Select models to load:
Arabic models: AraBERT v2 (recommended), XLM-RoBERTa, CAMeLBERT
English models: RoBERTa (recommended), DistilBERT, XLM-RoBERTa
🔄 Loading models...
Loading Arabic model: AraBERT v2


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


✅ Arabic model loaded successfully
Loading English model: RoBERTa


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


✅ English model loaded successfully
🎉 All models ready!
📁 Please upload your Excel file:


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Saving Book 1.xlsx to Book 1.xlsx
✅ File uploaded: Book 1.xlsx
📊 Excel loaded: 4 rows, 1 columns
📋 Columns: ['Comments']
📋 Data Preview:


Unnamed: 0,Comments
0,المنتج سيء جداً ولا أنصح به أبداً
1,This product is amazing and works perfectly!
2,This is the worst product I've ever bought
3,هذا المنتج رائع جداً وأنصح الجميع بشرائه


Available columns: ['Comments']
❌ Column 'comments' not found. Available columns: ['Comments']
🧪 Testing single comment analysis:

📝 Comment: هذا المنتج رائع جداً وأنصح الجميع بشرائه
🔍 Cleaned: هذا المنتج رائع جداً وأنصح الجميع بشرائه
🌐 Language: Arabic
😊 Sentiment: Neutral
📊 Confidence: 50.02%

📝 Comment: This product is amazing and I highly recommend it!
🔍 Cleaned: This product is amazing and I highly recommend it!
🌐 Language: English
😊 Sentiment: positive
📊 Confidence: 98.70%

🎉 Setup complete! You can now:
1. Upload Excel files and process them
2. Analyze individual comments
3. Download results with sentiment labels


In [None]:
# STEP 2: Import libraries
import pandas as pd
import torch
from transformers import pipeline
import re
from langdetect import detect
from google.colab import files
import warnings
warnings.filterwarnings("ignore")

# STEP 3: Define helper functions
def detect_language(text):
    """Detect if text is Arabic or English.

    Counts characters in the basic Arabic block (U+0600-U+06FF) and ASCII
    letters; returns 'arabic' only when Arabic characters are the strict
    majority. Empty/blank input, ties and any failure yield 'english'.
    """
    try:
        if not text or not str(text).strip():
            return 'english'

        text = str(text)
        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))

        return 'arabic' if arabic_chars > english_chars else 'english'
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; only ordinary errors should fall back to 'english'.
        return 'english'

def clean_text(text):
    """Strip URLs, @mentions and #hashtags from ``text`` and collapse whitespace.

    Missing (NaN/None) or falsy input yields an empty string.
    """
    if pd.isna(text) or not text:
        return ""

    cleaned = str(text)
    # First pattern drops URLs, second drops mentions and hashtags.
    for noise_pattern in (r'http\S+|www\S+|https\S+', r'@\w+|#\w+'):
        cleaned = re.sub(noise_pattern, '', cleaned)

    # Squeeze the gaps left behind into single spaces.
    return re.sub(r'\s+', ' ', cleaned).strip()

def standardize_sentiment(label):
    """Convert model labels to standard format.

    The lookup ignores case and surrounding whitespace because checkpoints
    disagree on casing (the English RoBERTa model used here emits 'positive' /
    'negative', which a case-sensitive lookup left unmapped). Labels that are
    not in the table are returned unchanged.
    """
    # NOTE(review): the LABEL_n entries assume a 3-class
    # negative/neutral/positive head - verify against the model's id2label.
    mapping = {
        'POSITIVE': 'Positive',
        'NEGATIVE': 'Negative',
        'NEUTRAL': 'Neutral',
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive'
    }
    return mapping.get(str(label).strip().upper(), label)

# STEP 4: Load models
print("🔄 Loading models...")

# Decide the device once: first GPU when CUDA is available, otherwise CPU (-1).
PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1

# Load Arabic model (AraBERT)
# NOTE(review): transformers reports that the classifier head of
# "aubmindlab/bert-base-arabertv2" is newly initialised ("You should probably
# TRAIN this model"), so its sentiment predictions are not meaningful. Replace
# it with a checkpoint fine-tuned for Arabic sentiment.
try:
    arabic_model = pipeline(
        "sentiment-analysis",
        model="aubmindlab/bert-base-arabertv2",
        device=PIPELINE_DEVICE
    )
    print("✅ Arabic model loaded (AraBERT)")
except Exception as e:
    print(f"❌ Error loading Arabic model: {e}")
    arabic_model = None

# Load English model (RoBERTa)
try:
    english_model = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        device=PIPELINE_DEVICE
    )
    print("✅ English model loaded (RoBERTa)")
except Exception as e:
    print(f"❌ Error loading English model: {e}")
    english_model = None

# Only announce success when loading actually succeeded.
if arabic_model is not None and english_model is not None:
    print("🎉 Models ready!")
elif arabic_model is not None or english_model is not None:
    print("⚠️ Only one model loaded; comments in the other language will be labelled 'Unknown'.")
else:
    print("❌ No models could be loaded; every comment will be labelled 'Unknown'.")

# STEP 5: Upload Excel file
print("\n📁 Upload your Excel file:")
uploaded = files.upload()

if not uploaded:
    # Without an upload `df` is either undefined (NameError below) or a stale
    # frame from an earlier run; stop here with a clear message instead.
    raise RuntimeError("No file was uploaded. Re-run this cell and choose an Excel file.")

filename = list(uploaded.keys())[0]
print(f"✅ File uploaded: {filename}")

# Read Excel file
df = pd.read_excel(filename)
print(f"📊 Data loaded: {len(df)} rows, {len(df.columns)} columns")
print(f"📋 Columns: {list(df.columns)}")

# Show preview
print("\n📋 Data Preview:")
display(df.head())

# STEP 6: Configure processing
print("\n⚙️  CONFIGURATION:")
print("Available columns:", list(df.columns))

# CHANGE THIS: Specify your comments column name
COMMENTS_COLUMN = input("Enter the name of your comments column: ").strip()

# The later steps check COMMENTS_COLUMN again and skip themselves when it is
# not a column of `df`, so an invalid name is only reported here.
if COMMENTS_COLUMN not in df.columns:
    print(f"❌ Column '{COMMENTS_COLUMN}' not found!")
    print("Available columns:", list(df.columns))
else:
    print(f"✅ Using column: '{COMMENTS_COLUMN}'")

# STEP 7: Process the data
if COMMENTS_COLUMN in df.columns:
    print(f"\n🚀 Processing {len(df)} comments...")

    # Initialize result columns (rows that are skipped keep these defaults)
    df['Sentiment'] = ''
    df['Confidence'] = 0.0
    df['Language'] = ''

    processed = 0       # rows visited, including skipped ones
    skipped_count = 0   # rows with no usable text
    error_count = 0     # rows where the model call failed
    first_error = None  # kept so failures can be diagnosed afterwards

    for index, comment in df[COMMENTS_COLUMN].items():
        # Skip empty comments
        if pd.isna(comment) or not str(comment).strip():
            processed += 1
            skipped_count += 1
            continue

        # Clean text; nothing may be left (e.g. the comment was only a URL)
        cleaned_comment = clean_text(comment)
        if not cleaned_comment:
            processed += 1
            skipped_count += 1
            continue

        # Detect language
        language = detect_language(cleaned_comment)

        # Analyze sentiment based on language
        try:
            # truncation/max_length keep over-long comments from failing
            # inside the model.
            # NOTE(review): 512 is assumed to fit both checkpoints - confirm.
            if language == 'arabic' and arabic_model:
                result = arabic_model(cleaned_comment, truncation=True, max_length=512)[0]
            elif language == 'english' and english_model:
                result = english_model(cleaned_comment, truncation=True, max_length=512)[0]
            else:
                result = {'label': 'Unknown', 'score': 0.0}

            sentiment = standardize_sentiment(result['label'])
            confidence = result['score']

            # Update dataframe
            df.at[index, 'Sentiment'] = sentiment
            df.at[index, 'Confidence'] = confidence
            df.at[index, 'Language'] = language.title()

        except Exception as e:
            # Mark the row but remember why, instead of discarding the error.
            error_count += 1
            if first_error is None:
                first_error = e
            df.at[index, 'Sentiment'] = 'Error'
            df.at[index, 'Confidence'] = 0.0
            df.at[index, 'Language'] = language.title()

        processed += 1

        # Progress update
        if processed % 100 == 0:
            print(f"⏳ Processed {processed}/{len(df)} comments...")

    print(f"✅ Processing complete! Processed {processed} comments.")
    if skipped_count:
        print(f"ℹ️ {skipped_count} empty comment(s) were skipped.")
    if error_count:
        print(f"⚠️ {error_count} comment(s) could not be analyzed. First error: {first_error}")

# STEP 8: Show results summary
if 'Sentiment' in df.columns:
    # 'Sentiment' is initialised to '' for every row, so notna() would count
    # skipped rows as processed; only rows with a non-empty label count.
    sentiment_labels = df['Sentiment'].fillna('').astype(str).str.strip()
    total_processed = int((sentiment_labels != '').sum())

    if total_processed > 0:
        # Compare case-insensitively: some models emit 'positive', others 'Positive'.
        normalized_labels = sentiment_labels.str.lower()
        positive_count = (normalized_labels == 'positive').sum()
        negative_count = (normalized_labels == 'negative').sum()
        neutral_count = (normalized_labels == 'neutral').sum()

        print("\n📊 RESULTS SUMMARY")
        print("=" * 30)
        print(f"Total Comments: {len(df)}")
        print(f"Processed: {total_processed}")
        print(f"Positive: {positive_count} ({positive_count/total_processed*100:.1f}%)")
        print(f"Negative: {negative_count} ({negative_count/total_processed*100:.1f}%)")
        print(f"Neutral: {neutral_count} ({neutral_count/total_processed*100:.1f}%)")

        arabic_count = (df['Language'] == 'Arabic').sum()
        english_count = (df['Language'] == 'English').sum()
        print(f"\nLanguage Distribution:")
        print(f"Arabic: {arabic_count} ({arabic_count/total_processed*100:.1f}%)")
        print(f"English: {english_count} ({english_count/total_processed*100:.1f}%)")
        print("=" * 30)

        # Show sample results
        print("\n📋 Sample Results:")
        sample_cols = [COMMENTS_COLUMN, 'Language', 'Sentiment', 'Confidence']
        display(df[sample_cols].head(10))

# STEP 9: Save results
output_filename = "sentiment_analysis_results.xlsx"

# Only export when the processing step actually ran; otherwise the download
# would be the untouched input under a misleading "results" name.
if 'Sentiment' in df.columns:
    df.to_excel(output_filename, index=False)
    print(f"\n💾 Results saved to: {output_filename}")

    # Download the file
    files.download(output_filename)
    print("📥 Download started!")
else:
    print("\n❌ No sentiment results to save. Run the processing step with a valid comments column first.")

# STEP 10: Test individual comments (optional)
print("\n🧪 Test individual comments:")

def analyze_comment(comment):
    """Clean, classify and print the sentiment of one comment.

    The comment is cleaned, its language detected, and the matching
    module-level model (``arabic_model`` / ``english_model``) is applied.
    When no model matches, the label is 'Unknown' with score 0.0. Results are
    printed; nothing is returned. Any exception is printed, not raised.
    """
    cleaned = clean_text(comment)
    language = detect_language(cleaned)

    try:
        # Choose the model for the detected language, if it was loaded.
        if language == 'arabic' and arabic_model:
            chosen_model = arabic_model
        elif language == 'english' and english_model:
            chosen_model = english_model
        else:
            chosen_model = None

        if chosen_model is None:
            prediction = {'label': 'Unknown', 'score': 0.0}
        else:
            prediction = chosen_model(cleaned)[0]

        sentiment = standardize_sentiment(prediction['label'])
        confidence = prediction['score']

        print(f"Comment: {comment}")
        print(f"Language: {language.title()}")
        print(f"Sentiment: {sentiment}")
        print(f"Confidence: {confidence:.2%}")
        print("-" * 40)

    except Exception as e:
        print(f"Error analyzing comment: {e}")

# Test with sample comments.
# The trailing notes give the language and the sentiment each sample is meant
# to express (the intended answer, not necessarily what the models return).
test_comments = [
    # Arabic positive
    "هذا المنتج رائع جداً وأنصح الجميع بشرائه",
    # Arabic negative
    "المنتج سيء جداً ولا أنصح به أبداً",
    # English positive
    "This product is amazing and works perfectly!",
    # English negative
    "This is the worst product I've ever bought",
]

for comment in test_comments:
    analyze_comment(comment)

print("🎉 Analysis complete!")

🔄 Loading models...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


✅ Arabic model loaded (AraBERT)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


✅ English model loaded (RoBERTa)
🎉 Models ready!

📁 Upload your Excel file:


Saving Book 1.xlsx to Book 1 (1).xlsx
✅ File uploaded: Book 1 (1).xlsx
📊 Data loaded: 4 rows, 1 columns
📋 Columns: ['Comments']

📋 Data Preview:


Unnamed: 0,Comments
0,المنتج سيء جداً ولا أنصح به أبداً
1,This product is amazing and works perfectly!
2,This is the worst product I've ever bought
3,هذا المنتج رائع جداً وأنصح الجميع بشرائه



⚙️  CONFIGURATION:
Available columns: ['Comments']
Enter the name of your comments column: Comments
✅ Using column: 'Comments'

🚀 Processing 4 comments...
✅ Processing complete! Processed 4 comments.

📊 RESULTS SUMMARY
Total Comments: 4
Processed: 4
Positive: 0 (0.0%)
Negative: 2 (50.0%)
Neutral: 0 (0.0%)

Language Distribution:
Arabic: 2 (50.0%)
English: 2 (50.0%)

📋 Sample Results:


Unnamed: 0,Comments,Language,Sentiment,Confidence
0,المنتج سيء جداً ولا أنصح به أبداً,Arabic,Negative,0.701022
1,This product is amazing and works perfectly!,English,positive,0.987398
2,This is the worst product I've ever bought,English,negative,0.951403
3,هذا المنتج رائع جداً وأنصح الجميع بشرائه,Arabic,Negative,0.777015



💾 Results saved to: sentiment_analysis_results.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Download started!

🧪 Test individual comments:
Comment: هذا المنتج رائع جداً وأنصح الجميع بشرائه
Language: Arabic
Sentiment: Negative
Confidence: 77.70%
----------------------------------------
Comment: المنتج سيء جداً ولا أنصح به أبداً
Language: Arabic
Sentiment: Negative
Confidence: 70.10%
----------------------------------------
Comment: This product is amazing and works perfectly!
Language: English
Sentiment: positive
Confidence: 98.74%
----------------------------------------
Comment: This is the worst product I've ever bought
Language: English
Sentiment: negative
Confidence: 95.14%
----------------------------------------
🎉 Analysis complete!


In [None]:
# STREAMING SENTIMENT ANALYSIS WITH ARABIC TO ENGLISH TRANSLATION
# This code monitors an Excel file for new comments and processes them in real-time

import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import re
import time
import os
from datetime import datetime
import threading
import warnings
from google.colab import files
import hashlib

warnings.filterwarnings("ignore")

class StreamingSentimentAnalyzer:
    """Score comments from an Excel workbook, translating Arabic to English first.

    Typical use: ``load_models()`` -> ``setup_files()`` -> ``process_new_rows()``
    for a one-off pass, or ``start_streaming()`` to poll the input workbook on
    an interval. Each processed input row is written to the output workbook
    together with the cleaned text, detected language, translation, sentiment
    label, confidence and a timestamp.
    """

    # Upper bound, in characters, for a single translation request.
    TRANSLATION_CHUNK_CHARS = 500
    # Token cap passed to the sentiment pipeline; longer inputs are truncated
    # rather than failing inside the model and being reported as "Error".
    SENTIMENT_MAX_TOKENS = 512
    # While streaming, the wait between polls is sliced into steps of this many
    # seconds so that stop_streaming() takes effect promptly.
    STOP_POLL_SECONDS = 1.0

    # Raw pipeline labels (compared upper-cased) -> labels used in the output.
    LABEL_MAPPING = {
        'POSITIVE': 'Positive',
        'NEGATIVE': 'Negative',
        'NEUTRAL': 'Neutral',
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive'
    }

    def __init__(self):
        self.sentiment_pipeline = None
        self.translation_pipeline = None
        self.input_file = None
        self.output_file = None
        self.last_processed_row = 0  # Number of input rows already consumed
        self.comments_column = None
        self.is_running = False
        self.processed_hashes = set()  # Track processed comments to avoid duplicates

    def load_models(self):
        """Load the Arabic->English translation and English sentiment pipelines.

        Uses GPU device 0 when CUDA is available, otherwise the CPU.

        Raises:
            Exception: whatever the model loading raised, after logging it.
        """
        print("🔄 Loading models...")

        try:
            device = 0 if torch.cuda.is_available() else -1

            # Load Arabic to English translation model
            print("Loading Arabic-English translation model...")
            self.translation_pipeline = pipeline(
                "translation",
                model="Helsinki-NLP/opus-mt-ar-en",
                device=device
            )
            print("✅ Translation model loaded (Arabic → English)")

            # Load English sentiment analysis model (more reliable than multilingual)
            print("Loading English sentiment analysis model...")
            self.sentiment_pipeline = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                device=device
            )
            print("✅ Sentiment model loaded (RoBERTa)")

        except Exception as e:
            print(f"❌ Error loading models: {str(e)}")
            raise  # bare raise keeps the original traceback

        print("🎉 All models ready!")

    def detect_language(self, text):
        """Classify text as ``'arabic'`` or ``'english'`` by counting letters.

        Returns ``'arabic'`` only when characters in U+0600-U+06FF outnumber
        ASCII letters; empty input, ties and any failure yield ``'english'``.
        """
        try:
            if not text or not str(text).strip():
                return 'english'

            text = str(text)
            arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
            english_chars = len(re.findall(r'[a-zA-Z]', text))

            return 'arabic' if arabic_chars > english_chars else 'english'
        except Exception:
            # Best-effort: fall back to English rather than abort the row.
            return 'english'

    def clean_text(self, text):
        """Strip URLs, @mentions, #hashtags and redundant whitespace.

        Returns an empty string for missing (NaN/None) or empty input.
        """
        if pd.isna(text) or not text:
            return ""

        text = str(text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    @staticmethod
    def _split_into_chunks(text, max_length):
        """Split text into pieces of at most ``max_length`` characters on whitespace.

        Words are kept whole; only a single word longer than ``max_length`` is
        cut at fixed offsets.
        """
        chunks = []
        current = ""
        for word in text.split():
            # A word that cannot fit in any chunk is hard-split.
            while len(word) > max_length:
                if current:
                    chunks.append(current)
                    current = ""
                chunks.append(word[:max_length])
                word = word[max_length:]
            if not word:
                continue

            candidate = f"{current} {word}" if current else word
            if len(candidate) > max_length:
                chunks.append(current)
                current = word
            else:
                current = candidate

        if current:
            chunks.append(current)
        return chunks

    def translate_arabic_to_english(self, arabic_text):
        """Translate Arabic text to English with the loaded translation pipeline.

        Text longer than ``TRANSLATION_CHUNK_CHARS`` is translated in
        word-aligned chunks whose translations are joined with spaces.

        Returns:
            The translation, ``""`` for empty input, or the original text
            unchanged if translation fails (the error is logged).
        """
        try:
            if not arabic_text or len(arabic_text.strip()) == 0:
                return ""

            # Split long text into chunks (model has token limits)
            max_length = self.TRANSLATION_CHUNK_CHARS
            if len(arabic_text) <= max_length:
                return self.translation_pipeline(arabic_text)[0]['translation_text']

            translated_chunks = [
                self.translation_pipeline(chunk)[0]['translation_text']
                for chunk in self._split_into_chunks(arabic_text, max_length)
            ]
            return ' '.join(translated_chunks)

        except Exception as e:
            print(f"⚠️  Translation error: {str(e)}")
            return arabic_text  # Return original if translation fails

    def analyze_sentiment(self, text):
        """Score English text with the loaded sentiment pipeline.

        Returns:
            ``(label, confidence)``. The label is ``'Positive'``,
            ``'Negative'`` or ``'Neutral'`` for any known pipeline label
            regardless of its letter case; unknown labels pass through as-is.
            Empty input gives ``("Neutral", 0.0)``; a pipeline failure gives
            ``("Error", 0.0)`` after logging.
        """
        try:
            if not text or len(text.strip()) == 0:
                return "Neutral", 0.0

            result = self.sentiment_pipeline(
                text,
                truncation=True,
                max_length=self.SENTIMENT_MAX_TOKENS
            )[0]

            # Standardize labels. The lookup is case-insensitive because the
            # loaded checkpoint reports lower-case labels ("positive", ...).
            raw_label = str(result['label'])
            sentiment = self.LABEL_MAPPING.get(raw_label.upper(), raw_label)
            confidence = result['score']

            return sentiment, confidence

        except Exception as e:
            print(f"⚠️  Sentiment analysis error: {str(e)}")
            return "Error", 0.0

    def get_comment_hash(self, comment):
        """Return the MD5 hex digest of ``str(comment)``, used only for de-duplication."""
        return hashlib.md5(str(comment).encode()).hexdigest()

    @staticmethod
    def _timestamp():
        """Current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def _empty_result(self, comment):
        """Result record for a comment with nothing to analyse."""
        return {
            'original_text': comment,
            'cleaned_text': '',
            'language': 'unknown',
            'translated_text': '',
            'sentiment': '',
            'confidence': 0.0,
            'processed_at': self._timestamp()
        }

    def process_comment(self, comment):
        """Clean one comment, translate it if Arabic, and score its sentiment.

        Returns:
            dict with keys ``original_text``, ``cleaned_text``, ``language``,
            ``translated_text``, ``sentiment``, ``confidence`` and
            ``processed_at``. Comments that are missing, blank, or empty after
            cleaning get language ``'unknown'`` and an empty sentiment.
        """
        if pd.isna(comment) or not str(comment).strip():
            return self._empty_result(comment)

        # Clean text
        cleaned_comment = self.clean_text(comment)
        if not cleaned_comment:
            return self._empty_result(comment)

        # Detect language
        language = self.detect_language(cleaned_comment)

        # Translate if Arabic; English text is scored as-is
        if language == 'arabic':
            translated_text = self.translate_arabic_to_english(cleaned_comment)
        else:
            translated_text = cleaned_comment

        # Analyze sentiment
        sentiment, confidence = self.analyze_sentiment(translated_text)

        return {
            'original_text': comment,
            'cleaned_text': cleaned_comment,
            'language': language.title(),
            'translated_text': translated_text,
            'sentiment': sentiment,
            'confidence': confidence,
            'processed_at': self._timestamp()
        }

    def setup_files(self, input_filename, comments_column, output_filename=None):
        """Register the input workbook and create an empty output workbook.

        Args:
            input_filename: path of the Excel file to read comments from.
            comments_column: name of the column holding the comment text.
            output_filename: path for results; defaults to a timestamped
                ``streaming_results_*.xlsx`` name.

        Raises:
            FileNotFoundError: if the input file does not exist.
            ValueError: if ``comments_column`` is not a column of the input.
            Exception: any read/write error, after logging it.
        """
        self.input_file = input_filename
        self.comments_column = comments_column

        if output_filename is None:
            self.output_file = f"streaming_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        else:
            self.output_file = output_filename

        # Check if input file exists
        if not os.path.exists(self.input_file):
            raise FileNotFoundError(f"Input file '{self.input_file}' not found!")

        # Read initial data and setup output file
        try:
            initial_df = pd.read_excel(self.input_file)
            print(f"📊 Input file loaded: {len(initial_df)} rows")
            print(f"📋 Columns: {list(initial_df.columns)}")

            if self.comments_column not in initial_df.columns:
                raise ValueError(f"Column '{self.comments_column}' not found in input file!")

            # Create output file with headers
            output_columns = list(initial_df.columns) + [
                'Original_Text', 'Cleaned_Text', 'Detected_Language',
                'Translated_Text', 'Sentiment', 'Confidence', 'Processed_At'
            ]
            empty_df = pd.DataFrame(columns=output_columns)
            empty_df.to_excel(self.output_file, index=False)

            # The output was just recreated empty, so progress from any earlier
            # run of this instance no longer applies.
            self.last_processed_row = 0
            self.processed_hashes = set()

            print(f"✅ Output file created: {self.output_file}")

        except Exception as e:
            print(f"❌ Error setting up files: {str(e)}")
            raise

    def _count_output_rows(self):
        """Number of data rows in the output workbook; 0 if missing or unreadable."""
        if not self.output_file or not os.path.exists(self.output_file):
            return 0
        try:
            return len(pd.read_excel(self.output_file))
        except Exception:
            return 0

    def _append_results(self, new_results_df):
        """Rewrite the output workbook as its existing rows followed by the new ones."""
        frames = [new_results_df]
        if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
            existing_df = pd.read_excel(self.output_file)
            if not existing_df.empty:
                frames.insert(0, existing_df)

        combined_df = pd.concat(frames, ignore_index=True) if len(frames) > 1 else new_results_df
        combined_df.to_excel(self.output_file, index=False)

    def process_new_rows(self):
        """Process input rows that have not been consumed yet and append the results.

        Rows whose comment text was already processed are skipped. Errors are
        logged rather than raised; on failure neither the row cursor nor the
        duplicate set is advanced, so the same rows are retried on the next call.
        """
        try:
            # Read current input file
            current_df = pd.read_excel(self.input_file)

            # Resume point. The in-memory cursor also counts skipped
            # duplicates, which never reach the output file; the output row
            # count is kept as a lower bound so an existing output file still
            # acts as a resume point.
            processed_count = max(self.last_processed_row, self._count_output_rows())

            if len(current_df) <= processed_count:
                print("⏳ No new rows found...")
                return

            new_rows = current_df.iloc[processed_count:]
            print(f"🔍 Found {len(new_rows)} new rows to process...")

            # Process each new row
            results = []
            batch_hashes = set()  # committed only after a successful save
            for index, row in new_rows.iterrows():
                row_number = index + 1
                comment = row[self.comments_column]
                comment_hash = self.get_comment_hash(comment)

                # Skip if already processed (duplicate check)
                if comment_hash in self.processed_hashes or comment_hash in batch_hashes:
                    print(f"⏭️  Skipping duplicate comment at row {row_number}")
                    continue

                print(f"🔄 Processing row {row_number}...")

                # Process comment
                result = self.process_comment(comment)

                # Combine original row data with results
                new_row = row.to_dict()
                new_row.update({
                    'Original_Text': result['original_text'],
                    'Cleaned_Text': result['cleaned_text'],
                    'Detected_Language': result['language'],
                    'Translated_Text': result['translated_text'],
                    'Sentiment': result['sentiment'],
                    'Confidence': result['confidence'],
                    'Processed_At': result['processed_at']
                })

                results.append(new_row)
                batch_hashes.add(comment_hash)

                # Show progress
                print(f"✅ Row {row_number} processed: {result['language']} → {result['sentiment']} ({result['confidence']:.2%})")

            new_results_df = None
            if results:
                new_results_df = pd.DataFrame(results)
                self._append_results(new_results_df)
                print(f"💾 {len(results)} new results saved to {self.output_file}")

            # Everything up to the end of the input is now consumed.
            self.processed_hashes.update(batch_hashes)
            self.last_processed_row = len(current_df)

            if new_results_df is not None:
                # Show summary for new results
                self.show_processing_summary(new_results_df)

        except Exception as e:
            print(f"❌ Error processing new rows: {str(e)}")

    def show_processing_summary(self, df):
        """Print sentiment and language counts for a frame of result rows.

        ``df`` must have ``Sentiment`` and ``Detected_Language`` columns.
        Sentiment labels are compared case-insensitively. Prints nothing for
        an empty frame.
        """
        if len(df) == 0:
            return

        processed_count = len(df)
        sentiments = df['Sentiment'].astype(str).str.strip().str.lower()
        positive_count = (sentiments == 'positive').sum()
        negative_count = (sentiments == 'negative').sum()
        neutral_count = (sentiments == 'neutral').sum()

        arabic_count = (df['Detected_Language'] == 'Arabic').sum()
        english_count = (df['Detected_Language'] == 'English').sum()

        print(f"\n📊 PROCESSING SUMMARY")
        print("=" * 30)
        print(f"New Comments Processed: {processed_count}")
        print(f"Positive: {positive_count} ({positive_count/processed_count*100:.1f}%)")
        print(f"Negative: {negative_count} ({negative_count/processed_count*100:.1f}%)")
        print(f"Neutral: {neutral_count} ({neutral_count/processed_count*100:.1f}%)")
        print(f"Arabic: {arabic_count} ({arabic_count/processed_count*100:.1f}%)")
        print(f"English: {english_count} ({english_count/processed_count*100:.1f}%)")
        print("=" * 30)

    def start_streaming(self, check_interval=30):
        """Poll the input file every ``check_interval`` seconds until stopped.

        Blocks the calling thread. The loop ends when ``stop_streaming()``
        clears ``is_running``, on ``KeyboardInterrupt``, or on an unexpected
        error (which is logged).
        """
        self.is_running = True
        print(f"🚀 Starting streaming analysis...")
        print(f"📁 Input file: {self.input_file}")
        print(f"📁 Output file: {self.output_file}")
        print(f"⏰ Check interval: {check_interval} seconds")
        print("🔄 Monitoring for new comments... (Press Ctrl+C to stop)")

        try:
            while self.is_running:
                self.process_new_rows()

                # Sleep in short slices so a stop request is noticed without
                # waiting out the whole interval.
                waited = 0.0
                while self.is_running and waited < check_interval:
                    step = min(self.STOP_POLL_SECONDS, check_interval - waited)
                    time.sleep(step)
                    waited += step

        except KeyboardInterrupt:
            print("\n⏹️  Stopping streaming analysis...")
            self.is_running = False
        except Exception as e:
            print(f"❌ Streaming error: {str(e)}")
            self.is_running = False

    def stop_streaming(self):
        """Ask the polling loop started by ``start_streaming`` to exit."""
        self.is_running = False
        print("⏹️  Streaming stopped.")

# USAGE EXAMPLE AND SETUP
# Interactive driver: builds the analyzer, loads the models, asks the user to
# upload a workbook, processes the rows already in it, and optionally starts
# background polling. The names assigned here (`analyzer`, `uploaded`,
# `input_filename`, ...) are module-level and stay visible to later cells.

# Initialize the analyzer
print("🔧 Initializing Streaming Sentiment Analyzer...")
analyzer = StreamingSentimentAnalyzer()

# Load models
analyzer.load_models()

# Upload input Excel file
print("\n📁 Please upload your input Excel file (the one that gets updated):")
uploaded = files.upload()

# Nothing below runs if the upload came back empty.
if uploaded:
    # Only the first uploaded file is used; any others are ignored.
    input_filename = list(uploaded.keys())[0]
    print(f"✅ Input file uploaded: {input_filename}")

    # Read file to show columns
    temp_df = pd.read_excel(input_filename)
    print(f"📋 Available columns: {list(temp_df.columns)}")

    # Get comments column name (validated inside setup_files, which raises if
    # the column is missing)
    comments_column = input("Enter the name of your comments column: ").strip()

    # Setup files
    try:
        analyzer.setup_files(input_filename, comments_column)

        # Process any existing data first
        print("\n🔄 Processing existing data...")
        analyzer.process_new_rows()

        # Ask user about streaming; anything other than "y" means no
        start_streaming = input("\nStart streaming monitoring? (y/n): ").strip().lower()

        if start_streaming == 'y':
            # Blank or non-numeric input falls back to the 30-second default
            check_interval = input("Check interval in seconds (default 30): ").strip()
            check_interval = int(check_interval) if check_interval.isdigit() else 30

            # Start streaming in a separate thread so it doesn't block
            streaming_thread = threading.Thread(
                target=analyzer.start_streaming,
                args=(check_interval,)
            )
            # Daemon thread: it will not keep the interpreter alive on exit
            streaming_thread.daemon = True
            streaming_thread.start()

            print("✅ Streaming started! The analyzer will monitor for new comments.")
            print("📥 Download the results file anytime:")
            # This download is triggered once, right after the thread starts
            files.download(analyzer.output_file)

        else:
            print("✅ Initial processing complete!")
            print("📥 Downloading results file...")
            files.download(analyzer.output_file)

    except Exception as e:
        # Any failure in setup, processing or download is reported, not raised
        print(f"❌ Setup error: {str(e)}")

# Manual processing function
def process_single_comment(comment_text):
    """Run one comment through the module-level ``analyzer`` and print the outcome.

    Prints the original text, cleaned text, detected language, translation,
    sentiment and confidence, one per line. If no global named ``analyzer``
    exists, prints an error message instead. Returns ``None`` either way.
    """
    # Guard: nothing to do until the setup code has created `analyzer`.
    if 'analyzer' not in globals():
        print("❌ Analyzer not initialized!")
        return

    outcome = analyzer.process_comment(comment_text)

    # (label, result key) pairs, printed in this order.
    text_fields = (
        ("\n📝 Original", 'original_text'),
        ("🔍 Cleaned", 'cleaned_text'),
        ("🌐 Language", 'language'),
        ("🔄 Translated", 'translated_text'),
        ("😊 Sentiment", 'sentiment'),
    )
    for label, key in text_fields:
        print(f"{label}: {outcome[key]}")

    # Confidence is the only field shown as a percentage.
    print(f"📊 Confidence: {outcome['confidence']:.2%}")

# Smoke test: one positive and one negative sample in each language
print("\n🧪 Testing with sample comments:")
test_comments = [
    "هذا المنتج رائع جداً وأنصح الجميع بشرائه",  # Arabic positive
    "المنتج سيء جداً ولا أنصح به أبداً",  # Arabic negative
    "This product is amazing!",  # English positive
    "Terrible quality, waste of money"  # English negative
]

for comment in test_comments:
    # Samples are skipped silently when no global `analyzer` exists
    if 'analyzer' not in globals():
        continue
    process_single_comment(comment)

# Closing notes, one per output line
print(
    "\n🎉 Setup complete!",
    "💡 The analyzer will continuously monitor your input file for new comments",
    "💡 All Arabic comments will be translated to English before sentiment analysis",
    "💡 Results are saved with original text, translation, and sentiment scores",
    sep="\n",
)

🔧 Initializing Streaming Sentiment Analyzer...
🔄 Loading models...
Loading Arabic-English translation model...


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

Device set to use cuda:0


✅ Translation model loaded (Arabic → English)
Loading English sentiment analysis model...


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Sentiment model loaded (RoBERTa)
🎉 All models ready!

📁 Please upload your input Excel file (the one that gets updated):


Saving Book 1.xlsx to Book 1.xlsx
✅ Input file uploaded: Book 1.xlsx
📋 Available columns: ['Comments']
Enter the name of your comments column: Comments
📊 Input file loaded: 4 rows
📋 Columns: ['Comments']
✅ Output file created: streaming_results_20250602_232511.xlsx

🔄 Processing existing data...
🔍 Found 4 new rows to process...
🔄 Processing row 1...
✅ Row 1 processed: Arabic → negative (94.62%)
🔄 Processing row 2...
✅ Row 2 processed: English → positive (98.74%)
🔄 Processing row 3...
✅ Row 3 processed: English → negative (95.14%)
🔄 Processing row 4...
✅ Row 4 processed: Arabic → positive (98.86%)
💾 4 new results saved to streaming_results_20250602_232511.xlsx

📊 PROCESSING SUMMARY
New Comments Processed: 4
Positive: 0 (0.0%)
Negative: 0 (0.0%)
Neutral: 0 (0.0%)
Arabic: 2 (50.0%)
English: 2 (50.0%)

Start streaming monitoring? (y/n): y


KeyboardInterrupt: Interrupted by user