In [3]:
pip install transformers torch pandas numpy scipy tqdm


Note: you may need to restart the kernel to use updated packages.


In [7]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)


In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the sentiment classifier used for the Arabic responses.
# polarity_scores_arabic reads logits[0] as one score per class in the order
# negative, neutral, positive, so the checkpoint must carry a 3-way
# sequence-classification head with that label order. The multilingual checkpoint
# below (Arabic is among its training languages) documents exactly that order on
# its model card. "facebook/mbart-large-cc25" loaded as a masked LM has no sentiment
# head: its logits are per-token vocabulary scores, not class scores.
SENTIMENT_CHECKPOINT = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer_sa = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model_sa = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT).to(device)


In [None]:
# Summarization backbone: the AraGPT2 (medium) causal language model and its tokenizer
tokenizer_sum, model_sum = (
    AutoTokenizer.from_pretrained("aubmindlab/aragpt2-medium"),
    AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-medium").to(device),
)


In [8]:
# Read the responses workbook (the path is relative to the kernel's working directory)
# and give its two columns fixed names
file_path = 'Desktop/First_tes.xlsx'
df = pd.read_excel(io=file_path)
df = df.set_axis(['S_ID', 'ConcatenatedResponse'], axis=1)


FileNotFoundError: [Errno 2] No such file or directory: 'Desktop/First_tes.xlsx'

In [None]:
def polarity_scores_arabic(text):
    """Score one text with the sentiment model and return class probabilities.

    Relies on the notebook globals ``tokenizer_sa``, ``model_sa`` and ``device``.
    The model is expected to emit one flat vector of class logits per input whose
    first three entries are, in order, negative, neutral and positive.

    Args:
        text: The text to classify.

    Returns:
        dict: ``{'arabic_neg': float, 'arabic_neu': float, 'arabic_pos': float}``
        holding softmax probabilities as plain Python floats. If anything fails
        (tokenization, the forward pass, or an unusable logits shape) the error is
        printed and all three values are ``0.0``. A real softmax never sums to
        zero, so callers can recognise the all-zero dict as a failure marker.
    """
    try:
        # Truncate so long responses cannot exceed the model's position limit
        # (the same 512-token cap summarize_text applies to its input).
        encoded_text = tokenizer_sa(
            text, return_tensors='pt', truncation=True, max_length=512
        ).to(device)

        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            output = model_sa(**encoded_text)

        # Model output objects expose .logits; plain tuples carry the logits first.
        logits = output.logits if hasattr(output, 'logits') else output[0]
        scores = logits[0].detach().cpu().numpy()

        # A classification head yields a flat vector of class logits. A 2-D result
        # (e.g. per-token vocabulary logits from a language-model head) cannot be
        # mapped to sentiment classes, so reject it instead of slicing rows out of it.
        if scores.ndim != 1:
            raise ValueError(
                f"Unexpected logits shape {scores.shape}; expected a flat vector of class scores"
            )
        if len(scores) < 3:
            raise ValueError("Unexpected number of score values")

        scores = softmax(scores)
        scores_dict = {
            'arabic_neg': float(scores[0]),
            'arabic_neu': float(scores[1]),
            'arabic_pos': float(scores[2])
        }
    except Exception as e:
        print(f"Error processing text: {text}, error: {e}")
        scores_dict = {
            'arabic_neg': 0.0,
            'arabic_neu': 0.0,
            'arabic_pos': 0.0
        }
    return scores_dict

def summarize_text(text):
    """Generate summary text for ``text`` with the causal language model.

    Relies on the notebook globals ``tokenizer_sum``, ``model_sum`` and ``device``.

    Args:
        text: Source text. It is prefixed with ``"summarize: "`` and truncated to
            512 tokens before generation.

    Returns:
        str: Only the newly generated text (the prompt is not echoed back), or the
        literal ``"Error in summarization"`` if anything fails; the error is
        printed in that case.
    """
    try:
        encoded = tokenizer_sum(
            "summarize: " + text, return_tensors='pt', max_length=512, truncation=True
        ).to(device)
        input_ids = encoded['input_ids']
        prompt_len = input_ids.shape[-1]

        # generate() needs a pad id for beam search; fall back to EOS when the
        # tokenizer defines no dedicated padding token.
        pad_id = tokenizer_sum.pad_token_id
        if pad_id is None:
            pad_id = tokenizer_sum.eos_token_id

        # For a causal LM, max_length/min_length count the prompt as well, so a
        # prompt of up to 512 tokens would leave no room under max_length=150.
        # Bound the number of *new* tokens instead.
        summary_ids = model_sum.generate(
            input_ids=input_ids,
            attention_mask=encoded['attention_mask'],
            max_new_tokens=150,
            min_new_tokens=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            pad_token_id=pad_id,
        )

        # The returned sequence starts with the prompt tokens; keep only what was generated.
        generated_ids = summary_ids[0][prompt_len:]
        summary = tokenizer_sum.decode(generated_ids, skip_special_tokens=True).strip()
    except Exception as e:
        print(f"Error summarizing text: {text}, error: {e}")
        summary = "Error in summarization"
    return summary


In [None]:
# Fresh accumulators: one record per processed row, plus the per-row summaries
results, individual_summaries = [], []


In [None]:

# Classify and summarize every response, collecting one record per row.

# Score key -> human-readable Arabic label
SENTIMENT_LABELS = {
    'arabic_neg': 'سلبي',    # Negative
    'arabic_neu': 'محايد',   # Neutral
    'arabic_pos': 'إيجابي',  # Positive
}

# Start from an empty list so re-running this cell cannot duplicate rows.
results = []

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    sentiment_scores = polarity_scores_arabic(row['ConcatenatedResponse'])

    scores_usable = isinstance(sentiment_scores, dict)
    if scores_usable:
        # Unwrap single-element arrays into plain scalars.
        sentiment_scores = {
            name: score.item() if isinstance(score, np.ndarray) and score.size == 1 else score
            for name, score in sentiment_scores.items()
        }
        # Usable scores are scalars and not all zero. Multi-element arrays mean the
        # model did not produce class scores, and the all-zero dict is what
        # polarity_scores_arabic returns after an error; neither may be turned
        # into a sentiment label.
        scores_usable = (
            all(np.ndim(score) == 0 for score in sentiment_scores.values())
            and any(sentiment_scores.values())
        )

    if not scores_usable:
        print(f"Error: Sentiment scores format is invalid for row {index}: {sentiment_scores}")
        continue  # Skip this row if the scores cannot be trusted

    # Pick the class with the highest score and map it to its Arabic label
    # (anything that is not negative/neutral is reported as positive, as before).
    predicted_sentiment = max(sentiment_scores, key=sentiment_scores.get)
    sentiment = SENTIMENT_LABELS.get(predicted_sentiment, 'إيجابي')

    # Summarize the text
    summary = summarize_text(row['ConcatenatedResponse'])

    # Append the results to the list
    results.append({
        'S_ID': row['S_ID'],
        'ConcatenatedResponse': row['ConcatenatedResponse'],
        'التصنيف': sentiment,  # Classification
        'الملخص': summary  # Summary
    })


In [None]:
 # Collect the per-row summaries from the finished results.
# Deriving them from `results` (instead of appending the loop's leftover `summary`
# variable) captures every row, and rows whose summarization failed are left out so
# the failure text does not leak into the combined summary.
individual_summaries = [
    record['الملخص']
    for record in results
    if record['الملخص'] != "Error in summarization"
]

# Create a new DataFrame from the results
results_df = pd.DataFrame(results)

# Save the individual results to an Excel file
output_file_path = 'E:/1.0.0.0.1 DEPI/Final Project/Docs/results.xlsx'
results_df.to_excel(output_file_path, index=False)

# Combine all individual summaries into one comprehensive summary
comprehensive_summary = summarize_text(' '.join(individual_summaries))

# Save the comprehensive summary to a text file
comprehensive_output_file_path = 'E:/1.0.0.0.1 DEPI/Final Project/Docs/comprehensive_summary.txt'
with open(comprehensive_output_file_path, 'w', encoding='utf-8') as file:
    file.write(comprehensive_summary)

# Display the comprehensive summary
print(comprehensive_summary)

In [None]:
# Reload the workbook for evaluation and name its four columns,
# the last of which holds the ground-truth sentiment
df = pd.read_excel(io='D:/Data/SQL/First_tes.xlsx')
df = df.set_axis(['S_ID', 'C_ID', 'ConcatenatedResponse', 'True_Sentiment'], axis=1)

# Assuming 'True_Sentiment' is your ground truth labels
# Create a function to get predictions
def get_predictions(row):
    """Return the Arabic sentiment label predicted for one DataFrame row.

    The row's 'ConcatenatedResponse' text is scored with polarity_scores_arabic;
    the highest-scoring key decides the label: negative -> 'سلبي',
    neutral -> 'محايد', anything else -> 'إيجابي'.
    """
    scores = polarity_scores_arabic(row['ConcatenatedResponse'])
    top_key, _ = max(scores.items(), key=lambda pair: pair[1])
    return {'arabic_neg': 'سلبي', 'arabic_neu': 'محايد'}.get(top_key, 'إيجابي')

# Predict a label for every row of the evaluation frame
df['Predicted_Sentiment'] = df.apply(get_predictions, axis=1)

# Encode gold and predicted labels as integers (labels absent from the map become NaN)
label_map = dict(zip(('سلبي', 'محايد', 'إيجابي'), range(3)))
y_true, y_pred = (
    df[column].map(label_map) for column in ('True_Sentiment', 'Predicted_Sentiment')
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Calculate metrics (support-weighted averages over the three classes).
# zero_division=0 keeps the score of a never-predicted class at 0, which is what the
# default does as well, but without emitting an UndefinedMetricWarning.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Confusion matrix for more insights.
# Passing the label set explicitly fixes the matrix at one row/column per class in
# label_map order (rows = true label, columns = predicted label), even when a
# class never occurs in the data or in the predictions.
conf_matrix = confusion_matrix(y_true, y_pred, labels=sorted(label_map.values()))
print(conf_matrix)