<a href="https://colab.research.google.com/github/ayadassouki/ayadassouki-ACL-Task-A-Sentiment-Analysis/blob/main/pre_fineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import google.generativeai as genai
import json
import time
import pandas as pd
from tqdm import tqdm
from getpass import getpass

# Prompt for the API key interactively (getpass does not echo the typed value),
# so the key is never written into the notebook source.
API_KEY = getpass('Enter your API key: ')
genai.configure(api_key=API_KEY)

# Model handle used by the generate_content() calls in the cells below.
model = genai.GenerativeModel("gemini-1.5-flash")  # NOTE(review): confirm 'gemini-1.5-flash' is still a valid model name


In [None]:
import pandas as pd
def load_data(filePath):
    """Read the CSV file at ``filePath`` and return it as a pandas DataFrame.

    The file is parsed with pandas' default ``read_csv`` settings.
    """
    return pd.read_csv(filePath)

In [None]:
import re

def clean_text(text):
  """Normalize one raw text value for prompting.

  Steps, in order: drop HTML tags, replace punctuation with spaces, drop
  stand-alone single ASCII letters, lowercase, collapse whitespace, trim.

  Args:
    text: The raw text. ``None`` and float NaN (how pandas represents a
      missing cell) are treated as empty; other non-string values are
      converted with ``str()``.

  Returns:
    The cleaned string (possibly empty).
  """
  # Missing values would otherwise make re.sub raise a TypeError.
  if text is None or (isinstance(text, float) and text != text):
    return ""
  text = str(text)

  # Remove HTML tags FIRST: once punctuation is stripped the angle brackets
  # are gone and the tag names (e.g. "br") would be left behind as words.
  text = re.sub(r"<[^>]*>", " ", text)

  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters (stand-alone ASCII letters such as "a" or "I")
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Lowercase the text
  text = text.lower()

  # Collapse runs of whitespace, then trim both ends
  text = re.sub(r"\s+", " ", text)
  return text.strip()

In [None]:
def clean_df(data):
  """Add a ``clean_text`` column holding ``clean_text()`` of each ``text`` value.

  The frame is modified in place (existing callers rely on that) and is also
  returned so the call can be chained or displayed as a cell's last expression.

  Args:
    data: DataFrame with a ``text`` column.

  Returns:
    The same DataFrame object, now with a ``clean_text`` column.
  """
  data['clean_text'] = [clean_text(raw) for raw in data['text'].to_list()]
  # The original ended with a bare ``data.head()``, which displays nothing
  # inside a function; returning the frame lets the caller display it.
  return data

In [None]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap ``text`` in a Markdown block quote for rich display.

  Every '•' is replaced with '  *', and every line (blank ones included)
  is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

# Used to securely store your API key
from google.colab import userdata

In [None]:
def add_pred_colunm(data):
  """Add empty-string prediction columns (``p_anger`` ... ``p_surprise``) to ``data``.

  The frame is modified in place and also returned, so the call can be
  chained or displayed as a cell's last expression.

  Args:
    data: DataFrame to receive the five ``p_*`` columns.

  Returns:
    The same DataFrame object.
  """
  pred_emotion = ['p_anger', 'p_fear', 'p_joy', 'p_sadness', 'p_surprise']
  for emotion in pred_emotion:
    # Placeholder value; the model is asked to replace it with 0 or 1.
    data[emotion] = ''
  # A bare ``data.head()`` here displayed nothing; return the frame instead.
  return data

In [None]:
def df_to_json(data):
  """Serialize the text and prediction columns of ``data`` to a JSON string.

  The result is a JSON array with one object per row (``orient='records'``)
  containing ``clean_text`` and the five ``p_*`` columns.
  """
  export_columns = ['clean_text','p_anger', 'p_fear', 'p_joy', 'p_sadness', 'p_surprise']
  selected = data[export_columns]
  return selected.to_json(orient='records')

In [None]:
# Pilot run on the first 100 rows of the dataset.
full_data = load_data('eng.csv')
# .copy() gives an independent frame: clean_df/add_pred_colunm add columns in
# place, and doing that on the result of head() raises SettingWithCopyWarning
# and leaves it ambiguous whether full_data is affected.
data = full_data.head(100).copy()
clean_df(data)
add_pred_colunm(data)
# JSON array (one record per row) that is embedded in the prompt.
json_data = df_to_json(data)
data.head()


In [None]:
# Display the JSON string that will be embedded in the prompt (one record per row of `data`).
json_data

In [None]:
# json_data is a string here, so this is the payload size in characters, not the number of records.
len(json_data)

In [None]:
# Single-request prompt: the JSON records in `json_data` are embedded between
# the triple-backtick fences, and the model is asked to return the same JSON
# with the p_* fields set to 0 or 1.
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into specific emotions.
Help me classify customer reviews into the following emotions: Anger, Fear, Surprise, Sadness, and Joy.
The customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update the predicted labels under 'p_anger', 'p_fear', 'p_surprise', 'p_sadness', and 'p_joy' in the Json code as values of 0 or 1.
Don't make any changes to the Json code format, please.
```
{json_data}
```
"""
# Show the rendered prompt for inspection.
prompt

In [None]:
# Send the prompt to the configured model (network call).
response = model.generate_content(prompt)

# Print the raw reply text for inspection.
print(response.text)

In [None]:
import json
import re

# str.strip(chars) removes any of the listed characters from both ends; it
# does not remove a prefix/suffix. A reply such as
# "Here is the JSON:\n```json\n[...]\n```" therefore kept its preamble and
# failed to parse. Extract the fenced payload explicitly instead, falling
# back to the whole reply when no fence is present.
raw_text = response.text.strip()
fence_match = re.search(r"```[a-zA-Z]*\s*(.*?)\s*```", raw_text, flags=re.DOTALL)
json_data = fence_match.group(1) if fence_match else raw_text
llm_data = json.loads(json_data)

In [None]:
# Inspect the parsed model output (the result of json.loads on the reply text).
llm_data

In [None]:
# Tabulate the parsed model output; presumably one row per returned record — verify the row count against `data`.
sub_df = pd.DataFrame(llm_data)
sub_df

In [None]:
# Copy the model's predictions into the matching p_* columns of `data`.
pred_emotion = ['p_anger', 'p_fear', 'p_joy', 'p_sadness', 'p_surprise']
for emotion in pred_emotion:
  # .values assigns by position, so sub_df must have exactly as many rows as
  # `data`, in the same order.
  data[emotion] = sub_df[emotion].values
data.head()

In [None]:
from sklearn.metrics import classification_report

# Ground-truth columns of `data` ('Anger', 'Fear', ...) paired by position
# with the prediction columns written from the model output.
true_labels = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
pred_emotion = ['p_anger', 'p_fear', 'p_joy', 'p_sadness', 'p_surprise']

# One text report per emotion, keyed by the ground-truth column name.
classification_reports = {}

for true_label, pred_label in zip(true_labels, pred_emotion):
    # Predictions are cast to int before being compared with the true column.
    classification_reports[true_label] = classification_report(
        data[true_label],
        data[pred_label].astype(int),
    )

# Print every report under its own heading.
for emotion, report in classification_reports.items():
    print(f"Classification Report for {emotion}:")
    print(report)
    print("\n")

In [None]:
def json_batch(json_data, batch_size):
  """Split ``json_data`` into consecutive slices of at most ``batch_size`` items.

  Works on any sliceable sequence; the final slice may be shorter. An empty
  input yields an empty list.
  """
  starts = range(0, len(json_data), batch_size)
  return [json_data[start:start + batch_size] for start in starts]

In [None]:
import json
# Full-dataset run: `data` is rebound from the 100-row sample to the whole CSV.
data = pd.read_csv('eng.csv')
clean_df(data)
add_pred_colunm(data)
json_data = df_to_json(data)
# Parse the JSON string back into a list of records so it can be sliced.
item_dict = json.loads(json_data)
# 5 records per request.
json_batch_list = json_batch(item_dict, 5)
# Number of batches (i.e. number of requests the loop below will make).
len(json_batch_list)


In [None]:
# Prompt for the first batch only (a dry run of the per-batch prompt).
# The batch is serialized with json.dumps: interpolating the list directly
# would insert its Python repr (single-quoted keys and strings), which is not
# the JSON the instructions describe and invites non-JSON replies.
prompt = f"""
  You are an expert linguist, who is good at classifying customer review sentiments into specific emotions.
  Help me classify customer reviews into the following emotions: Anger, Fear, Surprise, Sadness, and Joy.
  The customer reviews are provided between three back ticks.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update the predicted labels under 'p_anger', 'p_fear', 'p_surprise', 'p_sadness', and 'p_joy' in the Json code as values of 0 or 1.
  Don't make any changes to the Json code format, please.
  ```
  {json.dumps(json_batch_list[0], ensure_ascii=False)}
  ```
  """
# Show the rendered prompt for inspection.
prompt

In [None]:
# Try the batch prompt on the first batch before looping over all of them (network call).
response = model.generate_content(prompt)
response.text

In [None]:
import time
import json
import re

# Raw payload text per successful batch, in request order.
predictions = []
# Indices (into json_batch_list) of batches whose request raised. Their rows
# have no entry in `predictions`, so later steps must not assume that
# predictions line up one-to-one with the input rows.
failed_batches = []

for i, batch in enumerate(json_batch_list):
    # json.dumps keeps the embedded payload valid JSON; interpolating the list
    # directly would insert its Python repr (single quotes).
    prompt = f"""
    You are an expert linguist, who is good at classifying customer review sentiments into specific emotions.
    Help me classify customer reviews into the following emotions: Anger, Fear, Surprise, Sadness, and Joy.
    The customer reviews are provided between three back ticks.
    In your output, only return the Json code back as output - which is provided between three backticks.
    Your task is to update the predicted labels under 'p_anger', 'p_fear', 'p_surprise', 'p_sadness', and 'p_joy' in the Json code as values of 0 or 1.
    Don't make any changes to the Json code format, please.
    ```
    {json.dumps(batch, ensure_ascii=False)}
    ```
    """
    try:
        response = model.generate_content(prompt)
        # str.strip(chars) strips a character set, not a prefix/suffix, so it
        # cannot reliably remove the code fence. Extract the fenced payload,
        # falling back to the whole reply when there is no fence.
        raw_text = response.text.strip()
        fence_match = re.search(r"```[a-zA-Z]*\s*(.*?)\s*```", raw_text, flags=re.DOTALL)
        json_data = fence_match.group(1) if fence_match else raw_text
        predictions.append(json_data)
    except Exception as e:
        # Best effort: keep going, but remember which batch is missing.
        print(f"Error for batch {i}: {e}")
        failed_batches.append(i)
    time.sleep(4)  # Adjust delay based on API rate limits

if failed_batches:
    print(f"{len(failed_batches)} batch(es) failed: {failed_batches}")


In [None]:
# Inspect the raw per-batch payload strings collected by the loop above.
predictions

In [None]:
import json
import re

def _decode_prediction_text(text):
    """Decode ``text`` as JSON or, failing that, as a Python literal.

    Returns a list of records, or ``None`` when the text is neither valid
    JSON nor a valid Python literal holding a list/tuple/dict.
    """
    import ast  # local import: only needed for the Python-literal fallback

    try:
        value = json.loads(text)
    except json.JSONDecodeError:
        try:
            # Handles single-quoted, Python-repr style output without
            # touching apostrophes inside the text values.
            value = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    if isinstance(value, dict):
        return [value]
    if isinstance(value, (list, tuple)):
        return list(value)
    return None


def clean_predictions(predictions):
    """Parse raw per-batch model output into a flat list of prediction records.

    Each batch string is first decoded as-is (JSON, then Python literal).
    Only when that fails are the regex repairs applied: quote
    standardization, whitespace collapsing, key normalization, trailing-comma
    removal and bracket wrapping.

    Args:
        predictions: Iterable of raw payload strings, one per batch.

    Returns:
        ``(cleaned_predictions, error_batches)``: the decoded records in batch
        order, and the repaired-but-still-invalid batch strings.
    """
    cleaned_predictions = []
    error_batches = []

    for batch in predictions:
        # Trim first: a leading/trailing space used to defeat the bracket
        # check below, so a valid "[...]" payload got wrapped a second time
        # and a nested list ended up among the records.
        batch_fixed = batch.strip()

        records = _decode_prediction_text(batch_fixed)
        if records is not None:
            cleaned_predictions.extend(records)
            continue

        # Step 1: Standardize quotes (replace single quotes with double quotes).
        # Only reached for payloads that did not decode as-is, because this
        # also rewrites apostrophes inside text values.
        batch_fixed = batch_fixed.replace("'", '"')

        # Step 2: Remove unwanted line breaks and spaces
        batch_fixed = re.sub(r'\s+', ' ', batch_fixed)

        # Step 3: Normalize key spacing and drop trailing commas
        batch_fixed = re.sub(r'(?<![:,\[\{])\s*"\s*(.*?)\s*"\s*:\s*', r'"\1":', batch_fixed)
        batch_fixed = re.sub(r',\s*}', '}', batch_fixed)  # Remove trailing commas before closing braces
        batch_fixed = re.sub(r',\s*]', ']', batch_fixed)  # Remove trailing commas before closing brackets

        # Step 4: Wrap in brackets when the payload has no list brackets at all
        if not batch_fixed.startswith('[') and not batch_fixed.endswith(']'):
            batch_fixed = '[' + batch_fixed + ']'

        # Attempt JSON parsing
        try:
            cleaned_batch = json.loads(batch_fixed)
        except json.JSONDecodeError as e:
            print(f"JSON decoding error: {e} on batch: {batch_fixed[:200]}...")
            error_batches.append(batch_fixed)
            continue

        # extend() on a dict would add its keys, not the record itself.
        if isinstance(cleaned_batch, list):
            cleaned_predictions.extend(cleaned_batch)
        else:
            cleaned_predictions.append(cleaned_batch)

    return cleaned_predictions, error_batches

# Parse every collected batch; `errors` holds the batch strings that still
# failed to decode after the regex repairs.
fixed_predictions, errors = clean_predictions(predictions)

# Report how many records were recovered and how many batches remain unparsed.
print(f"Number of valid predictions: {len(fixed_predictions)}")
print(f"Number of batches with errors: {len(errors)}")


In [None]:
def fix_error_batches(error_batches):
    """Second-pass repair for batch strings that failed JSON decoding.

    Applies three regex repairs (stray commas before a closer, doubled
    commas, unquoted keys) and retries ``json.loads``.

    Args:
        error_batches: Iterable of batch strings that previously failed to parse.

    Returns:
        ``(corrected_batches, remaining_errors)``: records decoded from the
        repaired batches, and the repaired strings that still do not parse.
    """
    corrected_batches = []
    remaining_errors = []

    for batch in error_batches:
        # Try additional regex cleanup for common errors
        batch = re.sub(r',\s*([\]}])', r'\1', batch)  # Remove commas before } or ]
        batch = re.sub(r'([{\[,])\s*,', r'\1', batch)  # Remove extra commas after {, [ or ,
        # Quote bare keys: an identifier that directly follows '{' or ',' and
        # precedes ':'. The previous pattern wrapped only the LAST character
        # before the colon (name: -> nam"e":), which produced more invalid JSON.
        batch = re.sub(r'([{,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:', r'\1"\2":', batch)

        # Attempt parsing again
        try:
            parsed = json.loads(batch)
        except json.JSONDecodeError as e:
            print(f"Still failing: {e} on batch: {batch[:200]}...")
            remaining_errors.append(batch)
            continue

        # extend() on a dict would add its keys rather than the record.
        if isinstance(parsed, list):
            corrected_batches.extend(parsed)
        else:
            corrected_batches.append(parsed)

    return corrected_batches, remaining_errors

# Second repair pass over the batches that clean_predictions could not parse.
corrected_predictions, remaining_issues = fix_error_batches(errors)

# Rescued records are appended AFTER the first-pass records, so
# final_predictions is not guaranteed to be in the original row order.
final_predictions = fixed_predictions + corrected_predictions

print(f"Final number of valid predictions: {len(final_predictions)}")
print(f"Remaining problematic batches: {len(remaining_issues)}")


In [None]:
# Build the frame from final_predictions, not fixed_predictions, so records
# rescued by fix_error_batches are not silently discarded.
# NOTE: rescued records sit at the end of the list, so the row order may
# differ from the input whenever any batch needed the second repair pass.
pred_df = pd.DataFrame(final_predictions)
pred_df

In [None]:
# Import pandas
import pandas as pd

pred_emotion = ['p_anger', 'p_fear', 'p_joy', 'p_sadness', 'p_surprise']

# Positional alignment is only correct when there is exactly one prediction
# per input row, in the same order. If a batch failed or was reordered,
# padding at the end (what reindex(data.index) does) shifts predictions onto
# the wrong rows, so in that case match rows on their 'clean_text' instead.
has_text_key = 'clean_text' in pred_df.columns
same_length = len(pred_df) == len(data)
same_order = (
    same_length
    and has_text_key
    and bool((pred_df['clean_text'].to_numpy() == data['clean_text'].to_numpy()).all())
)

if same_order or not has_text_key:
    # Either verified row-for-row, or there is no key to match on (original
    # behavior: pad/truncate by position).
    pred_aligned = pred_df.reindex(data.index)
else:
    print(
        f"Warning: {len(pred_df)} predictions for {len(data)} rows "
        "(or rows out of order); aligning on 'clean_text'."
    )
    # First prediction wins for repeated texts; rows without a prediction get NaN.
    lookup = pred_df.drop_duplicates(subset='clean_text').set_index('clean_text')
    pred_aligned = lookup.reindex(data['clean_text'])

for emotion in pred_emotion:
  # pred_aligned has one row per row of `data`, in data's order.
  data[emotion] = pred_aligned[emotion].values

data.head()

In [None]:
from sklearn.metrics import classification_report

# Ground-truth columns of `data` ('Anger', 'Fear', ...) paired by position
# with the prediction columns written from the model output.
true_labels = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
pred_emotion = ['p_anger', 'p_fear', 'p_joy', 'p_sadness', 'p_surprise']

# Create an empty dictionary to store classification reports for each emotion
classification_reports = {}

# Iterate through each emotion and calculate the classification report
for true_label, pred_label in zip(true_labels, pred_emotion):
    true_values = data[true_label]  # True labels for the emotion

    # Coerce to numeric first: a blank string or other non-numeric value left
    # by the model becomes NaN instead of making astype(int) raise.
    pred_values = pd.to_numeric(data[pred_label], errors='coerce')

    # Missing predictions are scored as 0 (best effort), but say so: they
    # deflate recall for the positive class.
    missing = int(pred_values.isna().sum())
    if missing:
        print(f"{true_label}: {missing} row(s) without a usable prediction are scored as 0")
    pred_values = pred_values.fillna(0).astype(int)

    # Generate the classification report
    report = classification_report(true_values, pred_values)

    # Store the report in the dictionary
    classification_reports[true_label] = report

# Print the classification reports
for emotion, report in classification_reports.items():
    print(f"Classification Report for {emotion}:")
    print(report)
    print("\n")