<a href="https://colab.research.google.com/github/ayadassouki/ayadassouki-ACL-Task-A-Sentiment-Analysis/blob/main/gemini_fineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing dependencies
import numpy as np
import pandas as pd
import google.generativeai as genai
import json

In [None]:
# API key setup: read the key from Colab's user secrets instead of hard-coding it in the notebook
from google.colab import userdata
api_key = userdata.get('new_key_semeval')
genai.configure(api_key= api_key)
# Smoke test: send a throwaway prompt to the base model and print the reply to confirm the key works
model = genai.GenerativeModel('gemini-1.5-flash')
prompt = 'this is a test prompt, generate anything'
response = model.generate_content(prompt)
print(response.text)

**Reading the data and defining helper functions that convert it to JSON and to the LLM-specific format**

In [None]:
# Load the dataset; later cells read its 'id', 'text' and five emotion columns
dataframe = pd.read_csv('eng.csv')

In [None]:
# Preview the first rows of the freshly loaded frame
dataframe.head()

In [None]:
# Drop the identifier column; the formatting helpers below only use 'text' and the emotion columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# the in-place version raises KeyError the second time because 'id' is already gone.
dataframe = dataframe.drop(columns=['id'], errors='ignore')

In [None]:
# Preview again to confirm the 'id' column is gone
dataframe.head()

In [None]:
# split the data into training, validation and testing data
# 60% goes to training; the remaining 40% is halved into 20% validation and 20% test.
# random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split
train_df, temp_df = train_test_split(dataframe, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
# Preview the training split
train_df.head()

In [None]:
# Row counts, displayed in the order (train, test, val)
len(train_df), len(test_df), len(val_df)

In [None]:
# Convert a DataFrame into a list of plain-Python row dictionaries
def df_to_json(df):
  """Return the rows of ``df`` as a list of dicts, one dict per row.

  The frame is serialised with ``orient='records'`` and parsed back with
  ``json.loads``, so every value in the result is a JSON-native Python type.
  """
  records_text = df.to_json(orient='records')
  return json.loads(records_text)

In [None]:
def input_output(json_data):
    """Turn row dictionaries into the ``text_input``/``output`` pairs used for tuning.

    Parameters:
    - json_data: iterable of dicts, each with a 'text' key and emotion keys.

    Returns:
    - List of ``{"text_input": <text>, "output": "<Emotion>: <value>, ..."}``
      dicts. Only the Anger, Fear, Joy, Sadness and Surprise keys are written
      to ``output``, in the order they appear in each row.

    Raises:
    - KeyError if a row has no 'text' key.
    """
    emotion_names = ("Anger", "Fear", "Joy", "Sadness", "Surprise")

    def flatten_emotions(record):
        # Render the row's emotion labels as one comma-separated string
        return ", ".join(
            f"{name}: {score}" for name, score in record.items() if name in emotion_names
        )

    return [
        {"text_input": record['text'], "output": flatten_emotions(record)}
        for record in json_data
    ]

In [None]:
# Convert the training split to row dicts, then to the text_input/output pairs passed to the tuning call
train_json = df_to_json(train_df)
train_inout = input_output(train_json)

In [None]:
# Number of training examples
len(train_inout)

In [None]:
# Inspect one formatted training example
train_inout[0]

**Fine Tuning Gemini**

In [None]:
# fine tuning gemini
import time

# Base model identifier passed to create_tuned_model as source_model
base_model = "models/gemini-1.5-flash-001-tuning"

# Start fine-tuning on the text_input/output pairs built from the training split
operation = genai.create_tuned_model(
    display_name="sentiment_analysis_finetune",
    source_model=base_model,
    epoch_count=10,  # Adjust epochs as needed
    batch_size=4,    # Adjust batch size as needed
    learning_rate=0.001,
    training_data=train_inout
)

# Wait for the fine-tuning process to complete, pausing 10 seconds between iterations
# (presumably wait_bar() also renders a progress bar - confirm against the SDK docs)
for status in operation.wait_bar():
    time.sleep(10)

# Retrieve the tuned model details
# NOTE(review): the prediction cells hard-code a "tunedModels/..." name; presumably it has to be
# updated by hand to the name printed here after every new tuning run - confirm
result = operation.result()
print(f"Fine-tuned model created: {result.name}")

In [None]:
def input_val_test(json_data):
  """Build input-only entries for validation/test prediction.

  Parameters:
  - json_data: iterable of dicts, each with a 'text' key.

  Returns:
  - List of ``{"text_input": <text>}`` dicts, one per row, in input order.
    No labels are included.

  Raises:
  - KeyError if a row has no 'text' key.
  """
  return [{"text_input": entry['text']} for entry in json_data]

In [None]:
# Build the input-only entries for the validation split (one entry per validation row)
val_json = df_to_json(val_df)
val_inout = input_val_test(val_json)

In [None]:
# Inspect one validation entry
val_inout[0]

**Getting Predictions**

In [None]:
import time

# Initialize predictions list
predictions = []
# 1-based positions of entries whose request failed. `predictions` gets no item for
# them, so this list is what is needed later to re-align predictions with val_inout.
failed_val_entries = []

# Define the fine-tuned model name
fine_tuned_model = "tunedModels/sentimentanalysisfinetune-bxndg9w0jfgu"
model = genai.GenerativeModel(fine_tuned_model)

# Total number of entries
total_entries = len(val_inout)
# Iterate through validation data
for i, entry in enumerate(val_inout):
    # Construct prompt; {entry} is rendered as the repr of the {'text_input': ...} dict
    prompt = f"""
    You are a fine-tuned model trained to predict emotions in a given text. Always provide the output strictly in the specified dictionary format.

    Analyze the following text:
    {entry}

    Your response must be exactly in this format:
    {{
      'text_input': 'text you did analysis on',
      'output': 'Anger: , Fear: , Joy: , Sadness: , Surprise: '
    }}


    Only return the output in the specified format. Do not add or remove anything.
    All emotion tags must be present in the output.

    Now process the input text and provide your response.


    """

    try:
        # Generate content using the fine-tuned model
        response = model.generate_content(prompt)
        predictions.append(response.text)  # Store response
    except Exception as e:
        # Best effort: keep going, but remember which entry has no prediction
        failed_val_entries.append(i + 1)
        print(f"Error for entry {i + 1}/{total_entries}: {e}")  # Log error details

    # Display progress
    progress = ((i + 1) / total_entries) * 100
    print(f"Progress: {i + 1}/{total_entries} ({progress:.2f}%)")

    # Adjust delay based on API rate limits
    time.sleep(10)  # Set this to match the API's recommended rate limit

# Summarise the failures so they do not have to be collected from the log by hand
print(f"Failed entries ({len(failed_val_entries)}): {failed_val_entries}")


In [None]:
# Inspect the raw model responses
predictions

In [None]:
# Number of responses collected; smaller than len(val_inout) when any request failed,
# because the prediction loop appends nothing for a failed entry
len(predictions)

**Cleaning LLM predictions into a DataFrame-compatible format**

In [None]:
import re
import pandas as pd

# Preprocessing function
def preprocess_llm_responses(responses):
    """Parse raw LLM response strings into one emotion dictionary per response.

    Parameters:
    - responses: iterable of response strings.

    Returns:
    - List of dicts with the keys Anger, Fear, Joy, Sadness and Surprise (in
      that order). Each value is the single digit that follows the first
      "<Emotion>:" occurrence in the response, or 0 when the emotion is absent.
    """
    emotion_names = ("Anger", "Fear", "Joy", "Sadness", "Surprise")
    # Only one digit is captured after each emotion name (same pattern as before).
    pair_pattern = re.compile(r"(Anger|Fear|Joy|Sadness|Surprise):\s*(\d)")

    cleaned_responses = []
    for response in responses:
        # Start from the defaults so every key is present, in a fixed order
        all_emotions = dict.fromkeys(emotion_names, 0)

        # Keep the FIRST value seen for each emotion. The regex is run on the raw
        # response: de-duplicating comma-separated chunks by "text before the
        # first colon" misfiles a chunk such as "'output': 'Anger: 0" under
        # 'output', which let a later duplicate "Anger: 1" override it.
        keys_seen = set()
        for key, value in pair_pattern.findall(response):
            if key not in keys_seen:
                keys_seen.add(key)
                all_emotions[key] = int(value)

        cleaned_responses.append(all_emotions)

    return cleaned_responses

In [None]:
# Parse the raw validation responses into one emotion dict per response, then display them
cleaned_response = preprocess_llm_responses(predictions)
cleaned_response

In [None]:
# One row per parsed response, one column per emotion
validation_pred_df = pd.DataFrame(cleaned_response)
validation_pred_df.head()

**Adjusting the data for some missing values (18 in this case)**

In [None]:
def drop_missing_indices_direct(data, missing_indices):
    """
    Removes entries from a list based on indices, adjusting for 1-based to 0-based indexing.

    Parameters:
    - data (list): Original list of dictionaries or items.
    - missing_indices (list): 1-based indices of entries to remove.

    Returns:
    - New list with the specified indices removed; `data` itself is not modified.
      Indices that fall outside the list are ignored.
    """
    # Convert 1-based indices to 0-based; a set gives constant-time membership tests
    zero_based_indices = {idx - 1 for idx in missing_indices}
    return [item for i, item in enumerate(data) if i not in zero_based_indices]

# 1-based positions of the validation entries to remove.
# NOTE(review): presumably copied by hand from the "Error for entry N/..." lines printed by the
# validation prediction loop (which also numbers entries from 1) - re-check after any re-run.
missing_indices = [481, 527, 533, 536, 537, 538, 542, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554]

# Drop missing entries
filtered_val_inout = drop_missing_indices_direct(val_inout, missing_indices)

# Display results
print(f"Total entries before: {len(val_inout)}")
print(f"Total entries after dropping: {len(filtered_val_inout)}")

In [None]:
# Spot-check: 0-based position 480 was the first entry removed, so this now shows the entry that followed it
filtered_val_inout[480]

In [None]:
# Attach the surviving validation inputs to the predictions; the two must have the same length and order
v_df = validation_pred_df.assign(text_input=filtered_val_inout)

In [None]:
# Replace each {'text_input': ...} dict with its text string.
# NOTE: not re-runnable on its own - a second run indexes plain strings with 'text_input' and fails.
v_df['text_input'] = v_df['text_input'].apply(lambda x: x['text_input'])

In [None]:
# Preview predictions alongside their input text
v_df.head()

In [None]:
# Check the column names before renaming
v_df.columns

In [None]:
# Prefix the predicted columns with 'Pred_' so the plain emotion names are free for the true labels added later
v_df.rename(columns={'Anger': 'Pred_Anger', 'Fear': 'Pred_Fear', 'Joy':'Pred_Joy', 'Sadness':'Pred_Sadness', 'Surprise':'Pred_Surprise'}, inplace=True)

In [None]:
# Preview after the rename
v_df.head()

In [None]:
# 0-based row positions in val_df of the entries that have no prediction
# (the same entries as the 1-based missing_indices list, shifted by one).
missing_org_indices = [480, 526, 532, 535, 536, 537, 541, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553]

# val_inout holds one entry per row of the full validation frame, so equal lengths
# mean nothing has been dropped yet. Without this guard a second run of the cell
# would drop 18 different rows, because the positions refer to the full frame.
if len(val_df) == len(val_inout):
    # Translate positions into index labels, since drop() works on labels
    indices_to_drop = val_df.index[missing_org_indices]

    # Reassign instead of inplace=True so the drop does not mutate the split result in place
    val_df = val_df.drop(index=indices_to_drop)

In [None]:
# Confirm the row count after dropping the entries without predictions
val_df.info()

In [None]:
# Preview the prediction frame again before adding the true labels
v_df.head()

In [None]:
# Re-check val_df's row count and dtypes before copying its labels
val_df.info()

In [None]:
# Columns to add
columns_to_add = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

# Copy the true labels from val_df into v_df by position (.values discards the index),
# so val_df must already have the failed rows dropped and be in the same order as v_df
for col in columns_to_add:
    v_df[col] = val_df[col].values

# Print the combined frame (predictions, input text, true labels)
print(v_df)

In [None]:
# Preview the combined frame
v_df.head()

In [None]:
# Cast the true-label columns to int. astype returns a new Series, so the result
# has to be assigned back; calling it without assignment leaves v_df unchanged.
for col in columns_to_add:
  v_df[col] = v_df[col].astype(int)

In [None]:
# Check dtypes and non-null counts of the combined validation frame
v_df.info()

**Classification report for Validation Data**

In [None]:
from sklearn.metrics import classification_report

# True labels live in v_df under the plain emotion names; predictions under the 'Pred_' names.
# The two lists are paired by position.
true_labels = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
pred_emotion = ['Pred_Anger', 'Pred_Fear', 'Pred_Joy', 'Pred_Sadness', 'Pred_Surprise']

# Create an empty dictionary to store classification reports for each emotion
classification_reports = {}

# Iterate through each emotion and calculate the classification report
for true_label, pred_label in zip(true_labels, pred_emotion):
    true_values = v_df[true_label]  # True labels for the emotion
    pred_values = v_df[pred_label]  # Predicted labels for the emotion


    # Generate the classification report
    report = classification_report(true_values, pred_values)

    # Store the report in the dictionary
    classification_reports[true_label] = report

# Print the classification reports
for emotion, report in classification_reports.items():
    print(f"Classification Report for {emotion}:")
    print(report)
    print("\n")

**Making predictions for Testing Data**

In [None]:
# Preview the held-out test split
test_df.head()

In [None]:
# Build the input-only entries for the held-out test split
test_inout = input_val_test(df_to_json(test_df))

In [None]:
# Inspect one test entry
test_inout[0]

In [None]:
import time

# Initialize predictions list
test_predictions = []
# 1-based positions of entries whose request failed. `test_predictions` gets no item
# for them, so a non-empty list means it no longer lines up with test_inout.
failed_test_entries = []

# Define the fine-tuned model name
fine_tuned_model = "tunedModels/sentimentanalysisfinetune-bxndg9w0jfgu"
model = genai.GenerativeModel(fine_tuned_model)

# Total number of entries
total_entries = len(test_inout)
# Iterate through the held-out test split
for i, entry in enumerate(test_inout):
    # Construct prompt; {entry} is rendered as the repr of the {'text_input': ...} dict
    test_prompt = f"""
    You are a fine-tuned model trained to predict emotions in a given text. Always provide the output strictly in the specified dictionary format.

    Analyze the following text:
    {entry}

    Your response must be exactly in this format:
    {{
      'text_input': 'text you did analysis on',
      'output': 'Anger: , Fear: , Joy: , Sadness: , Surprise: '
    }}


    Only return the output in the specified format. Do not add or remove anything.
    All emotion tags must be present in the output.

    Now process the input text and provide your response.


    """

    try:
        # Generate content using the fine-tuned model
        test_response = model.generate_content(test_prompt)
        test_predictions.append(test_response.text)  # Store response
    except Exception as e:
        # Best effort: keep going, but remember which entry has no prediction
        failed_test_entries.append(i + 1)
        print(f"Error for entry {i + 1}/{total_entries}: {e}")  # Log error details

    # Display progress
    progress = ((i + 1) / total_entries) * 100
    print(f"Progress: {i + 1}/{total_entries} ({progress:.2f}%)")

    # Adjust delay based on API rate limits
    time.sleep(10)  # Set this to match the API's recommended rate limit

# Summarise the failures so they do not have to be collected from the log by hand
print(f"Failed entries ({len(failed_test_entries)}): {failed_test_entries}")


In [None]:
 test_predictions

In [None]:
# Number of responses collected; must equal len(test_inout) for the later positional joins to work
len(test_predictions)

In [None]:
# Parse the raw test responses into one emotion dict per response
clean_testing = preprocess_llm_responses(test_predictions)

In [None]:
# Inspect the parsed test predictions
clean_testing

In [None]:
# One parsed dict per raw response
len(clean_testing)

In [None]:
# One row per parsed response, one column per emotion
testing_df = pd.DataFrame(clean_testing)

In [None]:
# Preview the parsed test predictions
testing_df.head()

In [None]:
# Attach the test inputs to the predictions.
# NOTE: unlike the validation flow, no failed entries are removed here, so this only
# lines up (and only has matching lengths) when every test request succeeded.
t_df = testing_df.assign(text_input=test_inout)

In [None]:
# Replace each {'text_input': ...} dict with its text string.
# NOTE: not re-runnable on its own - a second run indexes plain strings with 'text_input' and fails.
t_df['text_input'] = t_df['text_input'].apply(lambda x: x['text_input'])

In [None]:
# Display the test predictions alongside their input text
t_df

In [None]:
# Prefix the predicted columns with 'Pred_' so the plain emotion names are free for the true labels added later
t_df.rename(columns={'Anger': 'Pred_Anger', 'Fear': 'Pred_Fear', 'Joy':'Pred_Joy', 'Sadness':'Pred_Sadness', 'Surprise':'Pred_Surprise'}, inplace=True)

In [None]:
# Preview after the rename
t_df.head()

In [None]:
# Columns to add
columns_to_add = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

# Copy the true labels from test_df into t_df by position (.values discards the index),
# so t_df must have one row per test_df row, in the same order
for col in columns_to_add:
    t_df[col] = test_df[col].values

# Print the combined frame (predictions, input text, true labels)
print(t_df)

In [None]:
# Preview the combined test frame
t_df.head()

In [None]:
# Cast the true-label columns to int. astype returns a new Series, so the result
# has to be assigned back; calling it without assignment leaves t_df unchanged.
for col in columns_to_add:
  t_df[col] = t_df[col].astype(int)

In [None]:
# Check dtypes and non-null counts of the combined test frame
t_df.info()

**Classification Report for Testing Data**

In [None]:
from sklearn.metrics import classification_report

# True labels live in t_df under the plain emotion names; predictions under the 'Pred_' names.
# The two lists are paired by position.
true_labels = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
pred_emotion = ['Pred_Anger', 'Pred_Fear', 'Pred_Joy', 'Pred_Sadness', 'Pred_Surprise']

# Create an empty dictionary to store classification reports for each emotion
classification_reports = {}

# Iterate through each emotion and calculate the classification report
for true_label, pred_label in zip(true_labels, pred_emotion):
    true_values = t_df[true_label]  # True labels for the emotion
    pred_values = t_df[pred_label]  # Predicted labels for the emotion


    # Generate the classification report
    report = classification_report(true_values, pred_values)

    # Store the report in the dictionary
    classification_reports[true_label] = report

# Print the classification reports
for emotion, report in classification_reports.items():
    print(f"Classification Report for {emotion}:")
    print(report)
    print("\n")

Finished with Fine Tuning:🙂

**TESTING**

In [None]:
# Load the separate test file (distinct from the test_df split taken from eng.csv)
test_dataframe = pd.read_csv('eng_test.csv')

In [None]:
# Preview the first rows of the test file
test_dataframe.head()

In [None]:
# Drop the identifier column; only 'text' is read when the prediction inputs are built.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# the in-place version raises KeyError the second time because 'id' is already gone.
test_dataframe = test_dataframe.drop(columns=['id'], errors='ignore')

In [None]:
# Preview again to confirm the 'id' column is gone
test_dataframe.head()

In [None]:
# Check row count, dtypes and non-null counts of the test file
test_dataframe.info()

In [None]:
# Build the input-only entries for the test file
testing_inout = input_val_test(df_to_json(test_dataframe))

In [None]:
# Inspect one entry (the second) of the test-file inputs
testing_inout[1]

In [None]:
import time

# Initialize predictions list
# NOTE: this reuses the name test_predictions, replacing the held-out-split predictions from earlier
test_predictions = []
# 1-based positions of entries whose request failed. `test_predictions` gets no item
# for them, so a non-empty list means it no longer lines up with testing_inout.
failed_testing_entries = []

# Define the fine-tuned model name
# NOTE(review): this is a different tuned model from the one used in the validation and
# held-out test cells above - confirm that is intended.
fine_tuned_model = "tunedModels/sentimentanalysisfinetune-nb68ky4hhjlr"
model = genai.GenerativeModel(fine_tuned_model)

# Total number of entries
total_entries = len(testing_inout)
# Iterate through the test-file entries
for i, entry in enumerate(testing_inout):
    # Construct prompt; {entry} is rendered as the repr of the {'text_input': ...} dict
    test_prompt = f"""
    You are a fine-tuned model trained to predict emotions in a given text. Always provide the output strictly in the specified dictionary format.

    Analyze the following text:
    {entry}

    Your response must be exactly in this format:
    {{
      'text_input': 'text you did analysis on',
      'output': 'Anger: , Fear: , Joy: , Sadness: , Surprise: '
    }}


    Only return the output in the specified format. Do not add or remove anything.
    All emotion tags must be present in the output.

    Now process the input text and provide your response.


    """

    try:
        # Generate content using the fine-tuned model
        test_response = model.generate_content(test_prompt)
        test_predictions.append(test_response.text)  # Store response
    except Exception as e:
        # Best effort: keep going, but remember which entry has no prediction
        failed_testing_entries.append(i + 1)
        print(f"Error for entry {i + 1}/{total_entries}: {e}")  # Log error details

    # Display progress
    progress = ((i + 1) / total_entries) * 100
    print(f"Progress: {i + 1}/{total_entries} ({progress:.2f}%)")

    # Adjust delay based on API rate limits
    time.sleep(7)  # Set this to match the API's recommended rate limit

# Summarise the failures so they do not have to be collected from the log by hand
print(f"Failed entries ({len(failed_testing_entries)}): {failed_testing_entries}")


In [None]:
for model in genai.list_tuned_models():
    print(model)
