# Setting up the API

In [None]:
!pip install replicate

In [None]:
import replicate

In [None]:
# Setting the replicate API token

import os
from getpass import getpass

# SECURITY: the token must never be hard-coded in the notebook. Any token that
# was ever saved or shared inside a notebook has to be treated as leaked and
# revoked at https://replicate.com/account/api-tokens
# If REPLICATE_API_TOKEN is already exported it is reused as is; otherwise it
# is requested interactively, so it is not stored in the notebook source.
if not os.environ.get("REPLICATE_API_TOKEN"):
  os.environ["REPLICATE_API_TOKEN"] = getpass("Replicate API token: ")

# Reading the data

In [None]:
# Connecting to drive
from google.colab import drive
drive.mount('/content/drive')

This is a path in my own Drive. You will probably need to create an equivalent folder in your Drive (containing the data files) and change the path below accordingly.

In [None]:
cd 'drive/MyDrive/Unipd/CBSD/Data/'

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reading the Excel files into a pandas DataFrame
opinions_en = pd.read_excel("DecOp_data_EN_500.xlsx")
opinions_it = pd.read_excel("DecOp_data_IT_500.xlsx")

# Preprocessing

In [None]:
# Checking the datasets' structure
opinions_en.head()

In [None]:
opinions_en.info()

## Splitting into training and test sets

In [None]:
# No train/test split was found in the data, so it is rebuilt here following
# the same criteria that were used in the papers
random_seed = 111

# English dataset: 100 authors are held out for the test set and every
# remaining row goes to the training set
opinions_en_test = opinions_en.sample(n=100, random_state=random_seed)
opinions_en_train = opinions_en.drop(index=opinions_en_test.index)

# Italian dataset: same procedure, same seed
opinions_it_test = opinions_it.sample(n=100, random_state=random_seed)
opinions_it_train = opinions_it.drop(index=opinions_it_test.index)

In the paper, they perform a "cross-topic" approach, where "Each Turker was asked to classify 10 opinions randomly extracted from the DecOp test-set as truthful or deceptive in their respective language in a binary response modality. The overall classification performance was computed separately for the US participants and the Italian ones."

In [None]:
# Reshaping the datasets into a "cross-topic" (long) dataframe in which each
# row is a single opinion, so the examples can be processed sequentially

def long_dataset(df):
  """Return a long-format version of `df` with one row per (ID, topic).

  `df` must contain the columns 'ID', 'gender', 'age' plus, for each of the
  five topics, the opinion column and its ground-truth column.

  The result has the columns 'ID', 'gender', 'age', 'opinion', 'GT' and
  'topic', is sorted by 'ID' and then 'topic', and has a fresh 0..n-1 index.
  """
  shared_cols = ['ID', 'gender', 'age']
  # Opinion column of each topic -> column holding its ground truth
  gt_column_of = {'A': 'GT.A', 'CL': 'GT.CL', 'E': 'GT.E',
                  'GM': 'GT.GM', 'Pom': 'GT.Pom'}

  per_topic = [
      df[shared_cols + [topic, gt_col]]
      .rename(columns={topic: 'opinion', gt_col: 'GT'})
      .assign(topic=topic)
      for topic, gt_col in gt_column_of.items()
  ]

  stacked = pd.concat(per_topic, ignore_index=True)
  return stacked.sort_values(by=['ID', 'topic'], ignore_index=True)

opinions_en_test_long = long_dataset(opinions_en_test)
opinions_en_train_long = long_dataset(opinions_en_train)

opinions_it_test_long = long_dataset(opinions_it_test)
opinions_it_train_long = long_dataset(opinions_it_train)

In [None]:
# Checking the new structure
opinions_en_test_long.head(15)

In [None]:
# Checking the ground truth balance of the splits
print(f"English train ground truth balance: \n{opinions_en_train_long['GT'].value_counts()}\n")
print(f"English test ground truth balance: \n{opinions_en_test_long['GT'].value_counts()}\n")
print(f"Italian train ground truth balance: \n{opinions_it_train_long['GT'].value_counts()}\n")
print(f"Italian test ground truth balance: \n{opinions_it_test_long['GT'].value_counts()}\n")

In [None]:
# Checking the topics balance of the splits
print(f"English train topic balance: \n{opinions_en_train_long['topic'].value_counts()}\n")
print(f"English test topic balance: \n{opinions_en_test_long['topic'].value_counts()}\n")
print(f"Italian train topic balance: \n{opinions_it_train_long['topic'].value_counts()}\n")
print(f"Italian test topic balance: \n{opinions_it_test_long['topic'].value_counts()}\n")

In [None]:
# Checking the topic and truth balance of the splits
print(f"English train topic balance: \n{opinions_en_train_long.groupby('topic')['GT'].value_counts()}\n")
print(f"English test topic balance: \n{opinions_en_test_long.groupby('topic')['GT'].value_counts()}\n")
print(f"Italian train topic balance: \n{opinions_it_train_long.groupby('topic')['GT'].value_counts()}\n")
print(f"Italian test topic balance: \n{opinions_it_test_long.groupby('topic')['GT'].value_counts()}\n")

## Obtaining random and balanced samples to pass to the model

In the paper, 100 people were each asked to classify 10 randomly extracted opinions as truthful or deceptive in their respective language, using a binary response modality. The overall classification performance was then computed separately for the U.S. and the Italian participants. That is equivalent to 1000 samples per language. However, the paper does not state explicitly whether the opinions were sampled with or without replacement. Assuming that repetitions were possible, drawing the 1000 samples at once would not reproduce the methodology of the paper, so we follow the paper as closely as possible in this respect.

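What we can do is draw 100 subsets of 10 randomly sampled opinions each and run them through the model. We do this for each language and compute the overall accuracy. (Note: the number of subsets actually drawn equals the number of seeds in `random_seeds` below, which is currently 10.)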

In [None]:
# Helper that draws rows from one group; applying it group by group yields a
# sample that is balanced across the groups
def balanced_sample(group, n, seed):
    """Return `n` rows drawn at random from `group`, seeded with `seed`."""
    drawn = group.sample(n=n, random_state=seed)
    return drawn

In [None]:
# Random seeds to obtain reproducible results. One subset is drawn per seed,
# so the number of seeds is the number of subsets.
random_seeds = np.arange(0, 10)

# Since we have 5 topics and 2 labels, each subset of 10 opinions is obtained
# by sampling 1 opinion for every (topic, label) combination
def get_test_subsets(df, seeds=None):
  """Draw one balanced and shuffled subset of `df` per seed.

  Parameters:
    df: long-format dataframe with (at least) the columns 'topic' and 'GT'.
    seeds: iterable of seeds, one per subset to draw. Defaults to the
      module-level `random_seeds`.

  Returns:
    A list with one dataframe per seed. Each dataframe holds exactly one row
    for every ('topic', 'GT') group present in `df`, keeps the original index
    labels of `df`, and has its rows in a shuffled order that is fully
    determined by the seed.
  """
  if seeds is None:
    seeds = random_seeds

  random_subsets = []
  for seed in seeds:
    # The groups are iterated explicitly (instead of using `groupby.apply`) so
    # the 'topic' and 'GT' columns are always part of the sampled rows; newer
    # pandas versions deprecate passing grouping columns to `apply`.
    picked = [group.sample(n=1, random_state=seed)
              for _, group in df.groupby(by=['topic', 'GT'])]
    balanced_sampled_df = pd.concat(picked) if picked else df.iloc[0:0]
    # The shuffle is seeded as well: an unseeded `sample(frac=1)` gives a
    # different row order on every execution.
    balanced_sampled_df = balanced_sampled_df.sample(frac=1, random_state=seed)
    random_subsets.append(balanced_sampled_df)
  return random_subsets

opinions_en_test_subsets = get_test_subsets(opinions_en_test_long)
opinions_it_test_subsets = get_test_subsets(opinions_it_test_long)

In [None]:
# Checking the topic and truth balance of two of the test subsets.
# Both frames printed here are *test* subsets, so both labels say "test".
print(f"English test topic balance: \n{opinions_en_test_subsets[0].groupby('topic')['GT'].value_counts()}\n")
print(f"Italian test topic balance: \n{opinions_it_test_subsets[5].groupby('topic')['GT'].value_counts()}\n")

In [None]:
# Checking a subset
opinions_en_test_subsets[4]

In [None]:
# Alternative without balancing
opinions_en_test_long.sample(10, random_state=10)

# Zero-shot

In this case, we will run the prompt without giving the model any previous information, i.e. without any labelled example opinions.

In [None]:
# Trying the zero-shot prompt on a single opinion before running it at scale
opinion_example = opinions_en_test_long['opinion'][0]
output = replicate.run(
    "replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
    input={
        "prompt": f"This is the opinion in question: {opinion_example}",
        "system_prompt": """
           I am going to give you a person's opinion about a certain topic.
           Tell me if you think that it corresponds to their honest opinion on that topic or not.
           Answer only with 'T' (it corresponds to their honest opinion) or 'F' (it does not correspond to their honest opinion).""",
        "max_new_tokens": 5,
    },
)

# The output is iterated piece by piece; the pieces are glued into one string
full_response = "".join(output)

print(f"The opinion given to Llama 2 was: {opinion_example}")
print(f"Llama2s'answer: {full_response}")

In [None]:
# Defining the function that asks Llama for its answer on a single opinion
def get_zero_shot_answer(row):
  """Return Llama 2's zero-shot answer for the opinion stored in `row`.

  `row` must support `row['opinion']` (e.g. a dataframe row). The pieces
  produced by `replicate.run` are concatenated and returned unchanged, so the
  result is not stripped or otherwise normalized.
  """
  model = "replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
  system_prompt = """
                                I am going to give you a person's opinion about a certain topic.
                                Tell me if you think that it corresponds to their honest opinion on that topic or not.
                                Answer only with 'T' (it corresponds to their honest opinion) or 'F' (it does not correspond to their honest opinion)."""
  request = {
      "prompt": f"This is the opinion in question: {row['opinion']}",
      "system_prompt": system_prompt,
      "max_new_tokens": 5,
  }
  return "".join(replicate.run(model, input=request))

In [None]:
# Runs the zero-shot prompt over every randomly selected subset
def run_zero_shot(subsets_list):
  """Query Llama 2 for every row of every subset and stack the results.

  Each dataframe in `subsets_list` is modified in place: it receives an
  'id_run' column (position of the subset in the list) and a
  'LLama2_zero_shot' column with the answer obtained for each row. A progress
  line is printed after each subset.

  Returns the row-wise concatenation of all subsets with a fresh index.
  """
  for run_id, frame in enumerate(subsets_list):
    frame['id_run'] = run_id
    answers = frame.apply(get_zero_shot_answer, axis=1)
    frame['LLama2_zero_shot'] = answers
    print(f"It has processed {run_id+1} subsets.")
  return pd.concat(subsets_list, ignore_index=True)

## Zero-shot for the english dataset

In [None]:
# Running and checking the results of the english dataset
opinions_en_results_zs = run_zero_shot(opinions_en_test_subsets)
opinions_en_results_zs.head(15)

In [None]:
# We can check what were the Llama's responses
opinions_en_results_zs['LLama2_zero_shot'].unique()

In [None]:
# By checking, it seems that there are spaces at the beginning or the end
# of the Llama column, so maybe it's better to strip the result to compare it
# with the ground truth.
opinions_en_results_zs['LLama2_zero_shot'] = opinions_en_results_zs['LLama2_zero_shot'].str.strip()

In [None]:
# Writing the zero shot results for the english dataset
#opinions_en_results_zs.to_csv('../Results/opinions_en_results_zero_shot_new_run2.csv', index=False)

## Zero-shot for the italian dataset

In [None]:
# Running and checking the results of the italian dataset
opinions_it_results = run_zero_shot(opinions_it_test_subsets)
opinions_it_results.head(15)

In [None]:
opinions_it_results['LLama2_zero_shot'].unique()

In [None]:
# In this case, we also strip the results
opinions_it_results['LLama2_zero_shot'] = opinions_it_results['LLama2_zero_shot'].str.strip()

In [None]:
opinions_it_results['LLama2_zero_shot'].value_counts()

In [None]:
# Writing the zero shot results for the italian dataset
#opinions_it_results.to_csv('../Results/opinions_it_results_zero_shot_new_run2.csv', index=False)

# Few-shot

We will do three few-shot approaches (each prompt includes two labelled example opinions):


*   Giving two examples in the same language as the test opinion.
*   Giving one example in english and one example in italian.
*   Giving two examples in a different language (with respect to the test opinion's language).



In [None]:
# Trying the few-shot prompt on a single test opinion, using two training
# opinions (with their labels) as the examples shown to the model
opinion_test = opinions_en_test_long['opinion'][96]

row_opinion_example1 = opinions_en_train_long.iloc[0]
row_opinion_example2 = opinions_en_train_long.iloc[12]
output_fs_en = replicate.run(
    "replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
    input={
        "prompt": f"This is the test opinion: {opinion_test}",
        "system_prompt": f"""
           I am going to give you a person's opinion about a certain topic. This will be the test opinion.
           Tell me if you think that it corresponds to their honest opinion on that topic or not.
           Answer only with 'T' (it corresponds to their honest opinion) or 'F' (it does not correspond to their honest opinion).

           As an example, I will give you two different sample opinions and their corresponding label ('T' or 'F'):
           - Sample opinion 1: {row_opinion_example1['opinion']}
           - Label of opinion 1: {row_opinion_example1['GT']}

           - Sample opinion 2: {row_opinion_example2['opinion']}
           - Label of opinion 2: {row_opinion_example2['GT']}

           Please, only respond regarding the opinion of interest. The output should only be one letter ('T' or 'F')
           """,
        "max_new_tokens": 5,
    },
)

# The output is iterated piece by piece; the pieces are glued into one string
full_response_fs = "".join(output_fs_en)

print(f"The opinion given to Llama 2 was: {opinion_test}")
print(f"Llama2s'answer: {full_response_fs}")

In [None]:
# Defining the function to get Llama's answer with respect to each opinion
def get_few_shot_answer(row_opinion_test, language_examples):
  """Return Llama 2's few-shot answer for the test opinion in `row_opinion_test`.

  Parameters:
    row_opinion_test: row (or mapping) holding the test opinion under
      'opinion' and the candidate examples under 'opinion_example_<suffix>' /
      'GT_example_<suffix>', with <suffix> in en1, en2, it1, it2.
    language_examples: which two examples are shown to the model:
      'english' (en1 + en2), 'italian' (it1 + it2) or 'mixed' (en1 + it1).

  Returns:
    The pieces produced by `replicate.run`, concatenated into one string and
    returned unchanged (not stripped).

  Raises:
    ValueError: if `language_examples` is not one of the supported values.
  """
  # Column suffixes of the two examples shown to the LLM before doing the
  # test, depending on the type of task
  example_suffixes = {
      'english': ('en1', 'en2'),
      'italian': ('it1', 'it2'),
      'mixed': ('en1', 'it1'),
  }
  # Fail early with a clear message; otherwise an unknown value would only
  # surface later as an unbound-variable error while building the prompt.
  if language_examples not in example_suffixes:
    raise ValueError(
        f"language_examples must be one of {sorted(example_suffixes)}, "
        f"got {language_examples!r}")

  first, second = example_suffixes[language_examples]
  opinion_example1 = row_opinion_test[f'opinion_example_{first}']
  opinion_example2 = row_opinion_test[f'opinion_example_{second}']
  GT_example1 = row_opinion_test[f'GT_example_{first}']
  GT_example2 = row_opinion_test[f'GT_example_{second}']

  # Getting the test opinion
  opinion_test = row_opinion_test['opinion']

  system_prompt = f"""
           I am going to give you a person's opinion about a certain topic. This will be the test opinion.
           Tell me if you think that it corresponds to their honest opinion on that topic or not.
           Answer only with 'T' (it corresponds to their honest opinion) or 'F' (it does not correspond to their honest opinion).

           As an example, I will give you two different sample opinions and their corresponding label ('T' or 'F'):
           - Sample opinion 1: {opinion_example1}
           - Label of opinion 1: {GT_example1}

           - Sample opinion 2: {opinion_example2}
           - Label of opinion 2: {GT_example2}

           Please, only respond regarding the opinion of interest. The output should only be one letter ('T' or 'F')
           """

  output = replicate.run("replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
                         input={"prompt": f"This is the test opinion: {opinion_test}",
                                "system_prompt": system_prompt,
                                "max_new_tokens": 5})

  # The output is iterated piece by piece; the pieces are joined into one string
  return "".join(output)

Getting the examples for each case

In [None]:
# Adding the few-shot examples as columns to the test dataframes.
# Each dataframe receives four example sets: two in english and two in italian.
# Depending on the task, when prompting Llama, we select two of the examples.
# Every (subset, example set) pair is sampled with its own random seed to
# obtain reproducible results.
def get_examples_few_shot(subsets_list, train_en=None, train_it=None):
  """Attach randomly sampled training examples to every test subset.

  Parameters:
    subsets_list: list of test dataframes.
    train_en: dataframe with 'opinion' and 'GT' columns from which the english
      examples are drawn. Defaults to the global `opinions_en_train_long`.
    train_it: same for the italian examples. Defaults to the global
      `opinions_it_train_long`.

  Returns:
    A new list with one dataframe per input subset. Each one is the subset
    (index reset) with eight extra columns, 'opinion_example_<s>' and
    'GT_example_<s>' for <s> in en1, en2, it1, it2, holding one sampled
    example per row. The input dataframes are not modified.
  """
  if train_en is None:
    train_en = opinions_en_train_long
  if train_it is None:
    train_it = opinions_it_train_long

  # Seed layout: subset i uses seed `offset + i` for each example set. With up
  # to 10 subsets the offsets are 0/10/20/30, as before; with more subsets the
  # stride grows so that no seed is shared between two example sets.
  stride = max(10, len(subsets_list))
  example_sets = [
      ('en1', train_en, 0),
      ('en2', train_en, stride),
      ('it1', train_it, 2 * stride),
      ('it2', train_it, 3 * stride),
  ]

  subsets_list_result = []
  for i, subset in enumerate(subsets_list):
    pieces = [subset.reset_index(drop=True)]
    for suffix, pool, offset in example_sets:
      # One example per row of the subset, so the column-wise concat lines up
      examples = (pool[['opinion', 'GT']]
                  .sample(len(subset), random_state=offset + i)
                  .reset_index(drop=True)
                  .rename(columns={'opinion': f'opinion_example_{suffix}',
                                   'GT': f'GT_example_{suffix}'}))
      pieces.append(examples)
    subsets_list_result.append(pd.concat(pieces, axis=1))
  return subsets_list_result

In [None]:
# Executing the function to extract the examples for each observation
opinions_en_test_subsets = get_examples_few_shot(opinions_en_test_subsets)
opinions_it_test_subsets = get_examples_few_shot(opinions_it_test_subsets)

In [None]:
# Checking the new structure
opinions_en_test_subsets[4]

In [None]:
# Runs the few-shot prompt over every randomly selected subset.
# The examples shown to the model can be two in english, two in italian or
# two of mixed language.
def run_few_shot(subsets_list, language_examples):
  """Query Llama 2 with few-shot prompts for every row of every subset.

  Each dataframe in `subsets_list` is modified in place: it receives an
  'id_run' column (position of the subset in the list) and a column named
  'LLama2_few_shot_' + `language_examples` with the answer obtained for each
  row. A progress line is printed after each subset.

  Returns the row-wise concatenation of all subsets with a fresh index.
  """
  for run_id, frame in enumerate(subsets_list):
    answer_col = 'LLama2_few_shot_' + language_examples
    frame['id_run'] = run_id
    answers = frame.apply(get_few_shot_answer, axis=1,
                          language_examples=language_examples)
    frame[answer_col] = answers
    print(f"It has processed {run_id+1} subsets.")
  return pd.concat(subsets_list, ignore_index=True)

## Few-shot for the english dataset

### Two examples in english

In [None]:
# Running and checking the results of the english dataset with the within language few-shot
opinions_en_results_fs_within = run_few_shot(opinions_en_test_subsets, language_examples='english')
opinions_en_results_fs_within.head(15)

In [None]:
# We can check what were the Llama's responses
opinions_en_results_fs_within['LLama2_few_shot_english'].unique()

In [None]:
# In this case, we also strip the results
opinions_en_results_fs_within['LLama2_few_shot_english'] = opinions_en_results_fs_within['LLama2_few_shot_english'].str.strip()

In [None]:
# Writing the few-shot results for the english dataset and the within language test
#opinions_en_results_fs_within.to_csv('../Results/opinions_en_results_fs_within.csv', index=False)

### Two examples in italian

In [None]:
# Running and checking the results of the english dataset with the few-shot in other language
opinions_en_results_fs_other = run_few_shot(opinions_en_test_subsets, language_examples='italian')
opinions_en_results_fs_other.head(15)

In [None]:
# We can check what were the Llama's responses
opinions_en_results_fs_other['LLama2_few_shot_italian'].unique()

In [None]:
# In this case, we also strip the results
opinions_en_results_fs_other['LLama2_few_shot_italian'] = opinions_en_results_fs_other['LLama2_few_shot_italian'].str.strip()

In [None]:
# Writing the few-shot results for the english dataset and the "other" language test
#opinions_en_results_fs_other.to_csv('../Results/opinions_en_results_fs_other.csv', index=False)

### Two mixed-language examples

In [None]:
# Running and checking the results of the english dataset with the mixed language few-shot
opinions_en_results_fs_mixed = run_few_shot(opinions_en_test_subsets, language_examples='mixed')
opinions_en_results_fs_mixed.head(15)

In [None]:
# We can check what were the Llama's responses
opinions_en_results_fs_mixed['LLama2_few_shot_mixed'].unique()

In [None]:
# In this case, we also strip the results
opinions_en_results_fs_mixed['LLama2_few_shot_mixed'] = opinions_en_results_fs_mixed['LLama2_few_shot_mixed'].str.strip()

In [None]:
# Writing the few-shot results for the english dataset and the mixed language test
#opinions_en_results_fs_mixed.to_csv('../Results/opinions_en_results_fs_mixed.csv', index=False)

## Few-shot for the italian dataset

### Two examples in italian

In [None]:
# Running and checking the results of the italian dataset with the within language few-shot
opinions_it_results_fs_within = run_few_shot(opinions_it_test_subsets, language_examples='italian')
opinions_it_results_fs_within.head(15)

In [None]:
# We can check what were the Llama's responses
opinions_it_results_fs_within['LLama2_few_shot_italian'].unique()

In [None]:
# In this case, we also strip the results
opinions_it_results_fs_within['LLama2_few_shot_italian'] = opinions_it_results_fs_within['LLama2_few_shot_italian'].str.strip()

In [None]:
# Writing the few-shot results for the italian dataset and the within language test
#opinions_it_results_fs_within.to_csv('../Results/opinions_it_results_fs_within.csv', index=False)

### Two examples in english

In [None]:
# Running and checking the results of the italian dataset with the "other" language few-shot
opinions_it_results_fs_other = run_few_shot(opinions_it_test_subsets, language_examples='english')
opinions_it_results_fs_other.head(15)

In [None]:
# We can check what were the Llama's responses
opinions_it_results_fs_other['LLama2_few_shot_english'].unique()

In [None]:
# In this case, we also strip the results
opinions_it_results_fs_other['LLama2_few_shot_english'] = opinions_it_results_fs_other['LLama2_few_shot_english'].str.strip()

In [None]:
# Writing the few-shot results for the italian dataset and the "other" language test
#opinions_it_results_fs_other.to_csv('../Results/opinions_it_results_fs_other.csv', index=False)

### Two mixed-language examples

In [None]:
# Running and checking the results of the italian dataset with the mixed language test
opinions_it_results_fs_mixed = run_few_shot(opinions_it_test_subsets, language_examples='mixed')
opinions_it_results_fs_mixed.head(15)

In [None]:
# We can check what were the Llama's responses
opinions_it_results_fs_mixed['LLama2_few_shot_mixed'].unique()

In [None]:
# In this case, we also strip the results
opinions_it_results_fs_mixed['LLama2_few_shot_mixed'] = opinions_it_results_fs_mixed['LLama2_few_shot_mixed'].str.strip()

In [None]:
# Writing the few-shot results for the italian dataset and the mixed language test
#opinions_it_results_fs_mixed.to_csv('../Results/opinions_it_results_fs_mixed.csv', index=False)