# Server 3: Verification

In [None]:
import os
import csv
import json
import time
import openai
import pickle
import random
import tiktoken
import py3langid
import numpy as np
import pandas as pd
from sklearn import metrics
from openai import AzureOpenAI

In [None]:
# Default number of text pairs drawn by sampler_fn.
DATA_SIZE = 30
api_version = "2023-12-01-preview"  # "2023-05-15"
# Short model key -> Azure deployment name sent as `model=` in API calls.
deploy_name_map = {"gpt-4-turbo": "GPT4-WEST-US", "gpt-35-1106": "GPT-35-1106"}
# Short model key -> display name used in the result tables.
official_name_map = {"gpt-4-turbo": "GPT-4 Turbo", "gpt-35-1106": "GPT-3.5 Turbo"}
# SECURITY: the API key is read from the environment instead of being committed
# with the notebook. The key that used to be hard-coded on this line must be
# treated as leaked and rotated. A missing variable fails here with a KeyError
# rather than later on the first request.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=api_version,
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "https://gpt-35-1106.openai.azure.com"),
)

In [None]:
def eval_fn(y_test, y_pred, average='binary', print_flag=True):
    """Score predictions against the ground truth with scikit-learn metrics.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        average: averaging mode forwarded to the F1, recall and precision scorers.
        print_flag: when true, print a one-line summary of the four scores.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; each value is a percentage
        rounded to two decimals.
    """
    def as_percent(score):
        return round(score*100, 2)

    acc = as_percent(metrics.accuracy_score(y_test, y_pred))
    f1 = as_percent(metrics.f1_score(y_test, y_pred, average=average))
    recall = as_percent(metrics.recall_score(y_test, y_pred, average=average))
    precision = as_percent(metrics.precision_score(y_test, y_pred, average=average))

    if print_flag:
        print("Accuracy:", acc, "% | Precision:", precision, "% | Recall:", recall, "% | F1:", f1, "%\n")
    return acc, precision, recall, f1


def compare_baseline(exp_df_ls, exp_model_ls, exp_method_ls):
    """Build one summary row per experiment result frame.

    Args:
        exp_df_ls: result frames, each with a ``same`` (truth) and an
            ``answer`` (prediction) column.
        exp_model_ls: model label for each frame, aligned by position.
        exp_method_ls: prompt/method label for each frame, aligned by position.

    Returns:
        DataFrame with the method label (column ``Prompt``), the model label,
        the four scores from ``eval_fn`` and the frame's shape (rows in
        ``Size``, column count in ``df.shape[1]``).
    """
    header = ['Prompt', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Size', 'df.shape[1]']
    rows = [
        (
            exp_method_ls[idx],
            exp_model_ls[idx],
            *eval_fn(frame["same"], frame["answer"], print_flag=False),
            *frame.shape,
        )
        for idx, frame in enumerate(exp_df_ls)
    ]
    return pd.DataFrame(rows, columns=header)


def num_tokens_from_string(texts, model_id):
    """Return how many tokens `texts` encodes to with the tiktoken encoding for `model_id`."""
    tokenizer = tiktoken.encoding_for_model(model_id)
    return len(tokenizer.encode(texts))

## Data Prep (Blog)

In [None]:
# Load the blog corpus and discard the metadata columns listed below.
df = pd.read_csv("/data1/baixiang/dataset/blogtext.csv")
df.drop(columns=['gender', 'age', 'topic', 'sign', 'date'], inplace=True)
df.shape

In [None]:
# Display every row whose text occurs more than once, sorted so identical texts sit together
df[df.duplicated(subset=['text'], keep=False)].sort_values('text')

In [None]:
# Report the duplicate count two ways, then keep the first row of every distinct text.
print('# duplicates:', df['text'].duplicated().sum(), 'sanity check:', len(df) - len(set(df['text'])))
print('Before removing duplicates, df.shape:', df.shape)
df = df.drop_duplicates(subset='text', keep='first').reset_index(drop=True)
print('New df.shape:', df.shape)

In [None]:
# Keep only the rows of authors that have at least two texts.
v = df['id'].value_counts()
multi_text_ids = v[v >= 2].index
df = df.loc[df['id'].isin(multi_text_ids)]
print('# unique authors', len(df['id'].unique()))
df.shape

In [None]:
%%time
print(f"{df.shape[0]:,}")
df['lang'] = df['text'].apply(lambda x: py3langid.classify(x)[0])
print('% of English text:', f"{df[df.lang=='en'].shape[0] / df.shape[0]}")

df = df[df.lang=='en']
df.drop('lang', axis=1, inplace=True)
print(f"{df.shape[0]:,}") 

In [None]:
# Spot-check the combined token count of ten randomly drawn pairs of texts
for _ in range(10):
    pair = df.sample(2).text.values
    print(num_tokens_from_string(pair[0] + pair[1], "gpt-3.5-turbo"))

## Data Prep (Mail)
https://www.kaggle.com/datasets/wcukierski/enron-email-dataset

In [None]:
# Load the Enron e-mail CSV; its `message` column is parsed into headers and body text further down.
emails_df = pd.read_csv("/data1/baixiang/dataset/enron-emails.csv")
emails_df

In [None]:
import email
def get_text_from_email(msg):
    '''Concatenate the payloads of every text/plain part of an email message.

    Walks the whole message tree; parts of any other content type are skipped.
    Returns an empty string when there is no text/plain part.
    '''
    return ''.join(
        part.get_payload()
        for part in msg.walk()
        if part.get_content_type() == 'text/plain'
    )


def split_email_addresses(line):
    '''Split a comma-separated address header into a frozenset of stripped addresses.

    Returns None when `line` is empty or None.
    '''
    if not line:
        return None
    return frozenset(addr.strip() for addr in line.split(','))


# Turn every raw message string into an email message object
messages = [email.message_from_string(raw) for raw in emails_df['message']]
# Add one column per header present on the first message
for key in messages[0].keys():
    emails_df[key] = [doc[key] for doc in messages]
emails_df['Text'] = [get_text_from_email(doc) for doc in messages]
# Sender and recipient headers become frozensets of addresses (None when empty)
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)
del messages
emails_df = emails_df[['From', 'To', 'Text', 'Date', 'message']]
emails_df.head(2)

In [None]:
# Report messages that list more than one sender. split_email_addresses stores
# the addresses as a frozenset, so the previous `type(sender) is list` test
# could never be true and this check was silently dead.
for sender in emails_df['From']:
    if isinstance(sender, (frozenset, set, list, tuple)) and len(sender) > 1:
        print('More than 1 sender:', sender)

# Collapse each sender set to a single address. For a one-element set `min`
# returns that element; for several senders it picks the same address on every
# run, whereas the first element of a frozenset depends on string hashing.
# `assign` builds a new frame instead of writing into the column subset that
# emails_df currently is.
emails_df = emails_df.assign(From=emails_df["From"].apply(lambda addrs: min(addrs)))
emails_df

In [None]:
# Shape of the rows whose body text is shared with at least one other row
emails_df[emails_df.duplicated(subset=['Text'], keep=False)].sort_values('Text').shape

In [None]:
# Keep the first message for each distinct body text; only 'Text' is compared
# (an earlier variant compared From/To/Text/Date together).
emails_df = emails_df.drop_duplicates(subset='Text', keep='first').reset_index(drop=True)
emails_df.shape

In [None]:
# Work on a renamed copy so emails_df itself stays untouched
mail_corpus = emails_df.copy()
mail_corpus.columns = ['user', 'receiver', 'text', 'date', 'message_old']

# Number the distinct senders in order of first appearance and derive an id of the form "mail_<n>"
unique_author = mail_corpus['user'].unique()
email_mapping = dict(zip(unique_author, range(len(unique_author))))
mail_corpus['id'] = mail_corpus['user'].map(lambda sender: f"mail_{email_mapping[sender]}")
mail_corpus

In [None]:
# `df` is an alias of `mail_corpus` (no copy is made), so the in-place drop
# below removes the columns from `mail_corpus` as well.
df = mail_corpus
df.drop(['user', 'receiver', 'date', 'message_old'], axis=1, inplace=True)
# Shape of the rows whose text is exactly '' — measured before whitespace is stripped
print(df[df['text']==''].shape)
df.text = df.text.str.strip()
# Per-column flag: does the column contain any null value?
df.isnull().any()

In [None]:
print(f"{df.shape[0]:,}")
print(df[df['text']==''].shape)
# dropna() does not treat '' as missing, so the empty texts counted above used
# to survive this cell. Blank them out to NaN after stripping so that the
# dropna below really removes them.
stripped_text = df.text.str.strip()
df.text = stripped_text.mask(stripped_text == '')
df.dropna(inplace=True)
print(f"{df.shape[0]:,}")
df.isnull().any()

In [None]:
# Display every row whose text occurs more than once, sorted so identical texts sit together
df[df.duplicated(subset=['text'], keep=False)].sort_values('text')

In [None]:
# Report the duplicate count two ways, then keep the first row of every distinct text.
print('# duplicates:', df['text'].duplicated().sum(), 'sanity check:', len(df) - len(set(df['text'])))
print('Before removing duplicates, df.shape:', df.shape)
df = df.drop_duplicates(subset='text', keep='first').reset_index(drop=True)
print('New df.shape:', df.shape)

In [None]:
# Keep only the rows of authors that have at least two texts.
v = df['id'].value_counts()
multi_text_ids = v[v >= 2].index
df = df.loc[df['id'].isin(multi_text_ids)]
print('# unique authors', len(df['id'].unique()))
df.shape

In [None]:
%%time
print(f"{df.shape[0]:,}")
df['lang'] = df['text'].apply(lambda x: py3langid.classify(x)[0])
print('% of English text:', f"{df[df.lang=='en'].shape[0] / df.shape[0]}")

df = df[df.lang=='en']
df.drop('lang', axis=1, inplace=True)
print(f"{df.shape[0]:,}") 

In [None]:
# Spot-check the combined token count of ten randomly drawn pairs of texts
for _ in range(10):
    pair = df.sample(2).text.values
    print(num_tokens_from_string(pair[0] + pair[1], "gpt-3.5-turbo"))

## Sample data

In [None]:
def sampler_fn(df, size=DATA_SIZE):
    """Sample a balanced subset of text pairs.

    Pairs at even positions (0, 2, ...) hold one text from each of two
    different authors; pairs at odd positions hold two different rows of a
    single author. Every author is used in at most one pair, and a text that
    already appears in an earlier pair is never drawn again.

    Args:
        df: frame with an ``id`` (author) column and a ``text`` column.
        size: number of pairs to generate.

    Returns:
        DataFrame with columns ``text1``, ``text2``, ``aut_id1``, ``aut_id2``
        and ``same`` (True when both texts come from the same author).

    Raises:
        ValueError: if ``df`` has fewer than ``size*2`` distinct authors, or a
            chosen author does not have enough unused texts left to draw from.
    """
    pair_columns = ["text1", "text2", "aut_id1", "aut_id2"]
    dict_to_df = []
    text_set = set()

    authors = df.id.unique().tolist()
    needed = size*2
    if len(authors) < needed:
        raise ValueError(
            f"sampler_fn needs at least {needed} distinct authors to draw {size} pairs, "
            f"but only {len(authors)} are available"
        )
    author_ls = random.sample(authors, needed)

    def draw_texts(author_id, n):
        # Draw from the author's unused texts only. The earlier rejection loop
        # could spin forever once an author had too few unused texts; sampling
        # from the filtered pool raises ValueError instead.
        pool = df.loc[df.id == author_id, 'text']
        pool = pool[~pool.isin(text_set)]
        return pool.sample(n).tolist()

    for i in range(size):
        if i % 2 == 0:  # sample documents from different authors
            aut_id1, aut_id2 = random.sample(author_ls, 2)
            text1 = draw_texts(aut_id1, 1)[0]
            text2 = draw_texts(aut_id2, 1)[0]
            author_ls.remove(aut_id1)
            author_ls.remove(aut_id2)
        else:  # sample documents from the same author to keep the set balanced
            same_auth_id = random.choice(author_ls)
            author_ls.remove(same_auth_id)
            aut_id1, aut_id2 = same_auth_id, same_auth_id
            text1, text2 = draw_texts(same_auth_id, 2)

        dict_to_df.append({"text1": text1, "text2": text2, "aut_id1": aut_id1, "aut_id2": aut_id2})
        text_set.add(text1)
        text_set.add(text2)

    # Explicit columns keep the frame well-formed even when size == 0.
    df_sub = pd.DataFrame(dict_to_df, columns=pair_columns)
    df_sub['same'] = df_sub.aut_id1 == df_sub.aut_id2
    # Previously the second figure was the number of unique aut_id1 values,
    # which is not the number of different-author pairs.
    print('# same authors:', df_sub['same'].sum(), '# different authors:', (~df_sub['same']).sum())
    return df_sub
    

# df_sub = sampler_fn(df)
# A previously saved subset is loaded from disk here; the sampler_fn call above is left commented out.
df_sub = pd.read_csv("llm-verify-res/df_sub_blog_30.csv")
print(df_sub.shape)
df_sub.head()

In [None]:
# For each pair column, print the shape of the rows whose value occurs more than once
for column in ('text1', 'text2', 'aut_id1', 'aut_id2'):
    print(df_sub[df_sub[[column]].duplicated(keep=False)].shape)
# Avg number of words, averaged over the two text columns
mean_word_counts = [df_sub[column].apply(lambda x: len(x.split())).mean() for column in ('text1', 'text2')]
sum(mean_word_counts) / 2

In [None]:
# Distinct author ids that appear in the first slot of a pair
df_sub['aut_id1'].unique()

In [None]:
# Combined token count of each of the first five pairs
for row in range(5):
    print(num_tokens_from_string(df_sub.loc[row, 'text1'] + df_sub.loc[row, 'text2'], "gpt-3.5-turbo"))

In [None]:
def _coerce_answer(value):
    """Return `value` as a bool, or None when it cannot be read as one.

    Real booleans pass through. The strings "true"/"false" (any case,
    surrounding whitespace ignored) are mapped explicitly because
    ``bool("False")`` is True. Numbers follow ordinary truthiness.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return {"true": True, "false": False}.get(value.strip().lower())
    if isinstance(value, (int, float)):
        return bool(value)
    return None


def run_verfication(df, method, model_name, prompt_prefix, system_msg, ls_df, ls_model, ls_method, prompt_postfix=""):
    """Ask a chat model whether each text pair in `df` shares an author and score the answers.

    Args:
        df: pairs to verify, with columns ``text1``, ``text2``, ``aut_id1``, ``aut_id2``.
        method: label of the prompt variant; printed and appended to `ls_method`.
        model_name: key into ``deploy_name_map`` / ``official_name_map``.
        prompt_prefix: instruction text placed before the two input texts.
        system_msg: content of the system message.
        ls_df: list that receives the result frame (mutated in place).
        ls_model: list that receives the model's display name (mutated in place).
        ls_method: list that receives `method` (mutated in place).
        prompt_postfix: optional text appended after the input texts.

    Returns:
        DataFrame with one row per pair: the fields of the model's JSON reply
        plus ``text1``, ``text2``, ``author_id1``, ``author_id2``, ``tokens``,
        ``same`` (ground truth) and ``answer`` (boolean prediction).

    A reply that cannot be used (invalid JSON, no content, not a JSON object,
    or no recognisable boolean under "answer") is scored as a wrong answer.
    """
    ls = []
    start_time = time.time()
    print("\n++++++++++ ", method, model_name, " ++++++++++")

    for i in df.index:
        aut_id1, aut_id2 = df.loc[i, 'aut_id1'], df.loc[i, 'aut_id2']
        text1, text2 = df.loc[i, 'text1'], df.loc[i, 'text2']
        prompt = prompt_prefix + f"""The input texts (Text 1 and Text 2) are delimited with triple backticks. ```\n\nText 1: {text1}, \n\nText 2: {text2}\n\n```""" + prompt_postfix

        raw_response = client.chat.completions.create(
            model=deploy_name_map[model_name],
            response_format={"type": "json_object"} if model_name in ["gpt-35-1106", "gpt-4-turbo"] else None,
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )

        response_str = raw_response.choices[0].message.content
        print('Raw response content:', response_str, '\n')

        # Unusable replies are deliberately counted against the model.
        wrong_answer = not (aut_id1 == aut_id2)
        try:
            response = json.loads(response_str, strict=False)
        except (json.JSONDecodeError, TypeError):  # TypeError: content was None
            print(f"===== JSONDecodeError =====\n")
            response = {'answer': wrong_answer, 'analysis': 'JSONDecodeError' + str(response_str)}

        if not isinstance(response, dict):
            # Valid JSON, but not the requested object (e.g. a bare string or list).
            response = {'answer': wrong_answer, 'analysis': 'JSONDecodeError' + str(response_str)}

        # astype('bool') alone would read the string "False" (or a missing
        # answer, which becomes NaN) as True, so normalise every answer here.
        answer = _coerce_answer(response.get('answer'))
        if answer is None:
            print("===== Unrecognised answer =====\n")
            answer = wrong_answer
        response['answer'] = answer

        response["text1"], response["text2"] = text1, text2
        response["author_id1"], response["author_id2"] = aut_id1, aut_id2
        response["tokens"] = raw_response.usage.total_tokens
        ls.append(response)

    df_res = pd.DataFrame(ls)
    ls_df.append(df_res)
    ls_method.append(method)
    ls_model.append(official_name_map[model_name])
    df_res['same'] = df_res.author_id1 == df_res.author_id2
    df_res["answer"] = df_res["answer"].astype('bool')
    eval_fn(df_res["same"], df_res["answer"])
    print("--- Execution Time: %s seconds ---" % round(time.time() - start_time, 2))
    return df_res

In [None]:
# Labels of the four prompt variants, passed to run_verfication as `method`
v1 = 'no_guidance'
v2 = 'little_guidance'
v3 = 'grammar'
v4 = 'LIP'

## Experiments

In [None]:
# System message sent with every request: asks for a JSON object with "analysis" and "answer" keys.
system_msg = """
Respond with a JSON object including two key elements:
{
  "analysis": Reasoning behind your answer.
  "answer":  A boolean (True/False) answer.
}
"""
# Prompt variant used with label v1: the bare task statement.
prompt1 = """
Verify if two input texts were written by the same author.
"""
# Prompt variant used with label v2: adds an instruction to analyze writing style and ignore topic/content.
prompt2 = """
Verify if two input texts were written by the same author. Analyze the writing styles of the input texts, disregarding the differences in topic and content.
"""
# Prompt variant used with label v3: directs attention to grammatical style.
prompt3 = """
Verify if two input texts were written by the same author. Focus on grammatical styles indicative of authorship.
"""
# Prompt variant used with label v4: the v2 wording plus an explicit list of linguistic features to reason over.
prompt4 = """
Verify if two input texts were written by the same author. Analyze the writing styles of the input texts, disregarding the differences in topic and content. Reasoning based on linguistic features such as phrasal verbs, modal verbs, punctuation, rare words, affixes, quantities, humor, sarcasm, typographical errors, and misspellings. 
""" 

In [None]:
%%time
ls_df_1, ls_model_1, ls_method_1 = [], [], []

df1_gpt35 = run_verfication(df_sub, v1, 'gpt-35-1106', prompt1, system_msg, ls_df_1, ls_model_1, ls_method_1)
df1_gpt4 = run_verfication(df_sub, v1, 'gpt-4-turbo', prompt1, system_msg, ls_df_1, ls_model_1, ls_method_1)

df2_gpt35 = run_verfication(df_sub, v2, 'gpt-35-1106', prompt2, system_msg, ls_df_1, ls_model_1, ls_method_1)
df2_gpt4 = run_verfication(df_sub, v2, 'gpt-4-turbo', prompt2, system_msg, ls_df_1, ls_model_1, ls_method_1)

df3_gpt35 = run_verfication(df_sub, v3, 'gpt-35-1106', prompt3, system_msg, ls_df_1, ls_model_1, ls_method_1)
df3_gpt4 = run_verfication(df_sub, v3, 'gpt-4-turbo', prompt3, system_msg, ls_df_1, ls_model_1, ls_method_1)

df4_gpt35 = run_verfication(df_sub, v4, 'gpt-35-1106', prompt4, system_msg, ls_df_1, ls_model_1, ls_method_1)
df4_gpt4 = run_verfication(df_sub, v4, 'gpt-4-turbo', prompt4, system_msg, ls_df_1, ls_model_1, ls_method_1)

res1 = compare_baseline(ls_df_1, ls_model_1, ls_method_1)
res1

In [None]:
%%time
ls_df_2, ls_model_2, ls_method_2 = [], [], []

df1_gpt35 = run_verfication(df_sub, v1, 'gpt-35-1106', prompt1, system_msg, ls_df_2, ls_model_2, ls_method_2)
df1_gpt4 = run_verfication(df_sub, v1, 'gpt-4-turbo', prompt1, system_msg, ls_df_2, ls_model_2, ls_method_2)

df2_gpt35 = run_verfication(df_sub, v2, 'gpt-35-1106', prompt2, system_msg, ls_df_2, ls_model_2, ls_method_2)
df2_gpt4 = run_verfication(df_sub, v2, 'gpt-4-turbo', prompt2, system_msg, ls_df_2, ls_model_2, ls_method_2)

df3_gpt35 = run_verfication(df_sub, v3, 'gpt-35-1106', prompt3, system_msg, ls_df_2, ls_model_2, ls_method_2)
df3_gpt4 = run_verfication(df_sub, v3, 'gpt-4-turbo', prompt3, system_msg, ls_df_2, ls_model_2, ls_method_2)

df4_gpt35 = run_verfication(df_sub, v4, 'gpt-35-1106', prompt4, system_msg, ls_df_2, ls_model_2, ls_method_2)
df4_gpt4 = run_verfication(df_sub, v4, 'gpt-4-turbo', prompt4, system_msg, ls_df_2, ls_model_2, ls_method_2)

res2 = compare_baseline(ls_df_2, ls_model_2, ls_method_2)
res2

In [None]:
%%time
ls_df_3, ls_model_3, ls_method_3 = [], [], []

df1_gpt35 = run_verfication(df_sub, v1, 'gpt-35-1106', prompt1, system_msg, ls_df_3, ls_model_3, ls_method_3)
df1_gpt4 = run_verfication(df_sub, v1, 'gpt-4-turbo', prompt1, system_msg, ls_df_3, ls_model_3, ls_method_3)

df2_gpt35 = run_verfication(df_sub, v2, 'gpt-35-1106', prompt2, system_msg, ls_df_3, ls_model_3, ls_method_3)
df2_gpt4 = run_verfication(df_sub, v2, 'gpt-4-turbo', prompt2, system_msg, ls_df_3, ls_model_3, ls_method_3)

df3_gpt35 = run_verfication(df_sub, v3, 'gpt-35-1106', prompt3, system_msg, ls_df_3, ls_model_3, ls_method_3)
df3_gpt4 = run_verfication(df_sub, v3, 'gpt-4-turbo', prompt3, system_msg, ls_df_3, ls_model_3, ls_method_3)

df4_gpt35 = run_verfication(df_sub, v4, 'gpt-35-1106', prompt4, system_msg, ls_df_3, ls_model_3, ls_method_3)
df4_gpt4 = run_verfication(df_sub, v4, 'gpt-4-turbo', prompt4, system_msg, ls_df_3, ls_model_3, ls_method_3)

res3 = compare_baseline(ls_df_3, ls_model_3, ls_method_3)
res3

In [None]:
# Pool the three repetitions, leaving out the two shape columns.
r1 = res1.drop(columns=['Size', 'df.shape[1]'])
r2 = res2.drop(columns=['Size', 'df.shape[1]'])
r3 = res3.drop(columns=['Size', 'df.shape[1]'])
res_con = pd.concat([r1, r2, r3])

# compare_baseline names the prompt-variant column 'Prompt'; grouping on
# 'Method' (as before) raises KeyError because no such column exists.
# sort=False keeps the groups in the order the experiments were run.
grouped = res_con.groupby(['Prompt', 'Model'], as_index=False, sort=False)
res_mean = grouped.mean().round(decimals=2)
res_std = grouped.std().round(decimals=2)
res_max = grouped.max().round(decimals=2)
res_mean

In [None]:
# Render each metric as "mean±std"; the first two columns are the group labels and are left out
mean_text = res_mean.astype(str).iloc[:, 2:]
std_text = res_std.astype(str).iloc[:, 2:]
mean_text + '±' + std_text