# Server 4: Authorship Attribution
- New prompts
- Chunked data
- New sampler (sample a new list of authors every time and use each author as a query author, so that the number of labels = n.
    Then compute the evaluation metrics for this set of authors, and repeat this multiple times (repetitions) to compute the mean F1, etc.)

In [None]:
import torch
import openai
import random
import tiktoken
import py3langid
import numpy as np
import torch.nn.functional as F

from random import shuffle
from openai import AzureOpenAI
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

In [None]:
import os
import time
import json
import pickle
import pandas as pd
from sklearn import metrics
from huggingface_hub import login
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Restrict this process to GPU 6.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
# Never commit access tokens: read the Hugging Face token from the HF_TOKEN
# environment variable. When it is unset, login() falls back to its own
# prompt. (Any token that was previously committed here should be revoked.)
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Hugging Face ID of the checkpoint to serve; also passed as `model_name` to run_aa() in the experiment cells.
model_id = "TheBloke/Llama-2-70B-chat-GPTQ"
# Load the checkpoint with vLLM using GPTQ quantization; run_aa() reads this module-level `llm`.
llm = LLM(model=model_id, quantization='gptq')

In [None]:
def num_tokens_from_string(string, encoding_name):
    """Return how many tokens tiktoken produces for `string`.

    Despite its name, `encoding_name` is handed to
    `tiktoken.encoding_for_model`, i.e. it is a model name
    (this notebook passes "gpt-3.5-turbo").
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))
    

def eval_fn(y_test, y_pred, average='weighted', print_flag=True):
    """Score predictions with accuracy, precision, recall and F1.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to the scikit-learn precision,
            recall and F1 scorers.
        print_flag: When true, print the four scores on one line.

    Returns:
        Tuple ``(acc, precision, recall, f1)``; every value is a percentage
        rounded to two decimals.
    """
    acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    # zero_division=0 is passed to all three scorers so that labels that are
    # never predicted (e.g. the -1 "no answer" placeholder) score 0 without
    # emitting an UndefinedMetricWarning. Previously F1 was the only scorer
    # without it.
    f1 = round(metrics.f1_score(y_test, y_pred, average=average, zero_division=0) * 100, 2)
    recall = round(metrics.recall_score(y_test, y_pred, average=average, zero_division=0) * 100, 2)
    precision = round(metrics.precision_score(y_test, y_pred, average=average, zero_division=0) * 100, 2)
    if print_flag:
        print("Accuracy:", acc, "% | Precision:", precision, "% | Recall:", recall, "% | F1:", f1, "%\n")
    return acc, precision, recall, f1
    

def embed_fn(model_name, texts, baseline_type):
    """Embed a list of texts with one of the baseline encoders.

    Args:
        model_name: Hugging Face model ID; only used when
            ``baseline_type == 'bert'``.
        texts: List of strings to embed.
        baseline_type: ``'bert'`` (mean of the last hidden states of a
            Hugging Face encoder), ``'tf-idf'`` (character 4-gram TF-IDF,
            fitted on `texts` itself) or ``'ada'`` (Azure OpenAI embedding
            deployment "test_embedding").

    Returns:
        A torch tensor with one row per input text.

    Raises:
        ValueError: If `baseline_type` is not one of the supported values.
        RuntimeError: If the 'ada' baseline is requested but the
            AZURE_OPENAI_API_KEY environment variable is not set.
    """
    # Fail early with a clear message instead of an UnboundLocalError at the
    # final `return`.
    if baseline_type not in ('bert', 'tf-idf', 'ada'):
        raise ValueError(
            f"Unknown baseline_type {baseline_type!r}; expected 'bert', 'tf-idf' or 'ada'."
        )

    if baseline_type == 'bert':
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenized_texts = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(
                tokenized_texts.input_ids.to(model.device),
                tokenized_texts.attention_mask.to(model.device),
            )
        # Mean over the sequence dimension (padding positions included, as before).
        embedding = outputs.last_hidden_state.mean(dim=1)
    elif baseline_type == 'tf-idf':
        # NOTE: the vocabulary is fitted on `texts`, so two separate calls
        # produce vectors in different feature spaces.
        vectorizer = TfidfVectorizer(max_features=3000, analyzer='char', ngram_range=(4, 4))
        embedding = torch.from_numpy(vectorizer.fit_transform(texts).toarray())
    else:  # 'ada'
        # The API key must come from the environment, never from source code.
        api_key = os.environ.get("AZURE_OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("Set the AZURE_OPENAI_API_KEY environment variable to use the 'ada' baseline.")
        ada_client = AzureOpenAI(api_key=api_key, api_version="2023-05-15", azure_endpoint="https://iarpa.openai.azure.com")
        ada_response = ada_client.embeddings.create(input=texts, model="test_embedding")
        embedding = torch.Tensor([e.embedding for e in ada_response.data])
    return embedding


def run_aa_baseline(df_sub, model_name, baseline_type='bert', print_flag=True, eval_average='weighted'):
    """Evaluate an embedding-similarity baseline on every row of `df_sub`.

    Each row holds two parallel lists, ``query_text`` and ``potential_text``;
    the texts at the same position are treated as written by the same author
    (the label of query ``k`` is ``k``). Each query is assigned to the
    candidate text with the highest cosine similarity.

    Args:
        df_sub: DataFrame with list-valued ``query_text`` and
            ``potential_text`` columns.
        model_name: Forwarded to `embed_fn`.
        baseline_type: Forwarded to `embed_fn` ('bert', 'tf-idf' or 'ada').
        print_flag: Forwarded to `eval_fn`.
        eval_average: Averaging mode forwarded to `eval_fn`.

    Returns:
        ``(muti_avg, muti_std)``: two 4-tuples with the mean and standard
        deviation over rows of (accuracy, precision, recall, F1), rounded to
        two decimals.
    """
    ls_acc, ls_precision, ls_recall, ls_f1 = [], [], [], []

    for i in df_sub.index:
        ls_query_text, ls_potential_text = df_sub.loc[i, 'query_text'], df_sub.loc[i, 'potential_text']

        if baseline_type == 'tf-idf':
            # TF-IDF fits its vocabulary on the texts it is given. Embedding
            # queries and candidates in two separate calls would put them in
            # different feature spaces (possibly of different width), making
            # the dot product meaningless. Fit once on both lists and split.
            n_query = len(ls_query_text)
            embed_all = embed_fn(model_name, list(ls_query_text) + list(ls_potential_text), baseline_type)
            embed_query_texts = F.normalize(embed_all[:n_query])
            embed_potential_texts = F.normalize(embed_all[n_query:])
        else:
            embed_query_texts = F.normalize(embed_fn(model_name, ls_query_text, baseline_type))
            embed_potential_texts = F.normalize(embed_fn(model_name, ls_potential_text, baseline_type))

        # Rows are L2-normalized, so this is the cosine-similarity matrix
        # (queries x candidates).
        preds = embed_query_texts @ embed_potential_texts.T
        preds = F.softmax(preds, dim=-1)
        labels = np.arange(0, len(ls_query_text))

        acc, precision, recall, f1 = eval_fn(labels, preds.argmax(-1).numpy(), eval_average, print_flag)
        ls_acc.append(acc)
        ls_precision.append(precision)
        ls_recall.append(recall)
        ls_f1.append(f1)

    muti_avg = (round(np.mean(ls_acc), 2), round(np.mean(ls_precision), 2), round(np.mean(ls_recall), 2), round(np.mean(ls_f1), 2))
    muti_std = (round(np.std(ls_acc), 2), round(np.std(ls_precision), 2), round(np.std(ls_recall), 2), round(np.std(ls_f1), 2))
    return muti_avg, muti_std

## Data prep

In [None]:
# Load the blog corpus and discard the metadata columns listed below.
df = pd.read_csv("/data/baixiang/dataset/blogtext.csv")
df = df.drop(columns=['gender', 'age', 'topic', 'sign', 'date'])
df.shape

In [None]:
# Display every row whose text occurs more than once, sorted so that
# identical texts sit next to each other (nothing is removed in this cell).
df[df.duplicated(subset=['text'], keep=False)].sort_values(by='text')

In [None]:
# Keep the first occurrence of each distinct text and renumber the rows.
print('Before removing duplicates, df.shape:', df.shape)
df = df.drop_duplicates(subset='text', ignore_index=True)
print('New df.shape:', df.shape)

In [None]:
%%time
print(f"{df.shape[0]:,}")
# Flag the texts that py3langid classifies as English. A boolean mask is used
# instead of a temporary 'lang' column, so no in-place drop on a filtered
# frame is needed afterwards.
is_english = df['text'].apply(lambda x: py3langid.classify(x)[0] == 'en')
# Report an actual percentage (the label says "%"), not a raw fraction.
print('% of English text:', f"{is_english.mean() * 100:.2f}")

df = df[is_english]
print(f"{df.shape[0]:,}")

In [None]:
# Spot check: token count of ten randomly drawn pairs of texts, concatenated.
for _ in range(10):
    text1, text2 = df.sample(2)["text"].tolist()
    print(num_tokens_from_string(text1 + text2, "gpt-3.5-turbo"))

In [None]:
%%time
# Keep only the texts with fewer than 512 tokens.
df = df.loc[
    df["text"].apply(lambda text: num_tokens_from_string(text, "gpt-3.5-turbo") < 512)
]
print(f"{df.shape[0]:,}")

In [None]:
%%time
# Keep only the texts with more than 56 tokens.
df = df.loc[
    df["text"].apply(lambda text: num_tokens_from_string(text, "gpt-3.5-turbo") > 56)
]
print(f"{df.shape[0]:,}")

In [None]:
# Keep only authors that still have at least two texts, since the sampler
# draws two texts per author.
v = df["id"].value_counts()
df = df.loc[df["id"].map(v).ge(2)]
print('# unique authors:', df.id.nunique())
print('New df.shape:', df.shape)

In [None]:
def sampler_aa_fn_pro(df, n, reps):
    """Build `reps` attribution problems with `n` distinct authors each.

    For every repetition, `n` authors are drawn without replacement from the
    authors not used by earlier repetitions, and two different texts are
    sampled per author: one goes into ``query_text`` and the other into
    ``potential_text`` at the same list position. Every author thus serves
    as a query author and the number of labels equals `n`. Evaluation metrics
    are computed per repetition and averaged by the caller.

    Args:
        df: DataFrame with an ``id`` (author) column and a ``text`` column;
            every sampled author needs at least two texts.
        n: Number of candidate authors per repetition.
        reps: Number of repetitions.

    Returns:
        DataFrame with one row per repetition and two list-valued columns,
        ``query_text`` and ``potential_text``.

    Raises:
        ValueError: If `n * reps` exceeds the number of unique authors, or if
            a sampled author has fewer than two texts.
    """
    ls_unique_author = df.id.unique().tolist()
    # Authors are never reused across repetitions, so n * reps of them are
    # needed. Check up front rather than failing in a later repetition.
    if n > 0 and reps > 0 and n * reps > len(ls_unique_author):
        raise ValueError(
            f"Need n * reps = {n * reps} unique authors but only {len(ls_unique_author)} are available."
        )

    dict_to_df = []
    for _ in range(reps):
        candidate_authors = random.sample(ls_unique_author, n)
        # Set membership keeps the removal linear in the size of the pool.
        chosen = set(candidate_authors)
        ls_unique_author = [e for e in ls_unique_author if e not in chosen]
        ls_queries, ls_potential_texts = [], []

        for author_id in candidate_authors:
            # Texts at the same index of the two lists come from the same author.
            text, text_same_author = df.loc[df.id == author_id, "text"].sample(2)
            ls_queries.append(text)
            ls_potential_texts.append(text_same_author)

        dict_to_df.append({"query_text": ls_queries, "potential_text": ls_potential_texts})

    df_sub = pd.DataFrame(dict_to_df)
    return df_sub

## Exp

In [None]:
import re
def run_aa(df, method, model_name, prompt_input, system_msg, ls_df, ls_model, ls_method, n_eval=5):
    """Run LLM authorship attribution on a subset of queries of every row.

    For each row of `df`, `n_eval` query texts are drawn with a fixed seed;
    each is sent to the module-level `llm` (using the module-level
    `sampling_params`) together with all candidate texts of that row. The
    last integer found in the model output is taken as the predicted
    candidate index; -1 means no integer was found.

    Args:
        df: DataFrame with list-valued ``query_text`` and ``potential_text``
            columns; texts at the same list position share an author.
        method: Name of the prompt variant (used for logging/bookkeeping).
        model_name: Name of the model (used for logging/bookkeeping).
        prompt_input: Task instruction inserted into the prompt.
        system_msg: System message inserted into the prompt.
        ls_df: List that the result DataFrame is appended to (mutated).
        ls_model: List that `model_name` is appended to (mutated).
        ls_method: List that `method` is appended to (mutated).
        n_eval: Number of queries evaluated per row.

    Returns:
        DataFrame with one row per evaluated query and the columns
        ``analysis``, ``answer``, ``query_text``, ``example_texts``,
        ``tokens`` and ``label``.
    """
    start_time = time.time()
    ls_df_reps = []
    print("\n++++++++++ ", method, model_name, " ++++++++++")

    for i in df.index:
        ls_reps = []
        ls_query_text, ls_potential_text = df.loc[i, 'query_text'], df.loc[i, 'potential_text']

        # select a subset for evaluation (e.g, n_eval out of 10).
        # A private generator seeded with 0 draws exactly what
        # `random.seed(0); random.sample(...)` would, but without resetting
        # the global RNG state as a side effect. Keeping (index, text) pairs
        # instead of a text -> index dict keeps labels correct even when two
        # query texts are identical.
        sampled_queries = random.Random(0).sample(list(enumerate(ls_query_text)), n_eval)

        # The candidate set is the same for every query of this row.
        example_texts = json.dumps(dict(enumerate(ls_potential_text)))

        for label, query_text in sampled_queries:
            # mistral
            # prompt = f"""<s> [INST] {system_msg} {prompt_input} The input texts are delimited with triple backticks. ```\n\nQuery text: {query_text} \n\nTexts from potential authors: {example_texts}\n\n```[/INST]"""
            # llama
            prompt = f"""<s>[INST] <<SYS>>\n{system_msg}\n<</SYS>>\n\n{prompt_input} The input texts are delimited with triple backticks. ```\n\nQuery text: {query_text} \n\nTexts from potential authors: {example_texts}\n\n```[/INST]"""

            raw_response = llm.generate(prompt, sampling_params)
            response_str = raw_response[0].outputs[0].text.strip()

            # The last run of digits in the output is taken as the answer;
            # -1 marks "no usable answer".
            ls_possible_ans = re.findall(r'\d+', response_str)
            answer = ls_possible_ans[-1] if ls_possible_ans else -1
            print('\n++++++++++ Raw response:\n', response_str, '\nModel prediction:', answer, 'Label:', label)

            ls_reps.append({
                'analysis': response_str,
                'answer': answer,
                'query_text': query_text,
                'example_texts': example_texts,
                'tokens': len(raw_response[0].prompt_token_ids),
                'label': label,
            })

        df_reps = pd.DataFrame(ls_reps)
        # Answers are digit strings or -1; anything non-numeric becomes -1.
        df_reps['answer'] = pd.to_numeric(df_reps['answer'], errors='coerce')
        df_reps['answer'] = df_reps['answer'].fillna(-1)
        ls_df_reps.append(df_reps)

    # Concatenate once at the end instead of growing a frame inside the loop.
    df_res_all = pd.concat(ls_df_reps).reset_index(drop=True) if ls_df_reps else pd.DataFrame()

    ls_df.append(df_res_all)
    ls_method.append(method)
    ls_model.append(model_name)
    print("--- Execution Time: %s seconds ---" % round(time.time() - start_time, 2))
    return df_res_all

In [None]:
# Baseline display name -> model identifier handed to run_aa_baseline().
dict_baseline = {
    'TF-IDF': 'TF-IDF',
    'BERT': 'bert-base-uncased',
    'RoBERTa': 'roberta-base',
    'ELECTRA': 'google/electra-base-discriminator',
    'DeBERTa': 'microsoft/deberta-base',
    'Ada': 'ada v2',
}
# Baseline display name -> embedding back-end (`baseline_type` of embed_fn()).
dict_embed_type = {
    'TF-IDF': 'tf-idf',
    'BERT': 'bert',
    'RoBERTa': 'bert',
    'ELECTRA': 'bert',
    'DeBERTa': 'bert',
    'Ada': 'ada',
}

def compare_baseline_mod(df_sub, ls_df, ls_model, ls_method, n_eval=5, std_flag=False, n_baselines=0):
    """Tabulate the scores of the embedding baselines and the LLM runs.

    Args:
        df_sub: Sampled attribution problems; only used for the baselines.
        ls_df: Result DataFrames produced by `run_aa`.
        ls_model: Model name of each entry of `ls_df`.
        ls_method: Prompt/method name of each entry of `ls_df`.
        n_eval: Queries per repetition; must match the value used in `run_aa`.
        std_flag: When true, also return the table of standard deviations.
        n_baselines: How many entries of `dict_baseline` (in insertion order)
            to evaluate. The default 0 skips the baselines, which is what the
            previously hard-coded ``[:0]`` slice did; ``None`` runs all.

    Returns:
        `res_avg`, or ``(res_avg, res_std)`` when `std_flag` is true. Both
        are DataFrames with the columns Prompt, Model, Accuracy, Precision,
        Recall, F1 and Unsure (number of -1 answers; 0 for baselines in the
        mean table, None for LLM rows in the std table).
    """
    ls_res_avg, ls_res_std = [], []

    for key, val in list(dict_baseline.items())[:n_baselines]:
        muti_avg, muti_std = run_aa_baseline(df_sub, val, dict_embed_type[key], print_flag=False)
        ls_res_avg.append((key, val)+muti_avg+(0,))
        ls_res_std.append((key, val)+muti_std+(0,))

    for i, df_tmp in enumerate(ls_df):
        muti_avg, muti_std = eval_all_fn(df_tmp, n_eval)
        # Count the answers for which no prediction could be extracted (-1).
        n_unsure = int((df_tmp['answer'] == -1).sum())

        ls_res_avg.append((ls_method[i], ls_model[i])+muti_avg+(n_unsure,))
        ls_res_std.append((ls_method[i], ls_model[i])+muti_std+(None,))

    columns = ['Prompt', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Unsure']
    res_avg = pd.DataFrame(ls_res_avg, columns=columns)
    res_std = pd.DataFrame(ls_res_std, columns=columns)
    if std_flag:
        return res_avg, res_std
    return res_avg


def eval_all_fn(df_res_all, n_eval):
    """Score a result frame that stacks several repetitions.

    The frame is cut into consecutive chunks of `n_eval` rows (one chunk per
    repetition); each chunk is scored with `eval_fn` using weighted averaging,
    and the per-chunk scores are summarized. Null / -1 answers simply count as
    wrong predictions. `n_eval` must be the value that was used in run_aa().

    Returns:
        ``(muti_avg, muti_std)``: 4-tuples holding the mean and the standard
        deviation of (accuracy, precision, recall, F1), rounded to two
        decimals.
    """
    scores = {"acc": [], "precision": [], "recall": [], "f1": []}

    for start in range(0, len(df_res_all.index), n_eval):
        chunk = df_res_all.iloc[start:start + n_eval]
        results = eval_fn(chunk["label"], chunk["answer"], average='weighted', print_flag=False)
        # eval_fn returns (acc, precision, recall, f1), the key order of `scores`.
        for name, value in zip(scores, results):
            scores[name].append(value)

    muti_avg = tuple(round(np.mean(values), 2) for values in scores.values())
    muti_std = tuple(round(np.std(values), 2) for values in scores.values())
    return muti_avg, muti_std

In [None]:
# Decoding settings read by run_aa(): temperature 0.0, top_p 1.0, and at most 4096 generated tokens per request.
sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=4096)
# Bare expression so the notebook displays the configured parameters.
sampling_params

## n = 10

In [None]:
# Model display names; the experiment cells below pass `model_id` as the model name instead.
m1, m2 = "GPT-3.5 Turbo", "GPT-4 Turbo"
# Names of the four prompt variants, in the same order as prompt1..prompt4.
v1, v2, v3, v4 = 'no_guidance', 'little_guidance', 'grammar', 'LIP'

# prompt1 is the bare task description; prompt2..prompt4 append increasingly specific guidance to it.
prompt1 = "Given a set of texts with known authors and a query text, determine the author of the query text. "
prompt2 = prompt1+"Do not consider topic differences. "
prompt3 = prompt1+"Focus on grammatical styles. "
prompt4 = prompt1+"Analyze the writing styles of the input texts, disregarding the differences in topic and content. Focus on linguistic features such as phrasal verbs, modal verbs, punctuation, rare words, affixes, quantities, humor, sarcasm, typographical errors, and misspellings. "
# System message requesting a JSON reply; run_aa() does not parse the JSON and takes the last integer in the reply as the answer.
system_msg = """Always respond with a JSON object including two key elements:
{
  "analysis": Reasoning behind your answer.
  "answer": The query text's author ID.
}"""

In [None]:
from ast import literal_eval
# The two text columns are stored in the CSV as Python list literals; parse them back into lists.
df_10 = pd.read_csv(
    "llm-aa-res/blog_n10_reps3.csv",
    converters=dict.fromkeys(("query_text", "potential_text"), literal_eval),
)
df_10.shape, len(df_10.loc[0, 'potential_text'])

In [None]:
%%time
# Run the four prompt variants on the blog problems; run_aa() appends each
# result to the three bookkeeping lists, which the summary table then reads.
ls_df_10, ls_model_10, ls_method_10 = [], [], []
df1, df2, df3, df4 = [
    run_aa(df_10, variant, model_id, prompt, system_msg, ls_df_10, ls_model_10, ls_method_10)
    for variant, prompt in zip((v1, v2, v3, v4), (prompt1, prompt2, prompt3, prompt4))
]
compare_baseline_mod(df_10, ls_df_10, ls_model_10, ls_method_10)

In [None]:
%%time
# Same experiment on the email problems (the name `df_10` is reused).
df_10 = pd.read_csv(
    "llm-aa-res/email_n10_reps3.csv",
    converters=dict.fromkeys(("query_text", "potential_text"), literal_eval),
)
print(df_10.shape, len(df_10.loc[0, 'potential_text']))

# Fresh bookkeeping lists so the summary only contains the email runs.
ls_df_10, ls_model_10, ls_method_10 = [], [], []
df1, df2, df3, df4 = [
    run_aa(df_10, variant, model_id, prompt, system_msg, ls_df_10, ls_model_10, ls_method_10)
    for variant, prompt in zip((v1, v2, v3, v4), (prompt1, prompt2, prompt3, prompt4))
]
compare_baseline_mod(df_10, ls_df_10, ls_model_10, ls_method_10)