## Llama 2 70B
https://docs.vllm.ai/en/latest/getting_started/quickstart.html

https://huggingface.co/models

In [None]:
import os
import time
import json
import pickle
import pandas as pd
from sklearn import metrics
from huggingface_hub import login
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Expose only CUDA device 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Prefer a token supplied through the HF_TOKEN environment variable so the
# secret never has to be pasted into (and saved with) the notebook; when the
# variable is unset or empty, fall back to the placeholder, which must then be
# replaced by hand.
login(os.environ.get("HF_TOKEN") or "replace_this")

In [None]:
# Alternative AWQ checkpoints, kept for reference only:
# model_id = "TheBloke/Llama-2-70B-Chat-AWQ"  # AWQ takes 10 more minutes and doesn't improve performance
# llm = LLM(model="TheBloke/Llama-2-13b-Chat-AWQ", quantization="AWQ")

# GPTQ-quantized Llama 2 70B chat checkpoint. `llm` is the module-level engine
# that run_verfication calls for every text pair.
model_id = "TheBloke/Llama-2-70B-chat-GPTQ"
llm = LLM(model=model_id, quantization='gptq')

In [None]:
def eval_fn(y_test, y_pred, average='binary', print_flag=True):
    """Score predictions against labels and report the results as percentages.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to the F1, recall and precision
            scorers of ``sklearn.metrics``.
        print_flag: When true, print a one-line summary of the four metrics.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, each multiplied by 100
        and rounded to two decimals.
    """
    def _as_percent(score):
        return round(score * 100, 2)

    acc = _as_percent(metrics.accuracy_score(y_test, y_pred))
    f1 = _as_percent(metrics.f1_score(y_test, y_pred, average=average))
    recall = _as_percent(metrics.recall_score(y_test, y_pred, average=average))
    precision = _as_percent(metrics.precision_score(y_test, y_pred, average=average))

    if print_flag:
        print("Accuracy:", acc, "% | Precision:", precision, "% | Recall:", recall, "% | F1:", f1, "%\n")
    return acc, precision, recall, f1


def compare_baseline(exp_df_ls, exp_model_ls, exp_method_ls):
    """Build a comparison table with one row of metrics per experiment.

    Args:
        exp_df_ls: Result DataFrames; each needs a ``same`` (label) and an
            ``answer`` (prediction) column.
        exp_model_ls: Model name for each DataFrame, aligned by position.
        exp_method_ls: Prompt/method name for each DataFrame, aligned by
            position.

    Returns:
        DataFrame with the prompt, the model, the four metrics returned by
        ``eval_fn`` and the two components of each result frame's shape.
    """
    header = ['Prompt', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Size', 'df.shape[1]']
    rows = [
        (
            exp_method_ls[pos],
            exp_model_ls[pos],
            *eval_fn(frame["same"], frame["answer"], print_flag=False),
            *frame.shape,
        )
        for pos, frame in enumerate(exp_df_ls)
    ]
    return pd.DataFrame(rows, columns=header)


def num_tokens_from_string(texts, model_id):
    """Return the length of the tokenizer's ``input_ids`` for ``texts``.

    The tokenizer for ``model_id`` is loaded anew on every call.

    NOTE(review): for a single string this is presumably the token count; for
    a list of strings ``input_ids`` likely holds one entry per text, so the
    result would be the number of texts instead - confirm before relying on it.
    """
    encoded = AutoTokenizer.from_pretrained(model_id)(texts)
    return len(encoded.input_ids)

In [None]:
def _parse_answer(response_str, fallback):
    """Extract the boolean verdict from the raw model output.

    The explicit ``answer`` field is preferred so that the words "true" or
    "false" appearing inside the free-text analysis cannot flip the verdict.
    If no such field is found, the original substring heuristic is used
    ("true" wins over "false"); if neither word occurs, ``fallback`` is
    returned.
    """
    import re  # local import: keeps this cell independent of the import cell

    field = re.search(
        r"""["']?answer["']?\s*:\s*["']?(true|false)\b""",
        response_str,
        flags=re.IGNORECASE,
    )
    if field:
        return field.group(1).lower() == 'true'
    if 'True' in response_str or 'true' in response_str:
        return True
    if 'False' in response_str or 'false' in response_str:
        return False
    return fallback


def run_verfication(df, method, model_name, prompt_prefix, system_msg, ls_df, ls_model, ls_method, prompt_postfix="", print_flag=False):
    """Ask the model, pair by pair, whether two texts share an author.

    Relies on the module-level ``llm`` engine and ``sampling_params`` object.

    Args:
        df: DataFrame with the columns ``aut_id1``, ``aut_id2``, ``text1`` and
            ``text2``; one row per text pair.
        method: Label of the prompt variant (printed and appended to
            ``ls_method``).
        model_name: Label of the model (printed and appended to ``ls_model``).
        prompt_prefix: Instruction inserted before the two texts.
        system_msg: Text placed in the ``<<SYS>>`` section of the prompt.
        ls_df: List that receives the result DataFrame (mutated in place).
        ls_model: List that receives ``model_name`` (mutated in place).
        ls_method: List that receives ``method`` (mutated in place).
        prompt_postfix: Optional text inserted after the two texts.
        print_flag: When true, print every raw response next to its label.

    Returns:
        DataFrame with one row per pair and the columns ``analysis``,
        ``answer``, ``text1``, ``text2``, ``author_id1``, ``author_id2``,
        ``tokens`` (number of prompt tokens) and ``same`` (ground truth).
        Metrics and the elapsed time are printed as a side effect.
    """
    records = []
    start_time = time.time()
    print("\n++++++++++ ", method, model_name, " ++++++++++")

    for i in df.index:
        aut_id1, aut_id2 = df.loc[i, 'aut_id1'], df.loc[i, 'aut_id2']
        text1, text2 = df.loc[i, 'text1'].strip(), df.loc[i, 'text2'].strip()
        same_author = aut_id1 == aut_id2

        # Hand-built [INST] / <<SYS>> chat prompt.
        # NOTE(review): the tokenizer may prepend its own BOS token in
        # addition to the literal <s> - confirm.
        prompt = f"""<s>[INST] <<SYS>>\n{system_msg}\n<</SYS>>\n\n{prompt_prefix} The input texts (Text 1 and Text 2) are delimited with triple backticks. ```Text 1: {text1}, \n\nText 2: {text2}```\n\n{prompt_postfix}[/INST]"""

        raw_response = llm.generate(prompt, sampling_params)
        response_str = raw_response[0].outputs[0].text.strip()

        # An unparseable response is deliberately scored as a wrong answer.
        answer = _parse_answer(response_str, fallback=not same_author)
        if print_flag:
            print('Response:\n', response_str, '\nLabel:', same_author)

        records.append({
            'analysis': response_str,
            'answer': answer,
            'text1': text1,
            'text2': text2,
            'author_id1': aut_id1,
            'author_id2': aut_id2,
            'tokens': len(raw_response[0].prompt_token_ids),  # Number of input tokens
        })

    df_res = pd.DataFrame(records)
    ls_df.append(df_res)
    ls_method.append(method)
    ls_model.append(model_name)
    # df_res is the same object that was appended above, so the columns added
    # below are visible through ls_df as well.
    df_res['same'] = df_res.author_id1 == df_res.author_id2
    df_res["answer"] = df_res["answer"].astype('bool')
    eval_fn(df_res["same"], df_res["answer"])
    print("--- Execution Time: %s seconds ---" % round(time.time() - start_time, 2))
    return df_res

In [None]:
# Decoding settings read as a module-level global by run_verfication for every
# llm.generate call.
# NOTE(review): temperature=0.0 presumably means greedy decoding, so the
# repeated runs below should produce near-identical outputs - confirm against
# the vLLM documentation.
sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=4096)
sampling_params  # echo the object as the cell output

In [None]:
# Labels of the four prompt variants; they are passed as `method` to
# run_verfication and end up in the 'Prompt' column built by compare_baseline.
v1, v2, v3, v4 = 'no_guidance', 'little_guidance', 'grammar', 'LIP'

In [None]:
# System message placed in the <<SYS>> section of every prompt built by
# run_verfication; it asks for an analysis plus a True/False answer.
system_msg = """
Respond with a JSON object including two key elements:
{
  "analysis": Reasoning behind your answer.
  "answer":  A boolean (True/False) answer.
}
"""
# Instruction prefixes, used in the run cells together with the labels v1-v4.
# prompt2-prompt4 each extend prompt1 with additional guidance.
prompt1 = "Verify if the input texts were written by the same author. "
prompt2 = prompt1 + "Do not consider topic differences. "
prompt3 = prompt1 + "Focus on grammatical styles. "
prompt4 = prompt1 + "Analyze the writing styles of the input texts, disregarding the differences in topic and content. Reasoning based on linguistic features such as phrasal verbs, modal verbs, punctuation, rare words, affixes, quantities, humor, sarcasm, typographical errors, and misspellings. "

### Blog

In [None]:
# Load the blog text pairs. run_verfication reads the columns aut_id1, aut_id2,
# text1 and text2 from this frame.
df_sub_blog = pd.read_csv("llm-verify-res/df_sub_blog_30.csv")
print(df_sub_blog.shape)
print(model_id)

In [None]:
%%time
# Blog data, run 1: evaluate the four prompt variants with the same system
# message. Each call appends its result frame to ls_llama_blog_1 and its
# labels to ls_model / ls_method, which are reset at the start of this cell.
ls_llama_blog_1, ls_model, ls_method = [], [], []
df1 = run_verfication(df_sub_blog, v1, 'llama2-70b', prompt1, system_msg, ls_llama_blog_1, ls_model, ls_method)
df2 = run_verfication(df_sub_blog, v2, 'llama2-70b', prompt2, system_msg, ls_llama_blog_1, ls_model, ls_method)
df3 = run_verfication(df_sub_blog, v3, 'llama2-70b', prompt3, system_msg, ls_llama_blog_1, ls_model, ls_method)
df4 = run_verfication(df_sub_blog, v4, 'llama2-70b', prompt4, system_msg, ls_llama_blog_1, ls_model, ls_method)
# Tabulate the metrics of the four variants (shown as the cell output).
compare_baseline(ls_llama_blog_1, ls_model, ls_method)

In [None]:
%%time
# Blog data, run 2: same four prompt variants as run 1, collected in
# ls_llama_blog_2. ls_model / ls_method are reset and df1-df4 are overwritten.
ls_llama_blog_2, ls_model, ls_method = [], [], []
df1 = run_verfication(df_sub_blog, v1, 'llama2-70b', prompt1, system_msg, ls_llama_blog_2, ls_model, ls_method)
df2 = run_verfication(df_sub_blog, v2, 'llama2-70b', prompt2, system_msg, ls_llama_blog_2, ls_model, ls_method)
df3 = run_verfication(df_sub_blog, v3, 'llama2-70b', prompt3, system_msg, ls_llama_blog_2, ls_model, ls_method)
df4 = run_verfication(df_sub_blog, v4, 'llama2-70b', prompt4, system_msg, ls_llama_blog_2, ls_model, ls_method)
# Tabulate the metrics of the four variants (shown as the cell output).
compare_baseline(ls_llama_blog_2, ls_model, ls_method)

In [None]:
%%time
# Blog data, run 3: same four prompt variants as runs 1 and 2, collected in
# ls_llama_blog_3. ls_model / ls_method are reset and df1-df4 are overwritten.
ls_llama_blog_3, ls_model, ls_method = [], [], []
df1 = run_verfication(df_sub_blog, v1, 'llama2-70b', prompt1, system_msg, ls_llama_blog_3, ls_model, ls_method)
df2 = run_verfication(df_sub_blog, v2, 'llama2-70b', prompt2, system_msg, ls_llama_blog_3, ls_model, ls_method)
df3 = run_verfication(df_sub_blog, v3, 'llama2-70b', prompt3, system_msg, ls_llama_blog_3, ls_model, ls_method)
df4 = run_verfication(df_sub_blog, v4, 'llama2-70b', prompt4, system_msg, ls_llama_blog_3, ls_model, ls_method)
# Tabulate the metrics of the four variants (shown as the cell output).
compare_baseline(ls_llama_blog_3, ls_model, ls_method)

In [None]:
# Persist the blog results: each pickle holds one run's list of result
# DataFrames (one frame per prompt variant, in the order they were run).
with open("llm-verify-res/llama_70b_blog_1.pkl", "wb") as f:
    pickle.dump(ls_llama_blog_1, f)
with open("llm-verify-res/llama_70b_blog_2.pkl", "wb") as f:
    pickle.dump(ls_llama_blog_2, f)
with open("llm-verify-res/llama_70b_blog_3.pkl", "wb") as f:
    pickle.dump(ls_llama_blog_3, f)

### Email

In [None]:
# Load the email text pairs. run_verfication reads the columns aut_id1,
# aut_id2, text1 and text2 from this frame.
df_sub_email = pd.read_csv("llm-verify-res/df_sub_email_30.csv")
print(df_sub_email.shape)

In [None]:
%%time
# Email data, run 1: evaluate the four prompt variants with the same system
# message. Each call appends its result frame to ls_email_1 and its labels to
# ls_model / ls_method, which are reset at the start of this cell.
ls_email_1, ls_model, ls_method = [], [], []
df1 = run_verfication(df_sub_email, v1, 'llama2-70b', prompt1, system_msg, ls_email_1, ls_model, ls_method)
df2 = run_verfication(df_sub_email, v2, 'llama2-70b', prompt2, system_msg, ls_email_1, ls_model, ls_method)
df3 = run_verfication(df_sub_email, v3, 'llama2-70b', prompt3, system_msg, ls_email_1, ls_model, ls_method)
df4 = run_verfication(df_sub_email, v4, 'llama2-70b', prompt4, system_msg, ls_email_1, ls_model, ls_method)
# Tabulate the metrics of the four variants (shown as the cell output).
compare_baseline(ls_email_1, ls_model, ls_method)

In [None]:
%%time
# Email data, run 2: same four prompt variants as run 1, collected in
# ls_email_2. ls_model / ls_method are reset and df1-df4 are overwritten.
ls_email_2, ls_model, ls_method = [], [], []
df1 = run_verfication(df_sub_email, v1, 'llama2-70b', prompt1, system_msg, ls_email_2, ls_model, ls_method)
df2 = run_verfication(df_sub_email, v2, 'llama2-70b', prompt2, system_msg, ls_email_2, ls_model, ls_method)
df3 = run_verfication(df_sub_email, v3, 'llama2-70b', prompt3, system_msg, ls_email_2, ls_model, ls_method)
df4 = run_verfication(df_sub_email, v4, 'llama2-70b', prompt4, system_msg, ls_email_2, ls_model, ls_method)
# Tabulate the metrics of the four variants (shown as the cell output).
compare_baseline(ls_email_2, ls_model, ls_method)

In [None]:
%%time
# Email data, run 3: same four prompt variants as runs 1 and 2, collected in
# ls_email_3. ls_model / ls_method are reset and df1-df4 are overwritten.
ls_email_3, ls_model, ls_method = [], [], []
df1 = run_verfication(df_sub_email, v1, 'llama2-70b', prompt1, system_msg, ls_email_3, ls_model, ls_method)
df2 = run_verfication(df_sub_email, v2, 'llama2-70b', prompt2, system_msg, ls_email_3, ls_model, ls_method)
df3 = run_verfication(df_sub_email, v3, 'llama2-70b', prompt3, system_msg, ls_email_3, ls_model, ls_method)
df4 = run_verfication(df_sub_email, v4, 'llama2-70b', prompt4, system_msg, ls_email_3, ls_model, ls_method)
# Tabulate the metrics of the four variants (shown as the cell output).
compare_baseline(ls_email_3, ls_model, ls_method)

In [None]:
# Persist the email results: each pickle holds one run's list of result
# DataFrames (one frame per prompt variant, in the order they were run).
with open("llm-verify-res/llama_70b_email_1.pkl", "wb") as f:
    pickle.dump(ls_email_1, f)
with open("llm-verify-res/llama_70b_email_2.pkl", "wb") as f:
    pickle.dump(ls_email_2, f)
with open("llm-verify-res/llama_70b_email_3.pkl", "wb") as f:
    pickle.dump(ls_email_3, f)