# Model Evaluation
Dev Note: If `display` raises an error for a Pandas DataFrame, use `print` instead. `display` sometimes fails because of certain imported libraries.


## Environment Setup

In [None]:
if 'google.colab' in str(get_ipython()):
    # authorize & import/mount colab/google drive
    from google.colab import output
    from google.colab import drive
    from google.colab import auth
    auth.authenticate_user()
    drive.mount('/content/gdrive')

    # install libraries
    !pip install --upgrade openai
    !pip install --upgrade numba
    !pip install transformers
    !pip install sentence_transformers
    !pip install unidecode
    !pip install bertopic
    !pip install unidecode
    !pip install pandas==1.1.5

    # clear output
    output.clear()

In [None]:
import json
import os 
import openai
import nltk
import torch
import string
import gspread
import random
import pickle
import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import Counter
from datetime import datetime
from oauth2client.client import GoogleCredentials
from io import BytesIO
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from unidecode import unidecode
from typing import List
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# NOTE: The first time this notebook is run on a new machine through Colab,
# importing bertopic raises a Python 3.7 error. Workaround: remove the bertopic
# import, run the cell, paste the bertopic import back into the cell, and then
# run the cell again. The root cause is unknown.
# Note: Removed; not required for model evaluation...
# from bertopic import BERTopic

# OpenAI API key, read from the `openai_api` environment variable (raises
# KeyError when it is not set).
# NOTE(review): nothing visible in this notebook hands `key` to the openai
# client -- confirm where the API key is actually configured.
key = os.environ['openai_api']

# English stop words, used when pre-processing questions for topic matching.
nltk.download('stopwords')
stop = set(stopwords.words('english'))

# Repository root on the mounted Google Drive (per-user locations):
# Alice: /content/drive/MyDrive/w210_Capstone_Project_Fall2021/Repo/
# Tim: /content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo
ROOT_DIR = "/content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo"

# Output folders for evaluation reports and pickled predictions.
EVALUATION_EXPORT_DIR = ROOT_DIR + "/memorai/evaluation/reports"
PREDICTIONS_SAVE_DIR = ROOT_DIR + "/memorai/evaluation/predictions"

# Google Sheets holding the test set and the list of models to evaluate.
TEST_SET_SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1xjH4mlfoLqBdQ_WNUYyMrmTmbTm7VA8KiKTjBqom5oM/edit#gid=1838923989"
MODELS_SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1zzzGBn7oyFMDQYQo19Xmec1-qDRFmAB7xsULVTGpy2w/edit#gid=1164311185"

# Relative paths used later in the notebook resolve against the repo root.
os.chdir(ROOT_DIR)

SCORE_PRECISION = 4

# Default generation settings for the fine-tuned "alex" model.
DEFAULT_ALEX_CONFIGS = dict(
    engine_name="curie:ft-brainmonkey-foundation-2021-10-26-08-56-48",
    temp=0.1,
    pres_pen=1,
    freq_pen=1,
    max_tokens=512,
    stability_thd=0.3,
    echo=False,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Test Set & Model List Prep

In [None]:
def load_gsheet(url: str, tab: str) -> pd.DataFrame:
    """
    Fetch one tab of a Google spreadsheet and return it as a Pandas DF.

    Authorizes gspread with the application-default Google credentials,
    opens the spreadsheet at `url`, and reads every value of worksheet
    `tab`. The first row supplies the column names; all remaining rows
    become the data rows.
    """
    credentials = GoogleCredentials.get_application_default()
    client = gspread.authorize(credentials)
    rows = client.open_by_url(url).worksheet(tab).get_all_values()

    # first row is the header; what is left in `rows` is the data
    header = rows.pop(0)
    return pd.DataFrame(rows, columns=header)

# load test set: keep the evaluation columns and add empty result columns
TEST_DF_ALEX = load_gsheet(TEST_SET_SPREADSHEET_URL, "alex")
TEST_DF_ALEX = TEST_DF_ALEX.loc[:, ["Set Symbol", "Question", "Answer"]]
TEST_DF_ALEX = TEST_DF_ALEX.assign(
    Prediction=None,
    F1=None,
    Fluency=None,
    Relevancy=None)
# score columns are cast to float ('f') so they can be averaged later
TEST_DF_ALEX = TEST_DF_ALEX.astype(dict.fromkeys(["F1", "Fluency", "Relevancy"], 'f'))

# load eval model configs
# NOTE(review): the random query suffix looks like a cache-buster -- confirm.
EVAL_MODELS = load_gsheet(f"{MODELS_SPREADSHEET_URL}?{random.randint(0, 9999999999)}", "test")

## GPT-3 Inference

Note: Make sure to set `echo=False` during evaluation so that the question is not repeated in the completion.


In [None]:
def get_model():
    """
    Load and return the saved BERTopic model 'bertopic_trained_alex_1026'.

    The original note says the model is retrieved from EC2; this function
    itself only calls `BERTopic.load` on that local path.

    NOTE(review): `BERTopic` must be in scope. The `from bertopic import
    BERTopic` line in the import cell is commented out, so calling this
    raises NameError until that import is restored.
    """
    return BERTopic.load('bertopic_trained_alex_1026')

def topic_similarity(question):
    """
    Score how well the user's question matches the topics of the trained
    BERTopic model.

    The question is tokenized with gensim's `simple_preprocess`, stop words
    are dropped, and the remaining words are passed to `find_topics`
    (top 5). The first similarity value returned is the score.
    """
    bertopic_model = get_model()

    # tokenize, then drop English stop words
    tokens = simple_preprocess(question, deacc=True, max_len=512)
    kept_words = [word for word in tokens if word not in stop]

    _, similarity = bertopic_model.find_topics(" ".join(kept_words), top_n=5)
    return similarity[0]

def content_filtering(answer: str) -> str:
    """
    Classify a piece of text with the OpenAI content filter engine.

    Used to screen both incoming questions and GPT-3 completions before
    they are returned to the user.

    Args:
        answer: the text to classify.

    Returns:
        The label as a *string* (the text of the first choice), where
        "0" = safe, "1" = sensitive, "2" = unsafe. Callers compare the
        result against "2".
    """
    content_filter = openai.Completion.create(
        engine="content-filter-alpha",
        prompt= "<|endoftext|>"+answer+"\n--\nLabel:",
        temperature=0,
        max_tokens=1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        # log-probabilities are requested but not inspected here
        logprobs=10
    )
    content_rate = content_filter['choices'][0]["text"]
    return content_rate

def completion(
            question: str, 
            configs: dict = DEFAULT_ALEX_CONFIGS) -> str:
    """
    Generate a completion for the question using the given params.

    Args:
        question: the prompt sent to the model.
        configs: generation settings; the keys read here are
            'engine_name', 'temp', 'max_tokens', 'freq_pen', 'pres_pen'
            and 'echo'.

    Returns:
        The text of the first choice in the API response.
    """
    # NOTE: Turn off "echo" during evaluation!
    answer_parse = openai.Completion.create(
                                    model = configs['engine_name'],
                                    prompt = question,
                                    temperature = configs['temp'],
                                    max_tokens = configs['max_tokens'],
                                    frequency_penalty = configs['freq_pen'],
                                    presence_penalty = configs['pres_pen'],
                                    echo = configs['echo'],
                                    # the stop sequence is space + literal
                                    # backslash + "###"; the backslash is
                                    # escaped so Python does not flag an
                                    # invalid escape sequence
                                    stop = [" \\###"])
    return answer_parse['choices'][0]['text']

# app = FastAPI()
# @app.get("/alex_gpt/{question}")
def alex_gpt(
        question: str, 
        skip_guardrail: bool = False,
        configs: dict = DEFAULT_ALEX_CONFIGS) -> str:
    """
    Answer a question as "alex", optionally behind guardrails.

    Steps:
      1. Reject empty questions and questions shorter than 3 words.
      2. Replace "_" with spaces and swap pronouns in the question.
      3. If `skip_guardrail` is True, return the raw completion.
      4. Otherwise content-filter the question; refuse if it is unsafe.
      5. Answer only if the topic similarity reaches the threshold,
         otherwise return an "I don't know" message.
      6. Content-filter the answer; while it is unsafe, regenerate it up
         to 3 times, then give up with a canned message.

    Args:
        question: the user's question; "_" is treated as a word separator.
        skip_guardrail: bypass content filtering and topic matching.
        configs: generation settings forwarded to `completion`.

    Returns:
        The generated answer, or one of the canned refusal messages.
    """
    max_retries = 3
    pronouns = {
        "alex":"",
        " are you":" am I",
        # this entry can never match: " are you" above has already been
        # replaced by the time it is applied (kept for reference)
        " are you ":" am I ",
        "Are you ":"Am I ",
        "You ":"I ",
        " you ":" I ",
        " your ":" my ",
        "Your ":"My ",
        " me ":" you "}

    # If there's empty question
    if not question:
        return "Ask me a question that you would like to know from me"

    # Parse question from api
    question_parsed = " ".join(question.split("_"))

    # Don't take question less than 3 words
    if len(question_parsed.split()) < 3:
        return "That's not a fully formatted question, is it?"

    # Change pronouns, a bit hacky but quick; applied in dict order
    for old, new in pronouns.items():
        question_parsed = question_parsed.replace(old, new)

    if skip_guardrail:
        return completion(question_parsed, configs)

    # Content filter question
    if content_filtering(question_parsed) == "2":
        return "Sorry, can't answer that one, that's not very polite."

    # Only answer questions close enough to the training topics.
    # NOTE(review): SIMILARITY_THRESHOLD is not defined anywhere in the
    # visible notebook -- confirm where it is set before using this path.
    similarity_score = topic_similarity(question_parsed)
    if not (similarity_score >= SIMILARITY_THRESHOLD):
        return "I really don't know the answer, please try another one."

    # Answer and content filter answer. Every generated answer is checked,
    # so a safe regenerated answer is always returned; at most `max_retries`
    # regenerations are attempted after the first answer.
    answer = completion(question_parsed, configs)
    retries_left = max_retries
    while content_filtering(answer) == "2":
        if retries_left == 0:
            return "I have no nice way to respond to this. Try another question."
        answer = completion(question_parsed, configs)
        retries_left -= 1

    return answer

def get_alex_configs(
            engine_name = DEFAULT_ALEX_CONFIGS['engine_name'],
            temp = DEFAULT_ALEX_CONFIGS['temp'],
            pres_pen = DEFAULT_ALEX_CONFIGS['pres_pen'],
            freq_pen = DEFAULT_ALEX_CONFIGS['freq_pen'],
            max_tokens = DEFAULT_ALEX_CONFIGS['max_tokens'],
            stability_thd = DEFAULT_ALEX_CONFIGS['stability_thd'],
            echo = DEFAULT_ALEX_CONFIGS['echo']) -> dict:
    """
    Build a configs dict for the alex model.

    Every argument defaults to the matching entry of DEFAULT_ALEX_CONFIGS,
    so callers only pass the settings they want to override. The returned
    dict uses the same keys as DEFAULT_ALEX_CONFIGS.
    """
    return dict(
        engine_name=engine_name,
        temp=temp,
        pres_pen=pres_pen,
        freq_pen=freq_pen,
        max_tokens=max_tokens,
        stability_thd=stability_thd,
        echo=echo,
    )

In [None]:
import time

# Retry policy for individual GPT-3 calls (the API sometimes errors out
# when its response is too slow).
MAX_INFERENCE_ATTEMPTS = 5
RETRY_WAIT_SECONDS = 2


def _predict_with_retry(question, configs, max_attempts=MAX_INFERENCE_ATTEMPTS):
    """Call `alex_gpt` (guardrails skipped) for one question, retrying on errors.

    Retries are per question, so one transient failure does not throw away
    the predictions already generated. After `max_attempts` failures the
    last exception is re-raised instead of looping forever.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return alex_gpt(question, True, configs)
        except Exception as err:  # KeyboardInterrupt is deliberately not caught
            print(f"inference attempt {attempt}/{max_attempts} failed: {err!r}")
            if attempt == max_attempts:
                raise
            # back off a little longer after each failure
            time.sleep(RETRY_WAIT_SECONDS * attempt)


# one summary row per model, with empty score columns to fill in later
summary_df = EVAL_MODELS.copy()
summary_df['fact_f1'] = None
summary_df['inf_fluency'] = None
summary_df['inf_relevancy'] = None
summary_df['abs_fluency'] = None
summary_df['abs_relevancy'] = None
summary_df['all_f1'] = None
summary_df['all_fluency'] = None
summary_df['all_relevancy'] = None

# make sure the output folder exists before spending time on inference
os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)

# inference: one copy of the test set per model, keyed by model_id
test_df_dict = {}
for _, row in tqdm(summary_df.iterrows(), total=summary_df.shape[0]):
    # config parsing is outside the retry: a malformed sheet value should
    # fail immediately rather than be retried
    model_configs = get_alex_configs(
                        engine_name = str(row['engine_name']),
                        temp = float(row['temp']),
                        pres_pen = float(row['pres_pen']),
                        freq_pen = float(row['freq_pen']))
    test_df = TEST_DF_ALEX.copy()
    test_df["Prediction"] = test_df["Question"].apply(
        lambda x: _predict_with_retry(x, model_configs))
    test_df_dict[row['model_id']] = test_df

# save predictions
with open(PREDICTIONS_SAVE_DIR+f'/preds - {str(datetime.utcnow())}.pkl', 'wb') as handle:
    pickle.dump(test_df_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 1/1 [01:36<00:00, 96.67s/it]


## Model Evaluation

In [None]:
# load fluency model (tokenizer and classifier share one checkpoint)
FLUENCY_CHECKPOINT = "salesken/query_wellformedness_score"
tokenizer = AutoTokenizer.from_pretrained(FLUENCY_CHECKPOINT)
model_fluency = AutoModelForSequenceClassification.from_pretrained(FLUENCY_CHECKPOINT)

# load relevancy model
# sBERT Doc: https://www.sbert.net/
# pre-trained sBERT: https://huggingface.co/sentence-transformers
# DistillBERT vs. RoBERTa: https://tinyurl.com/yz27ngb5

# STS Benchmark: https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark
# STS SOTA: https://paperswithcode.com/sota/semantic-textual-similarity-on-sts-benchmark

# Tested models (ordered from good to bad performance + speed):
#   stsb-distilroberta-base-v2
#   all-MiniLM-L12-v2
#   all-MiniLM-L6-v2
#   msmarco-distilbert-cos-v5
#   stsb-roberta-large
RELEVANCY_CHECKPOINT = 'stsb-distilroberta-base-v2'
model_se = SentenceTransformer(RELEVANCY_CHECKPOINT)

Some weights of the model checkpoint at salesken/query_wellformedness_score were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def compute_f1(a_pred: str, a_gold: str) -> float:
    """A modified version of f1 computation from SQuAD.
    Ref: https://tinyurl.com/yjscv9oy

    Both strings are stripped of punctuation, lower-cased and split on
    whitespace. Only the gold tokens that also occur in the prediction are
    counted, so extra words in a verbose prediction are not penalized:
    precision is 1 whenever there is any overlap and the score reduces to
    2 * recall / (1 + recall).

    NOTE(review): the original comment described filtering the *prediction*
    tokens; the code filters the *gold* tokens. That behaviour is kept.

    Args:
        a_pred: predicted answer.
        a_gold: reference answer.

    Returns:
        F1 as a float in [0, 1]. If either side has no tokens, the score
        is 1.0 when both are empty and 0.0 otherwise.
    """
    # strip punctuation
    strip_punct = str.maketrans('', '', string.punctuation)

    # split() (not split(" ")) so runs of whitespace never produce empty
    # tokens, which would otherwise match each other and inflate the score
    gold_toks = a_gold.translate(strip_punct).lower().split()
    pred_toks = a_pred.translate(strip_punct).lower().split()

    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise.
    # Checked before filtering so a non-empty prediction never "agrees"
    # with an empty gold answer.
    if not gold_toks or not pred_toks:
        return float(gold_toks == pred_toks)

    # keep the gold tokens that are found in the prediction
    pred_vocab = set(pred_toks)
    matched_toks = [token for token in gold_toks if token in pred_vocab]

    # compute f1
    common = Counter(gold_toks) & Counter(matched_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(matched_toks)
    recall = 1.0 * num_same / len(gold_toks)

    return (2 * precision * recall) / (precision + recall)

def get_f1_scores(
            preds: np.ndarray, 
            golds: np.ndarray) -> np.ndarray:
    """Compute the F1 score of every (prediction, gold answer) pair.

    Args:
        preds: predicted answers (any sized iterable of str).
        golds: gold answers, same length and order as `preds`.

    Returns:
        A float array with one score per pair; an empty float array when
        `preds` is None or empty.

    Raises:
        ValueError: if `preds` and `golds` differ in length.
    """
    if preds is None or len(preds) == 0:
        return np.array([], dtype=float)
    if len(preds) != len(golds):
        raise ValueError(
            f"preds and golds must have the same length "
            f"({len(preds)} != {len(golds)})")

    # Build the array with an explicit float dtype. np.apply_along_axis
    # takes its output dtype from the first result, so an integer 0 for the
    # first pair would truncate every later score to an integer.
    return np.array(
        [compute_f1(pred, gold) for pred, gold in zip(preds, golds)],
        dtype=float)

def get_fluency_scores(sentences: list) -> np.ndarray:
    """Score each sentence with the fluency (well-formedness) model.

    The sentences are tokenized as one padded, truncated batch and run
    through `model_fluency` in eval mode without gradient tracking. The
    model's logits are returned flattened to one dimension. `None` or an
    empty input yields an empty list.
    """
    if sentences is None or len(sentences) < 1:
        return []

    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_tensors="pt")

    model_fluency.eval()
    with torch.no_grad():
        logits = model_fluency(**encoded).logits

    return logits.numpy().flatten()

def clean_sentences(sentences: List[str]) -> List[str]:
    """ Apply `unidecode` to each sentence and strip all punctuation.

    Returns a new list; the input list is left unmodified.
    """
    # build the punctuation-removal table once instead of once per sentence
    strip_punct = str.maketrans('', '', string.punctuation)
    return [unidecode(s).translate(strip_punct) for s in sentences]

def get_relevancy_scores(
                sentences1: list, 
                sentences2: list,
                model: SentenceTransformer = model_se) -> list:
    """Cosine similarity between sentence pairs (sentences1[i], sentences2[i]).

    Both lists are cleaned with `clean_sentences`, embedded with `model`,
    and compared pair by pair.

    Args:
        sentences1: first sentence of each pair.
        sentences2: second sentence of each pair, same length and order.
        model: the sentence-embedding model used for encoding.

    Returns:
        A list with one float similarity score per pair; an empty list for
        empty input.

    Raises:
        ValueError: if the two lists differ in length.
    """
    if len(sentences1) != len(sentences2):
        raise ValueError(
            f"sentences1 and sentences2 must have the same length "
            f"({len(sentences1)} != {len(sentences2)})")
    if len(sentences1) == 0:
        return []

    sentences1 = clean_sentences(sentences1)
    sentences2 = clean_sentences(sentences2)

    # embedding for both lists
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    # compute row-wise cosine similarities directly; only the matching
    # pairs are needed, not the full N x N similarity matrix
    scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)

    return scores.cpu().tolist()

In [None]:
# create the report save folder (one timestamped folder per evaluation run);
# makedirs also creates EVALUATION_EXPORT_DIR itself when it does not exist yet
timestamp = str(datetime.utcnow())
report_save_dir = f'{EVALUATION_EXPORT_DIR}/{timestamp}'
os.makedirs(report_save_dir)

# evaluate models
for model_id, test_df in tqdm(test_df_dict.items()):
    # row selectors: FACT questions are scored with F1, all other sets
    # (INF / ABS) with fluency and relevancy
    is_fact = test_df['Set Symbol'] == 'FACT'
    not_fact = test_df['Set Symbol'] != 'FACT'
    # the row of the overall summary that belongs to this model
    model_row = summary_df['model_id'] == model_id

    # fact scores
    test_df.loc[is_fact, 'F1'] = get_f1_scores(
                                    test_df.loc[is_fact, 'Prediction'], 
                                    test_df.loc[is_fact, 'Answer'])
    fact_summary_df = test_df.loc[is_fact, ["Set Symbol", "F1"]].groupby(["Set Symbol"]).mean()

    # fluency scores
    scores = get_fluency_scores(test_df.loc[not_fact, "Prediction"].tolist())
    test_df.loc[not_fact, "Fluency"] = scores
    fluency_scores_df = test_df.loc[not_fact, ["Set Symbol", "Prediction", "Fluency"]]
    fluency_summary_df = fluency_scores_df.loc[:, ["Set Symbol", "Fluency"]].groupby(["Set Symbol"]).mean()

    # relevancy scores
    scores = get_relevancy_scores(
                    test_df.loc[not_fact, "Answer"].tolist(),
                    test_df.loc[not_fact, "Prediction"].tolist())
    test_df.loc[not_fact, "Relevancy"] = scores
    relevancy_scores_df = test_df.loc[not_fact, ["Set Symbol", "Question", "Prediction", "Relevancy"]]
    relevancy_summary_df = test_df.loc[not_fact, ["Set Symbol", "Relevancy"]].groupby(["Set Symbol"]).mean()

    # join & save the per-question report for this model
    model_summary_df = fluency_summary_df.join(relevancy_summary_df, on="Set Symbol")
    model_summary_df = model_summary_df.join(fact_summary_df, on="Set Symbol", how="outer").reset_index(drop=True)
    test_df.to_csv(f'{report_save_dir}/m{model_id}_full_report.csv')

    # update overall summary_df
    # f1
    df = fact_summary_df.reset_index()
    fact_f1 = df.loc[df['Set Symbol'] == 'FACT', 'F1'].values[0]
    summary_df.loc[model_row, 'fact_f1']  = fact_f1

    # fluency
    df = fluency_summary_df.reset_index()
    inf_fluency = df.loc[df['Set Symbol'] == 'INF', 'Fluency'].values[0]
    abs_fluency = df.loc[df['Set Symbol'] == 'ABS', 'Fluency'].values[0]
    summary_df.loc[model_row, 'inf_fluency']  = inf_fluency
    summary_df.loc[model_row, 'abs_fluency']  = abs_fluency
    
    # relevancy
    df = relevancy_summary_df.reset_index()
    inf_relevancy = df.loc[df['Set Symbol'] == 'INF', 'Relevancy'].values[0]
    abs_relevancy = df.loc[df['Set Symbol'] == 'ABS', 'Relevancy'].values[0]
    summary_df.loc[model_row, 'inf_relevancy']  = inf_relevancy
    summary_df.loc[model_row, 'abs_relevancy']  = abs_relevancy

    # macro average scores: fluency / relevancy average the INF and ABS
    # sets; the overall macro score averages the three metrics
    all_fluency = (inf_fluency + abs_fluency) / 2
    all_relevancy = (inf_relevancy + abs_relevancy) / 2
    all_macro = (fact_f1 + all_fluency + all_relevancy) / 3
    summary_df.loc[model_row, 'all_f1']  = fact_f1
    summary_df.loc[model_row, 'all_fluency']  = all_fluency
    summary_df.loc[model_row, 'all_relevancy']  = all_relevancy
    summary_df.loc[model_row, 'all_macro']  = all_macro

# save overall summary report
summary_path = f'{report_save_dir}/summary.csv'
summary_df.to_csv(summary_path)
print("\n\nReports Directory:")
print(report_save_dir)
print("\nSummary Report:")
print(summary_path)

# print executive summary (last expression, so the notebook displays it)
df = summary_df[['model_id', 'all_f1', 'all_fluency', 'all_relevancy', 'all_macro']]
df.sort_values(by='all_macro', ascending=False)

100%|██████████| 1/1 [00:24<00:00, 24.47s/it]



Reports Directory:
/content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo/memorai/evaluation/reports/2021-12-04 06:14:38.103663

Summary Report:
/content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo/memorai/evaluation/reports/2021-12-04 06:14:38.103663/summary.csv





Unnamed: 0,model_id,all_f1,all_fluency,all_relevancy,all_macro
0,99,0.274921,0.55826,0.333737,0.388973
