# Model Evaluation
Dev Note: Try `print` Pandas df if `display` does not work. An error sometimes is thrown if `display`. The issue is due to certain imported libraries. 


## Environment Setup

In [1]:
if 'google.colab' in str(get_ipython()):
    # authorize & import/mount colab/google drive
    from google.colab import output
    from google.colab import drive
    from google.colab import auth
    auth.authenticate_user()
    drive.mount('/content/gdrive')

    # install libraries
    !pip install --upgrade openai
    !pip install --upgrade numba
    !pip install transformers
    !pip install sentence_transformers
    !pip install unidecode
    !pip install bertopic
    !pip install unidecode
    !pip install pandas==1.1.5

    # clear output
    output.clear()

In [2]:
import json
import os 
import openai
import nltk
import torch
import string
import gspread
import random
import pickle
import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import Counter
from datetime import datetime
from oauth2client.client import GoogleCredentials
from io import BytesIO
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from unidecode import unidecode
from typing import List
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# NOTE: For some reason, we get an error of Python 3.7 the first time the notebook 
# is run on a new machine thorugh Colab. Solution: Remove the bertopic import, run 
# the cell, paste the bertopic import back to the cell, and then run the cell 
# again. Don't ask. No fucking idea why.
# Note: Removed; not required for model evaluation...
# from bertopic import BERTopic

key = os.environ['openai_api']
nltk.download('stopwords')
stop = set(stopwords.words('english'))

# Alice: /content/drive/MyDrive/w210_Capstone_Project_Fall2021/Repo/
# Tim: /content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo
ROOT_DIR = "/content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo"
EVALUATION_EXPORT_DIR = f"{ROOT_DIR}/memorai/evaluation/reports"
PREDICTIONS_SAVE_DIR = f"{ROOT_DIR}/memorai/evaluation/predictions"
TEST_SET_SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1xjH4mlfoLqBdQ_WNUYyMrmTmbTm7VA8KiKTjBqom5oM/edit#gid=1838923989"
MODELS_SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1zzzGBn7oyFMDQYQo19Xmec1-qDRFmAB7xsULVTGpy2w/edit#gid=1164311185"
os.chdir(ROOT_DIR)

SCORE_PRECISION = 4
DEFAULT_ALEX_CONFIGS = {
    "engine_name": "curie:ft-brainmonkey-foundation-2021-10-26-08-56-48",
    "temp": 0.1,
    "pres_pen": 1,
    "freq_pen": 1,
    "max_tokens": 512,
    "stability_thd": 0.3,
    "echo": False}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Test Set & Model List Prep

In [3]:
def load_gsheet(url: str, tab: str) -> pd.DataFrame:
    """
    Load Google spreadsheet and return a Pandas DF.
    """
    gc = gspread.authorize(GoogleCredentials.get_application_default())
    worksheet = gc.open_by_url(url).worksheet(tab)
    data = worksheet.get_all_values()
    headers = data.pop(0)

    return pd.DataFrame(data, columns=headers)

# load test set
TEST_DF_ALEX = load_gsheet(TEST_SET_SPREADSHEET_URL, "factual_qs_eval")
TEST_DF_ALEX = TEST_DF_ALEX.loc[:, ["Question", "Answer"]]
TEST_DF_ALEX["Prediction"] = None
TEST_DF_ALEX["F1"] = None
TEST_DF_ALEX = TEST_DF_ALEX.astype({'F1': 'f'})

# load eval model configs for gpt3 completition model
COMPLETION_MODEL_CONFIGS = load_gsheet(MODELS_SPREADSHEET_URL+f'?{random.randint(0, 9999999999)}', "models")

## GPT-3 Inference

Note: Make sure to turn `echo=False` during evalution so question is not repeated.


In [4]:
SIMILARITY_THRESHOLD_QUESTION = 0.35
SIMILARITY_THRESHOLD_ANSWER = 0.3

def get_model():
    """
    Retrieve BERTopic model from EC2
    """
    load_bert = BERTopic.load('bertopic_trained_alex_1026')
    return load_bert


def topic_similarity(question):
    """
    Use the trained BERTopic to find out whether the user's question belongs 
    to the train data topic distribution. 
    """
    topic_model = get_model()
    question_token = simple_preprocess(question, deacc=True, max_len=512)
    question_whole = " ".join([kept for kept in question_token if not kept in stop])
    similar_topics, similarity = topic_model.find_topics(question_whole, top_n=5)
    top_score = similarity[0]
    return top_score


def content_filtering(answer: str) -> int:
    """
    Filter GPT-3 completion before returning to user
    If content is sensitive or unsafe, regenerate completion 
    0 = safe, 1 = senstive, 2 = unsafe
    """
    content_filter = openai.Completion.create(
        engine="content-filter-alpha",
        prompt= "<|endoftext|>"+answer+"\n--\nLabel:",
        temperature=0,
        max_tokens=1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        logprobs=10
    )
    content_rate = content_filter['choices'][0]["text"]
    return content_rate


def question_answer(question: str) -> str:
  try:
    # best model: temp = 0.4, pres pen = -1.5, freq pen = 2.00
    qa_answer = openai.Answer.create (
        search_model="babbage", 
        model="davinci", 
        question=question, 
        file="file-mcgwAkzglsZSibNyeFuuGjcH",
        examples_context="In 2017, U.S. life expectancy was 78.6 years.", 
        examples=[["What is human life expectancy in the United States?", "78 years."]], 
        max_rerank=200,
        max_tokens=25,
        temperature=0.05,
        stop=["\n", "<|endoftext|>"]
    )
    qa_answer_parse = qa_answer['answers'][0]
    # Acccounting for incomplete answer
    if not qa_answer_parse.endswith("."):
        qa_answer_x = qa_answer_parse.split(".")
        return qa_answer_x[0]+'.'
    else: 
      return qa_answer_parse
  except:
    return "This is not within my training data, I don't have the an answer. Sorry."


def completion(
            question: str, 
            configs: dict = DEFAULT_ALEX_CONFIGS) -> str:
    """
    Generate completion given the question using the params
    """
    # NOTE: Turn off "echo" during evaluation!
    answer_parse = openai.Completion.create(
                                    model = configs['engine_name'],
                                    prompt = question,
                                    temperature = configs['temp'],
                                    max_tokens = configs['max_tokens'],
                                    frequency_penalty = configs['freq_pen'],
                                    presence_penalty = configs['pres_pen'],
                                    echo = configs['echo'],
                                    stop = [" \###"])
    return answer_parse['choices'][0]['text']


# app = FastAPI()
# @app.get("/alex_gpt/{question}")
def alex_gpt(
        question: str, 
        skip_guardrail: bool = False,
        configs: dict = DEFAULT_ALEX_CONFIGS) -> str:
    """
    Receive the question and fine topic of it
    Go through content filtering first, if unsafe, refuse to answer
    If topic is higher than threshold then answer question
    If answer is unsafe, keep generate new answer until safe or sensitive
    Return I don't know if the question is lower than threshold"
    """    
    return completion(question, configs)      


def get_alex_configs(
            engine_name = DEFAULT_ALEX_CONFIGS['engine_name'],
            temp = DEFAULT_ALEX_CONFIGS['temp'],
            pres_pen = DEFAULT_ALEX_CONFIGS['pres_pen'],
            freq_pen = DEFAULT_ALEX_CONFIGS['freq_pen'],
            max_tokens = DEFAULT_ALEX_CONFIGS['max_tokens'],
            stability_thd = DEFAULT_ALEX_CONFIGS['stability_thd'],
            echo = DEFAULT_ALEX_CONFIGS['echo']) -> dict:
    configs = {
        "engine_name": engine_name,
        "temp": temp,
        "pres_pen": pres_pen,
        "freq_pen": freq_pen,
        "max_tokens": max_tokens,
        "stability_thd": stability_thd,
        "echo": echo}
        
    return configs

In [5]:
# completion_summary_df = COMPLETION_MODEL_CONFIGS.copy()
# completion_summary_df['fact_f1'] = None

# inference - q&a
qa_test_df = TEST_DF_ALEX.copy()
qa_test_df["Prediction"] = qa_test_df["Question"].apply(lambda x: question_answer(x))

# # inference - completion
# completion_test_df_dict = {}
# for index, row in tqdm(completion_summary_df.iterrows(), total=completion_summary_df.shape[0]):
#     # sometimes an error is thrown when GPT3's response is too slow; jsut a horrible 
#     # hack to get around the issue 
#     while True:
#         try:
#             model_configs = get_alex_configs(
#                                 engine_name = str(row['engine_name']),
#                                 temp = float(row['temp']),
#                                 pres_pen = float(row['pres_pen']),
#                                 freq_pen = float(row['freq_pen']))
#             completion_test_df_dict[row['model_id']] = completion_test_df = TEST_DF_ALEX.copy()
#             completion_test_df["Prediction"] = completion_test_df["Question"].apply(lambda x: alex_gpt(x, True, model_configs))
#             break
#         except:
#             pass
    

# # save predictions
# with open(PREDICTIONS_SAVE_DIR+f'/preds - {str(datetime.utcnow())}.pkl', 'wb') as handle:
#     pickle.dump(completion_test_df_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Model Evaluation

In [6]:
def compute_f1(a_pred: str, a_gold: str) -> float:
    """A modified version of f1 computation from SQuAD.
    Ref: https://tinyurl.com/yjscv9oy
    """
    # strip punctuation
    a_gold = a_gold.translate(str.maketrans('', '', string.punctuation))
    a_pred = a_pred.translate(str.maketrans('', '', string.punctuation))

    # break string into list
    gold_toks = a_gold.lower().split(" ")
    pred_toks = a_pred.lower().split(" ")

    # keep prediction tokens only if the token is found in gold_toks 
    pred_toks = [token for token in gold_toks if token in pred_toks]

    # compute f1
    common = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        #  if either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1

def get_f1_scores(
            preds: np.ndarray, 
            golds: np.ndarray) -> np.ndarray:
    if preds is None or len(preds) == 0:
        return []

    arr = np.vstack([preds, golds]).T
    scores_arr = np.apply_along_axis(lambda x: compute_f1(x[0], x[1]), 1, arr)
    
    return scores_arr

In [7]:
# create the report save folder
timestamp = str(datetime.utcnow())
report_save_dir = f'{EVALUATION_EXPORT_DIR}/{timestamp}'
os.mkdir(report_save_dir)

# evaluate qa model
qa_test_df.loc[:, 'F1'] = get_f1_scores(
                                qa_test_df.loc[:, 'Prediction'], 
                                qa_test_df.loc[:, 'Answer'])
qa_test_df.to_csv(f'{report_save_dir}/qa_full_report.csv')
qa_f1 = qa_test_df.loc[:, "F1"].mean()

# # evaluate completion models
# BEST_COMPLETION_MODEL_ID = '3'
# completion_best_model_f1 = 0
# for model_id, completion_test_df in tqdm(completion_test_df_dict.items()):
#     # fact scores
#     completion_test_df.loc[:, 'F1'] = get_f1_scores(
#                                         completion_test_df.loc[:, 'Prediction'], 
#                                         completion_test_df.loc[:, 'Answer'])
#     completion_test_df.to_csv(f'{report_save_dir}/completion_m{model_id}_full_report.csv')

#     # update completion_summary_df
#     fact_f1 = completion_test_df.loc[:, "F1"].mean()
#     completion_summary_df.loc[completion_summary_df['model_id'] == model_id, 'fact_f1']  = fact_f1

#     # track down best model f1
#     if model_id == BEST_COMPLETION_MODEL_ID:
#         completion_best_model_f1 = fact_f1
# completion_f1 = completion_summary_df.loc[:, "fact_f1"].mean()

# overall summary
summary_all_df = {
    'qa_f1': [qa_f1]
    # 'completion_best_model_f1' : [completion_best_model_f1],
    # 'completion_models_macro_avg_f1': [completion_f1]
}
summary_all_df = pd.DataFrame.from_dict(summary_all_df)
summary_all_path = f'{report_save_dir}/summary_all.csv'
summary_all_df.to_csv(summary_all_path)
print("\n\nReports Directory:")
print(report_save_dir)
print("\nCompletion Summary Report:")
print(summary_all_path)

# print executive summary
print("")
summary_all_df



Reports Directory:
/content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo/memorai/evaluation/reports/2021-12-04 06:08:24.750365

Completion Summary Report:
/content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo/memorai/evaluation/reports/2021-12-04 06:08:24.750365/summary_all.csv



Unnamed: 0,qa_f1
0,0.854422
