In [None]:
!git clone https://github.com/shmsw25/FActScore.git

In [None]:
!pip install openai==0.27.0

In [None]:
!pip install -r FActScore/requirements.txt

In [None]:
# Download the en_core_web_sm model through spaCy's own download command.
!python -m spacy download en_core_web_sm

In [None]:
# Extract the uploaded demos.zip into the factscore package directory of the clone.
import zipfile

demos_archive_path = '/content/demos.zip'
demos_target_dir = '/content/FActScore/factscore'

with zipfile.ZipFile(demos_archive_path, 'r') as demos_archive:
    demos_archive.extractall(demos_target_dir)

In [None]:
import pandas as pd
import json
import sys
import time
sys.path.append('/content/FActScore')
from factscore.factscorer import FactScorer

In [None]:
import os

import openai

# Read the key from the OPENAI_API_KEY environment variable instead of pasting it
# into this cell, where it would be saved (and shared) together with the notebook.
# When the variable is unset the key stays an empty string, as in the placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
from datetime import datetime

In [None]:
import os
import shutil

# Make sure the local cache directory exists, then copy demons.json from the
# FActScore package directory into it.
demos_cache_dir = '.cache/factscore/demos/'
os.makedirs(demos_cache_dir, exist_ok=True)
shutil.copy(
    './FActScore/factscore/demos/demons.json',
    '.cache/factscore/demos/demons.json',
)

In [None]:
# Prepare input data for FACTSCORE
def convert_datetime(value):
    """Return ``value.isoformat()`` for pandas/stdlib datetimes; any other value is returned unchanged.

    Datetime objects cannot be passed to ``json.dumps`` directly, so they are
    turned into ISO-8601 strings before the records are serialized.
    """
    if isinstance(value, (pd.Timestamp, datetime)):
        return value.isoformat()
    return value


input_csv = '/content/.csv'  # Replace with the file name you want to test
df = pd.read_csv(input_csv)

# The CSV must name its question column "Question" and its answer column
# "Generated Answer"; rename the columns in the file if they differ.
input_data = [
    {
        "question": convert_datetime(row["Question"]),
        "generated_answer": convert_datetime(row["Generated Answer"]),
    }
    for _, row in df.iterrows()
]

In [None]:
# Dump the records as JSONL (one JSON object per line), the format FACTSCORE expects
with open("input.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in input_data)

In [None]:
# Upload knowledge_base.jsonl to /content before running this cell.
name_of_your_knowledge_source = 'Q&A'

fs = FactScorer()
# Register the uploaded JSONL file under the name above, backed by the input.db database file.
fs.register_knowledge_source(
    name_of_your_knowledge_source,
    data_path='/content/knowledge_base.jsonl',
    db_path='input.db',
)

In [None]:
# Split the records into two parallel lists: the questions and their generated answers.
topics, generations = [], []
for record in input_data:
    topics.append(record["question"])
    generations.append(record["generated_answer"])

In [None]:
# Fetch the 'punkt_tab' resource with NLTK's downloader
# (presumably the data sent_tokenize needs — confirm).
import nltk
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import sent_tokenize

# Smoke test: split a two-sentence string and print the resulting list of sentences.
test_text = "This is a test sentence. Let's see if the tokenizer works correctly."
sentences = sent_tokenize(test_text)
print(sentences)

['This is a test sentence.', "Let's see if the tokenizer works correctly."]


In [None]:
# Write the OpenAI key to /content/api.key
# (presumably the key file FactScorer reads — confirm against the FActScore README).
# The key comes from the OPENAI_API_KEY environment variable so that it is never
# stored in the notebook itself; the file is left empty when the variable is unset.
import os

with open("/content/api.key", "w") as f:
    f.write(os.environ.get("OPENAI_API_KEY", ""))


In [None]:
import sqlite3

# Set the path of the Excel workbook that will hold the FActScore results first
excel_path = ".xlsx"


def _append_rows(frame):
    """Append ``frame`` (no header, no index) below the last used row of 'Sheet1' in ``excel_path``.

    The workbook is reopened on every call, so each result is written to disk
    as soon as it has been computed.
    """
    with pd.ExcelWriter(excel_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        startrow = writer.sheets['Sheet1'].max_row
        frame.to_excel(writer, sheet_name='Sheet1', index=False, header=False, startrow=startrow)


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Start the workbook with only the header row; result rows are appended below it.
pd.DataFrame(columns=['Question', 'Score', 'Init Score', 'Respond Ratio', 'Num Facts']).to_excel(excel_path, index=False)

# Running totals. The two scores are weighted by the number of facts of each
# response, so the summary is an average per fact rather than per question.
score = 0
init_score = 0
num_facts = 0
respond_ratio = 0

conn = None
for i, item in enumerate(input_data):
    # Give the knowledge source a fresh connection for this question, then close
    # the one attached during the previous pass so connections do not pile up.
    # The connection of the last pass stays open and attached to the scorer.
    stale_conn = conn
    conn = sqlite3.connect("/content/input.db")
    fs.db[name_of_your_knowledge_source].connection = conn
    if stale_conn is not None:
        stale_conn.close()

    # Score one question at a time so that its row can be saved immediately.
    out = fs.get_score([item["question"]], [item["generated_answer"]], knowledge_source=name_of_your_knowledge_source)

    _append_rows(pd.DataFrame({
        'Question': [item["question"]],
        'Score': [out["score"]],
        'Init Score': [out["init_score"]],
        'Respond Ratio': [out["respond_ratio"]],
        'Num Facts': [out["num_facts_per_response"]]
    }))

    print(f"Question {i+1} saved")

    # NOTE(review): a NaN in any of these values would turn the running totals
    # into NaN — confirm whether get_score can return NaN and whether such
    # rows should be skipped.
    score += out["score"] * out["num_facts_per_response"]
    init_score += out["init_score"] * out["num_facts_per_response"]
    respond_ratio += out["respond_ratio"]
    num_facts += out["num_facts_per_response"]


# Final row: fact-weighted mean scores, mean respond ratio and mean number of
# facts per question. Empty input or zero counted facts yields NaN instead of
# raising ZeroDivisionError.
summary = pd.DataFrame({
    'Question': ['Summary'],
    'Score': [_ratio(score, num_facts)],
    'Init Score': [_ratio(init_score, num_facts)],
    'Respond Ratio': [_ratio(respond_ratio, len(input_data))],
    'Num Facts': [_ratio(num_facts, len(input_data))]
})

_append_rows(summary)

print("Summary saved")