In [2]:
!pip install openai langchain ipywidgets pandas ipywidgets
!pip install faiss-cpu --no-cache
!pip install tiktoken
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [1]:
import json
import os
import random

import openai
import pandas as pd
from IPython.display import display, Markdown
from ipywidgets import widgets
from langchain import PromptTemplate, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

#pd.set_option('display.max_colwidth', None)



In [4]:
def load_credentials(file_path):
    """Read a username and an API key from a small text file.

    The file is expected to contain one ``label: value`` entry per line, with
    the username entry first and the API-key entry second, for example::

        username: alice
        api_key: sk-...

    The labels themselves are not checked; only the text after the first
    colon of each entry is used. Blank lines are ignored.

    Args:
        file_path: Path of the credentials file.

    Returns:
        A ``(username, api_key)`` tuple with surrounding whitespace removed.

    Raises:
        ValueError: If the file has fewer than two entries, or one of the
            first two entries contains no colon.
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r') as file:
        # splitlines() also copes with Windows line endings.
        entries = [line for line in file.read().splitlines() if line.strip()]

    if len(entries) < 2:
        raise ValueError(
            f"{file_path!r} must contain a username line and an api key line"
        )

    values = []
    for entry in entries[:2]:
        # Split only on the first colon so that a value which itself contains
        # a colon is returned in full instead of being cut short.
        _label, separator, value = entry.partition(':')
        if not separator:
            raise ValueError(
                f"expected a 'label: value' entry in {file_path!r}, got {entry!r}"
            )
        values.append(value.strip())

    username, api_key = values
    return username, api_key

In [6]:
# Read the credentials from the local key file and register the API key both
# as an environment variable and on the openai module.
username, api_key = load_credentials('Data/Key.txt')
os.environ['OPENAI_API_KEY'] = api_key
openai.api_key = api_key

# Evaluation Pipeline for the Recommender System

This notebook establishes the necessary ingredients for the evaluation pipeline of our recommender prototype. The design is inspired by both the LangChain documentation and OpenAI's evals repository. The main challenge in the evaluation is the lack of ground-truth values. We therefore use large language models (LLMs) to evaluate our model.

The notebook is divided into two main parts:

## Part 1: Query Development 

In this section, we develop queries from 100 randomly selected articles. For each article, we generate two types of queries:

1. **General queries**: These queries are derived from the broad information of the specific news article and are supposed to capture only the general topics that the article covers.
2. **Specific queries**: These queries are closely tied to the content of the specific news article. 

These queries will later be used in the recommender system to verify if it suggests the original news article from which the query was derived.

## Part 2: Evaluation of Recommender System Responses 

This part of the notebook deals with the evaluation of the recommender system's responses. The idea is to use the LLM to determine whether a suggestion is relevant to the specific query. To achieve this, we develop an eval chain and validate it using the previously generated queries.

We develop two different system prompts:

1. **Normal eval system**: A stricter system.
2. **Relaxed eval system**: A more lenient system.

Based on the evaluation chain, the general queries achieved a relevance score of 88 on the normal eval system and 99 on the relaxed one. Similarly, the specific queries achieved a relevance score of 97 and 98, respectively.

Thus, this system can now be utilized in the recommender system for evaluation purposes. However, please keep in mind that each evaluation run incurs a cost, as we use the LLM and send it the entire information of the article, which can be costly.


## Part 1: Query Formulation

Much like the topic assignment procedure, we iterated over numerous prompts to generate the desired queries. Alongside this, we also supplied a worked example (one-shot prompting) to better steer the language model. In this instance, we set the language model's temperature to 0.75 to inspire more creative queries that are less dependent on the supplied example. The produced queries, along with the corresponding article IDs, are saved in a JSON file, enabling subsequent analysis. In addition, the general chain and the specific chain are run on the same articles, so that the results of the two sets of generated queries can be compared.


In [3]:
# Load the cleaned articles and keep only those with at most 5000 characters.
file = "Data/cleaned_df.csv"
df = pd.read_csv(file)
df = df.loc[df['characters'] <= 5000]

# Same rows ordered by length; the original row labels are kept.
sorted_df = df.sort_values(by='characters')
df.shape

(5150, 23)

In [29]:
# System and human templates used to generate the evaluation queries.
# The prompt texts are sent to the LLM verbatim, so any edit to them (including
# fixing their typos) changes what the model sees.

# System prompt for the *general* queries: asks for a German query that only
# reflects the broad topics of the article.
system_template_general = """ You are provided with the textual information and the title as well as subtitle of a new article.
Your task is to create a Human prompt for the news article, specifically taking into acocount the topics that the article is talking about.
The propmt should be general format and not mention specific words or context of the article. Rather conctrate on the general topics described.
The prompt must be written in german. The input information is given in german. 
The different inputs of the news article are seperated by: ####
Missing values are indicated by the string: nan
"""

# System prompt for the *specific* queries: asks for a German query that should
# single out this one article.
system_template_specific = """ You are provided with the textual information and the title as well as subtitle of a new article.
Your task is to create a Human prompt for the news article, specifically taking into acocount the topics that the article is talking about.
The propmt should be very specifict. Meaning is I use it in a query it will only return this article and no other. Conctrate on the specific topics described.
The prompt must be written in german. The input information is given in german. 
The different inputs of the news article are seperated by: ####
Missing values are indicated by the string: nan
"""

# Human message carrying the article fields. Placeholders: title, text, general,
# subtitle, dachzeile, rubrik, resort.
# NOTE(review): the system prompts announce "####" as the separator, but the
# supertitle section has no opening "####" and is closed with "###", and the
# rubrik/resort section has no opening "####" either -- confirm whether that is
# intended before changing it, since the stored JSON copy holds the same text.
human_template = """ The following is a news article. All the information is given in german.
The title of the article:
####
{title}
####
The text of the article:
####
{text}
####
The general classification given by the dataset:
####
{general}
####
The subtitle of the article:
####
{subtitle}
####
The supertitle of the article:
{dachzeile}
###
The rubrik and resort of the article:
{rubrik}, {resort}
####
"""
# Bundle the three templates under the keys used when they are stored as JSON.
templates_gen = {'system_gen':system_template_general, 'system_spe': system_template_specific, 'human_temp': human_template}


In [23]:
# One-off export of the query-generation templates. The guard is kept False so
# that running the notebook does not overwrite the stored JSON file.
if False:
    with open('prompt_templates/evals/gen_sys_templates.json', 'w') as handle:
        handle.write(json.dumps(templates_gen))

In [24]:
# Read the stored query-generation templates; this rebinds `templates_gen`.
with open('prompt_templates/evals/gen_sys_templates.json', 'r') as handle:
    templates_gen = json.loads(handle.read())

In [30]:
# One-shot example shown to the LLM: an article about the yearly earnings of
# the News Corp corporation, paired with a hand-written general and specific query.
templates_new = {}
idx = 635
example_row = sorted_df.loc[idx, :]  # label-based lookup, not a position

id = example_row['article_id']
title = example_row['title']
subtitle = example_row['subtitle']
text = example_row['paragraphs']
rubrik = example_row['rubrik']
general = example_row['general_topic']
ressort = example_row['ressort']
dach = example_row['dachzeile']

# Fill the human template with the example article's fields.
human_example = human_template.format(
    title=title,
    subtitle=subtitle,
    text=text,
    rubrik=rubrik,
    resort=ressort,
    general=general,
    dachzeile=dach,
)

ai_example_general = "Ich such nach einen Artikel über Nachrichtendienste"
ai_example_spec = "Könntest du Artikel über die Geschäftsentwicklung von der Mediengruppe News Corp anzeigen"

# Stored per article id as a one-element list of {human, ai_gen, ai_spec}.
templates_new[id] = [
    {'human': human_example, 'ai_gen': ai_example_general, 'ai_spec': ai_example_spec}
]


In [26]:
# One-off export of the one-shot example. The guard is kept False so that
# running the notebook does not overwrite the stored JSON file.
if False:
    with open('prompt_templates/evals/gen_templates.json', 'w') as handle:
        handle.write(json.dumps(templates_new))

In [31]:
 # Read the stored one-shot example; this rebinds `templates_new`.
 with open('prompt_templates/evals/gen_templates.json', 'r') as handle:
     templates_new = json.loads(handle.read())

In [13]:
# Query generation uses a higher temperature so that the LLM is more creative.
GENERATION_TEMPERATURE = 0.75
chat = ChatOpenAI(temperature=GENERATION_TEMPERATURE)

In [41]:
# The stored one-shot example (human message plus the two hand-written answers).
stored_example = templates_new['FALTER_20151111C499C672CF'][0]

# System and human prompts for the two query-generation chains.
system_message_prompt_gen = SystemMessagePromptTemplate.from_template(templates_gen['system_gen'])
system_message_prompt_spe = SystemMessagePromptTemplate.from_template(templates_gen['system_spe'])
human_message_prompt = HumanMessagePromptTemplate.from_template(templates_gen['human_temp'])

# NOTE(review): the example texts are passed through `from_template`, so curly
# braces inside the article text would presumably be read as placeholders -- confirm.
example_human = HumanMessagePromptTemplate.from_template(stored_example['human'])
example_ai_gen = AIMessagePromptTemplate.from_template(stored_example['ai_gen'])
example_ai_spe = AIMessagePromptTemplate.from_template(stored_example['ai_spec'])


In [42]:
# Message order for both chains: system prompt, example article, example answer,
# then the article for which a query should be generated.
general_messages = [system_message_prompt_gen, example_human, example_ai_gen, human_message_prompt]
specific_messages = [system_message_prompt_spe, example_human, example_ai_spe, human_message_prompt]

chat_prompt_gen = ChatPromptTemplate.from_messages(general_messages)
chain_gen = LLMChain(prompt=chat_prompt_gen, llm=chat)

chat_prompt_spe = ChatPromptTemplate.from_messages(specific_messages)
chain_spe = LLMChain(prompt=chat_prompt_spe, llm=chat)

In [43]:
# Generate one general and one specific query for each of 100 randomly chosen
# articles. Both chains are run on the same articles so that their results can
# be compared, and no article is selected twice to keep diversity.
#
# The guard is kept False on purpose: the cell calls the LLM twice per article
# and overwrites the stored query sets. Set it to True to regenerate them.
if False:
    N_EVAL_ARTICLES = 100

    # Draw row *labels* of `sorted_df` without replacement. The frame was
    # filtered before sorting, so its labels are not guaranteed to be the
    # contiguous range 0..len-1; sampling from the index itself guarantees that
    # every drawn label exists for the `.loc` lookup below.
    sampled_labels = random.sample(list(sorted_df.index), N_EVAL_ARTICLES)

    general_eval_set = {}
    spe_eval_set = {}

    for i, idx in enumerate(sampled_labels):
        article = sorted_df.loc[idx, :]
        article_id = article['article_id']

        # Keyword names follow the placeholders of the human template
        # (note: `resort`, `dachzeile`).
        article_fields = dict(
            title=article['title'],
            subtitle=article['subtitle'],
            text=article['paragraphs'],
            rubrik=article['rubrik'],
            resort=article['ressort'],
            general=article['general_topic'],
            dachzeile=article['dachzeile'],
        )

        result_gen = chain_gen.run(**article_fields)
        result_spe = chain_spe.run(**article_fields)

        # 'answer' is the id of the article the query was derived from.
        general_eval_set[i] = [{'prompt': result_gen, 'answer': article_id}]
        spe_eval_set[i] = [{'prompt': result_spe, 'answer': article_id}]

    # Write to the same directory the query sets are later loaded from.
    with open('prompt_templates/evals/general_prompts.json', 'w') as handle:
        json.dump(general_eval_set, handle)

    with open('prompt_templates/evals/specific_prompts.json', 'w') as handle:
        json.dump(spe_eval_set, handle)
        


In [45]:
# Load the stored general and specific query sets.
with open('prompt_templates/evals/general_prompts.json', 'r') as handle:
    general_eval_set = json.loads(handle.read())

with open('prompt_templates/evals/specific_prompts.json', 'r') as handle:
    spe_eval_set = json.loads(handle.read())
        

## Part 2: Query Evaluation and Refinement

While the queries generated in the previous section serve as a good starting point, merely evaluating recommendations based on the match between the article IDs of the base articles can be overly restrictive. This is due to the fact that articles might cover similar topics, making them relevant recommendations nonetheless. Hence, we designed a language model pipeline that takes a query and the recommended news article and assesses the relevance of the news article to the query. We developed two evaluation chains: one lenient and one stricter. These chains allow us to gauge the quality of the generated queries. The general queries achieved a relevance score of 88% with the normal (stricter) evaluation chain and 99% with the relaxed one. Similarly, the specific queries achieved relevance scores of 97% and 98%, respectively.


In [46]:
# New chat model with temperature 0: for the evaluation the LLM should stick to
# its instructions rather than be creative. This rebinds `chat`.
EVALUATION_TEMPERATURE = 0
chat = ChatOpenAI(temperature=EVALUATION_TEMPERATURE)

In [47]:
# Prompt templates for the evaluation chains. The texts are sent to the LLM
# verbatim, so any edit to them (including fixing their typos) changes what the
# model sees. Each system template ends with a {format_instructions} placeholder.

# Strict ("normal") system prompt.
system_template = """You are given the information of a news article and a user prompt. 
You are part of the evaluation system of a recommender system for news articles based on user prompts.
More specific the recoomendation system is trying to match user prompts to news articles based on the topics that the user is looking for.
Your task is to decide if the recommended article matches the user prompt in terms of the topics that the prompt is looking for.
The input information is given in german. 
The different inputs of the news article are seperated by: ####
Missing values are indicated by the string: nan

{format_instructions}"""

# Relaxed system prompt, first variant: a remote topical match is enough.
system_template_relaxed = """You are given the information of a news article and a user prompt. 
You are part of the evaluation system of a recommender system for news articles based on user prompts.
More specific the recoomendation system is trying to match user prompts to news articles based on the topics that the user is looking for.
Your task is to decide if the recommended article matches the user prompt in terms of the topics. In your decision process you are already satisfied if the topics the prompt is looking for are only remotly present in the news article.
The input information is given in german. 
The different inputs of the news article are seperated by: ####
Missing values are indicated by the string: nan

{format_instructions}"""

# Relaxed system prompt, second variant: a slight topical match is enough.
system_template_relaxed_2 = """You are given the information of a news article and a user prompt. 
You are part of the evaluation system of a recommender system that matches news articles based on user prompts based on topics.
Your task is to decide if the recommended article matches the user prompt in terms of the topics that both handle. 
In your decision process you are already not strict and are already satisfied if the topics the prompt is looking for are only slightly present in the news article.
The input information is given in german. 
The different inputs of the news article are seperated by: ####
Missing values are indicated by the string: nan

{format_instructions}"""



# Human message carrying the article fields and the query to judge. Placeholders:
# title, text, general, subtitle, dachzeile, rubrik, resort, prompt.
# This rebinds `human_template`, which was used for query generation earlier.
# NOTE(review): the system prompts announce "####" as the separator, but the
# supertitle section has no opening "####" and is closed with "###", and the
# rubrik/resort and prompt sections have no opening "####" -- confirm whether
# that is intended before changing it.
human_template = """ The following is a news article and the corrpsoning prompt. All the information is given in german.
The title of the article:
####
{title}
####
The text of the article:
####
{text}
####
The general classification given by the dataset:
####
{general}
####
The subtitle of the article:
####
{subtitle}
####
The supertitle of the article:
{dachzeile}
###
The rubrik and resort of the article:
{rubrik}, {resort}
####
Finally the propmt:
{prompt}
####
"""

# Descriptions of the single "relevance" output field, one per system prompt.
# The backslash continuations keep the indentation of the following lines
# inside the string.
description = "Matches the recommended news article the given prompt in terms of topics of the news article that the user prompt is looking for  ? \
                            Answer True if yes,\
                            False if not or unknown."

description_relaxed = "Matches the recommended news article the given prompt in terms of topics of the news article that the user prompt is looking for in the slightest sense? \
                            Answer True if yes,\
                            False if not or unknown."

# Same text as `description_relaxed`.
description_relaxed_2 = "Matches the recommended news article the given prompt in terms of topics of the news article that the user prompt is looking for in the slightest sense? \
                            Answer True if yes,\
                            False if not or unknown."

# Bundle all evaluation templates under the keys used when they are stored as JSON.
templates = {'system': system_template, 'system_relaxed':system_template_relaxed, 'system_relaxed_2':system_template_relaxed_2, 'human':human_template, 'schema': description, 'schema_relaxed' : description_relaxed, 'schema_relaxed_2' : description_relaxed_2}


In [50]:
# One-off export of the evaluation templates. The guard is kept False so that
# running the notebook does not overwrite the stored JSON file.
if False:
    with open('prompt_templates/evals/eval_templates.json', 'w') as handle:
        handle.write(json.dumps(templates))

In [51]:
# Read the stored evaluation templates; this rebinds `templates`.
with open('prompt_templates/evals/eval_templates.json', 'r') as handle:
    templates = json.loads(handle.read())

In [54]:
# The response schema makes the LLM output uniform, so the parser can simply
# return a dictionary with a single "relevance" entry.

# --- strict ("normal") evaluation ---
relevance_schema = ResponseSchema(name="relevance", description=templates['schema'])
response_schemas = [relevance_schema]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
system_message_prompt = SystemMessagePromptTemplate.from_template(templates['system'])

# --- relaxed evaluation (uses the "_2" variants of prompt and schema) ---
relevance_schema_rel = ResponseSchema(name="relevance", description=templates['schema_relaxed_2'])
response_schemas_rel = [relevance_schema_rel]
output_parser_rel = StructuredOutputParser.from_response_schemas(response_schemas_rel)
format_instructions_rel = output_parser_rel.get_format_instructions()
system_message_prompt_rel = SystemMessagePromptTemplate.from_template(templates['system_relaxed_2'])

# Both chains share the same human message (article fields plus the query).
human_message_prompt = HumanMessagePromptTemplate.from_template(templates['human'])

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
chain = LLMChain(prompt=chat_prompt, llm=chat)

chat_prompt_rel = ChatPromptTemplate.from_messages([system_message_prompt_rel, human_message_prompt])
chain_rel = LLMChain(prompt=chat_prompt_rel, llm=chat)

#### Validation of the Prompts and the normal Chain

In [55]:
# Judge every general query against its own source article with the strict chain.
# Careful: costs a lot of money and takes about 10 minutes.
ovr_results_gen = []
for i in range(len(general_eval_set)):
    eval_case = general_eval_set[str(i)][0]
    prompt, id = eval_case['prompt'], eval_case['answer']

    # Row(s) of the article the query was generated from; the first match is used.
    matching_row = df[df['article_id'] == id]
    title, subtitle, text, rubrik, general, ressort, dach = (
        matching_row[column].values[0]
        for column in ('title', 'subtitle', 'paragraphs', 'rubrik',
                       'general_topic', 'ressort', 'dachzeile')
    )

    result = chain.run(
        format_instructions=format_instructions,
        title=title,
        subtitle=subtitle,
        text=text,
        rubrik=rubrik,
        resort=ressort,
        general=general,
        dachzeile=dach,
        prompt=prompt,
    )

    # Parsed answer as a dict, tagged with the article id.
    result_dict = output_parser.parse(result)
    result_dict['id'] = id
    ovr_results_gen.append(result_dict)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Mon, 03 Jul 2023 13:34:29 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7e0f7a00ad4b2fb9-VIE', 'alt-svc': 'h3=":443"; ma=86400'}.


In [56]:
# Percentage of general queries judged relevant by the strict chain.
# Only a real boolean True counts; any other parsed value is a miss.
total_count = len(ovr_results_gen)
true_count = sum(1 for entry in ovr_results_gen if entry['relevance'] is True)

percentage = (true_count / total_count) * 100

print(percentage)

88.0


In [59]:
# Judge every specific query against its own source article with the strict chain.
# Careful: costs a lot of money.
ovr_results_spe = []
for i in range(len(spe_eval_set)):
    eval_case = spe_eval_set[str(i)][0]
    prompt, id = eval_case['prompt'], eval_case['answer']

    # Row(s) of the article the query was generated from; the first match is used.
    matching_row = df[df['article_id'] == id]
    title, subtitle, text, rubrik, general, ressort, dach = (
        matching_row[column].values[0]
        for column in ('title', 'subtitle', 'paragraphs', 'rubrik',
                       'general_topic', 'ressort', 'dachzeile')
    )

    result = chain.run(
        format_instructions=format_instructions,
        title=title,
        subtitle=subtitle,
        text=text,
        rubrik=rubrik,
        resort=ressort,
        general=general,
        dachzeile=dach,
        prompt=prompt,
    )

    # Parsed answer as a dict, tagged with the article id.
    result_dict = output_parser.parse(result)
    result_dict['id'] = id
    ovr_results_spe.append(result_dict)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


In [60]:
# Percentage of specific queries judged relevant by the strict chain.
# Only a real boolean True counts; any other parsed value is a miss.
total_count = len(ovr_results_spe)
true_count = sum(1 for entry in ovr_results_spe if entry['relevance'] is True)

percentage = (true_count / total_count) * 100

print(percentage)

97.0


#### Validation with the Prompts and the Relaxed Chain

In [61]:
# Evaluate the whole general query set with the relaxed evaluation chain.
# Careful: costs a lot of money and takes about 10 minutes.
# Note: this rebinds `ovr_results_gen`, replacing the strict-chain results.
ovr_results_gen = []
for i in range(len(general_eval_set)):
    eval_case = general_eval_set[str(i)][0]
    prompt = eval_case['prompt']
    id = eval_case['answer']

    # Row(s) of the article the query was generated from; the first match is used.
    matching_row = df[df['article_id'] == id]

    title = matching_row['title'].values[0]
    subtitle = matching_row['subtitle'].values[0]
    text = matching_row['paragraphs'].values[0]
    rubrik = matching_row['rubrik'].values[0]
    general = matching_row['general_topic'].values[0]
    ressort = matching_row['ressort'].values[0]
    dach = matching_row['dachzeile'].values[0]

    result = chain_rel.run(format_instructions = format_instructions_rel, title = title, subtitle = subtitle,text=text, rubrik = rubrik, resort=ressort, general=general, dachzeile=dach, prompt = prompt)

    # Parse with the relaxed parser, i.e. the one whose format instructions
    # were sent to the model for this chain.
    result_dict = output_parser_rel.parse(result)
    result_dict['id'] = id
    ovr_results_gen.append(result_dict)

In [62]:
# Percentage of general queries judged relevant in the latest evaluation run.
# Only a real boolean True counts; any other parsed value is a miss.
total_count = len(ovr_results_gen)
true_count = sum(1 for entry in ovr_results_gen if entry['relevance'] is True)

percentage = (true_count / total_count) * 100

print(percentage)

99.0


In [63]:
# Evaluate the whole specific query set with the relaxed evaluation chain.
# Careful: costs a lot of money.
# Note: this rebinds `ovr_results_spe`, replacing the strict-chain results.
ovr_results_spe = []
for i in range(len(spe_eval_set)):
    eval_case = spe_eval_set[str(i)][0]
    prompt = eval_case['prompt']
    id = eval_case['answer']

    # Row(s) of the article the query was generated from; the first match is used.
    matching_row = df[df['article_id'] == id]
    title = matching_row['title'].values[0]
    subtitle = matching_row['subtitle'].values[0]
    text = matching_row['paragraphs'].values[0]
    rubrik = matching_row['rubrik'].values[0]
    general = matching_row['general_topic'].values[0]
    ressort = matching_row['ressort'].values[0]
    dach = matching_row['dachzeile'].values[0]

    result = chain_rel.run(format_instructions = format_instructions_rel, title = title, subtitle = subtitle,text=text, rubrik = rubrik, resort=ressort, general=general, dachzeile=dach, prompt = prompt)

    # Parse with the relaxed parser, i.e. the one whose format instructions
    # were sent to the model for this chain.
    result_dict = output_parser_rel.parse(result)
    result_dict['id'] = id
    ovr_results_spe.append(result_dict)

In [64]:
# Percentage of specific queries judged relevant in the latest evaluation run.
# Only a real boolean True counts; any other parsed value is a miss.
total_count = len(ovr_results_spe)
true_count = sum(1 for entry in ovr_results_spe if entry['relevance'] is True)

percentage = (true_count / total_count) * 100

print(percentage)

98.0
