## Search "ECB guide to internal models"

**A. Get Questions and Embeddings**:
- Load ECB Guide embeddings from a pickle file.
- Convert string embeddings to numpy arrays.
- Import questions from an Excel file.

**B. Helper Functions**:
- `search_docs`: Search documents and rank them based on cosine similarity of embeddings.
- `create_embedding`: Generate an embedding for a search phrase.
- `test_answers`: Test the questions against the embeddings to get relevant answers.
- `display_rows`: Display specified rows from a DataFrame in a given format.

**C. Conduct Testing**:
- Run the `test_answers` function and display the top results.

**D. Perform Manual Testing**:
- Conduct manual testing for specific queries and display the top 2 results.

In [1]:
import pandas as pd
from tqdm import tqdm
import openai
import time
import tiktoken
from openai.embeddings_utils import get_embedding, cosine_similarity
import ast
import numpy as np
import matplotlib.pyplot as plt


# Settings
# Enable tqdm's pandas integration.
tqdm.pandas()

# Remove pandas' display limits so column contents, rows and columns are shown in full.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
pd.options.display.max_columns = None

#### A. Get questions and embeddings

In [2]:
# Load the ECB Guide passages together with their stored embeddings
embeddings = pd.read_pickle("ecb_guide_embeddings.pkl")
# The embeddings are stored as strings: parse each one and wrap it in a numpy array
embeddings['embedding'] = embeddings['embedding'].map(
    lambda raw: np.array(ast.literal_eval(raw)))

# Load the testing questions
questions = pd.read_excel("questions_ecb_extended.xlsx")

# Keep only the questions flagged as in scope
idx = questions['in_scope'] == 1
questions = questions.loc[idx]

#### B. Helper functions

In [3]:
def search_docs(df, search_phrase):
    """
    Rank the rows of `df` by embedding similarity to `search_phrase`.

    Parameters:
    - df (pd.DataFrame): Documents to search; must contain an 'embedding'
      column (one vector per row) and an 'Index' column
    - search_phrase (str): Query text that is embedded and compared to each row

    Returns:
    - tuple: (ranked, first_value). `ranked` is a new DataFrame with a
      'similarity' column, sorted from most to least similar and re-indexed
      from 0 (reset_index keeps the previous index as a column).
      `first_value` is the 'Index' value of the best match, or None when
      `df` has no rows.
    """
    search_embedding = create_embedding(search_phrase)

    # assign() returns a new frame, so the caller's DataFrame is not modified
    # as a side effect of running a search.
    scored = df.assign(
        similarity=df["embedding"].apply(
            lambda x: cosine_similarity(x, search_embedding)))
    ranked = scored.sort_values(by='similarity', ascending=False).reset_index()

    # An empty frame has no best match; report None instead of raising IndexError.
    first_value = ranked['Index'].iloc[0] if len(ranked) else None

    return ranked, first_value

def create_embedding(search_phrase, engine="text-embedding-ada-002"):
    """
    Generate an embedding for a search phrase.

    Parameters:
    - search_phrase (str): The text to embed
    - engine (str): Name of the embedding model passed to `get_embedding`;
      defaults to the model this notebook has always used

    Returns:
    - The embedding returned by `get_embedding` for `search_phrase`
    """
    return get_embedding(search_phrase, engine=engine)

def test_answers(embeddings, questions):
    """
    Measure where the expected document ranks for every test question.

    Each question is searched with `search_docs`, and the 1-based position of
    the first result whose 'Index' equals the question's own 'Index' is
    recorded.

    Parameters:
    - embeddings (pd.DataFrame): Documents, passed through to `search_docs`
    - questions (pd.DataFrame): Must contain 'question' and 'Index' columns

    Returns:
    - pd.DataFrame: One row per question with the columns 'Question',
      'question_number', 'first_value' (the 'Index' of the best match),
      'top_result' (1-based rank of the expected document, or 10000 when it
      does not appear in the results) and 'total_documents'
    """
    # Rank recorded when the expected document is not among the results
    not_found_rank = 10000
    columns = ['Question', 'question_number', 'first_value',
               'top_result', 'total_documents']

    results_list = []

    for query, question_number in zip(questions['question'], questions['Index']):
        df, first_value = search_docs(embeddings, query)

        # Position of the first matching row, converted to a 1-based rank.
        # Working on the raw array makes this independent of the frame's index labels.
        mask = (df['Index'] == question_number).to_numpy()
        top_result = int(mask.argmax()) + 1 if mask.any() else not_found_rank

        results_list.append({
            'Question': query,
            'question_number': question_number,
            'first_value': first_value,
            'top_result': top_result,
            'total_documents': len(df)
        })

    # Naming the columns keeps the schema intact when there are no questions;
    # without it an empty result has no 'top_result' column and the cast fails.
    results_df = pd.DataFrame(results_list, columns=columns)
    results_df['top_result'] = results_df['top_result'].astype(int)

    return results_df

def display_rows(df, top=3):
    """
    Print the first `top` rows of the DataFrame in a readable format.

    For each row the 'full_label' value is printed as the source, followed by
    the 'checked_sentence' text formatted with `format_text`. Consecutive rows
    are separated by a line of dashes.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data; must have
      'full_label' and 'checked_sentence' columns
    - top (int): Maximum number of leading rows to display (default 3)

    """
    # Never request more rows than exist: df.iloc would raise IndexError.
    row_count = min(top, len(df))

    for position in range(row_count):
        row_data = df.iloc[position]

        # Display the data as specified
        print(f"Source: {row_data['full_label']}\n")
        print(f"{format_text(row_data['checked_sentence'])}\n")

        # Print separator if not the last row
        if position < row_count - 1:
            print("-" * 10)


def format_text(text, width=100):
    """
    Wrap `text` into lines and start a new paragraph at every list marker.

    The text is split on whitespace and rebuilt word by word. A word is added
    to the current line while the line (including its trailing space) plus
    the word is at most `width` characters long; otherwise the line is closed
    and the word starts the next one. A word that is exactly a list marker
    such as "(a)", "(ii)" or "•" always closes the current line and starts a
    new one after a blank line.

    Parameters:
    - text (str): The text to format
    - width (int): Line-length limit used for wrapping (default 100)

    Returns:
    - str: The formatted text; every word, including the last one, is
      followed by a single space
    """
    # Built once per call rather than once per word
    special_markers = frozenset({
        "(a)", "(b)", "(c)", "(d)", "(e)", "(f)", "(g)", "(h)", "(i)", "(j)",
        "(ii)", "(iii)", "(iv)", "(v)", "(vi)",
        "(vii)", "(viii)", "(ix)", "(x)", "(xi)", "(xii)",
        "•",
    })

    finished = []  # completed lines, each already ending in its line break(s)
    line = ""

    for word in text.split():
        if word in special_markers:
            # Close the current line and leave a blank line before the marker
            finished.append(line + "\n\n")
            line = word + " "
        elif len(line) + len(word) <= width:
            line += word + " "
        else:
            # The word does not fit: close the line and start a new one
            finished.append(line + "\n")
            line = word + " "

    # Add the last line to the formatted text
    finished.append(line)

    return "".join(finished)

def frequency_analysis(places):
    """
    Print how many ranks fall within the top 1, 5, 10 and 1000 positions.

    Parameters:
    - places (pd.Series or np.ndarray of int): 1-based ranks, e.g. the
      'top_result' column produced by `test_answers`
    """
    top_1 = int((places == 1).sum())
    top_5 = int((places <= 5).sum())
    top_10 = int((places <= 10).sum())
    # The cutoff matches the printed label. A cutoff of 10000 would also count
    # the rank 10000 that `test_answers` records for documents that were not found.
    top_1000 = int((places <= 1000).sum())

    print(f"Occurrences in Top 1: {top_1}")
    print(f"Occurrences in Top 5: {top_5}")
    print(f"Occurrences in Top 10: {top_10}")
    print(f"Occurrences in Top 1000: {top_1000}")

#### C. Conduct testing

In [4]:
# Compute the results
test_results = test_answers(embeddings, questions)

# Add stats
test_results['Rank/Total'] = (
    test_results['top_result'].astype(str) + " / " + test_results['total_documents'].astype(str)
)
test_results['Top_10'] = test_results['top_result'].apply(lambda x: '✔️' if x <= 10 else '❌')

# Report the actual number of documents instead of a hard-coded count, so the
# header stays correct when the embeddings file changes.
print(f'     Document Retrieval Performance: finding relevant document out of {len(embeddings)} options')
# Sampling as many rows as the frame has shows every question in random order
test_results[['Question', 'Rank/Total', 'Top_10']].sample(len(test_results))

     Document Retrieval Performance: finding relevant document out of 1121 options


ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
# Print how many questions had their expected document ranked within each top-N cutoff
frequency_analysis(test_results['top_result'])

#### D. Perform manual testing

In [None]:
# Test 1: rank the documents for one hand-written query and show the two best matches
query = "What does the term 'initial validation' refer to?"
df, first_value = search_docs(df=embeddings, search_phrase=query)

display_rows(df, top=2)

In [None]:
# Test 2: rank the documents for one hand-written query and show the two best matches
query = 'what are reference dates for EAD/CCF modelling?'
df, first_value = search_docs(df=embeddings, search_phrase=query)

display_rows(df, top=2)