## Search "ECB guide to internal models"

1. **A. Get Questions and Embeddings**
    - Load ECB Guide embeddings from a pickle file.
    - Convert string embeddings to numpy arrays.
    - Import questions from an Excel file.

2. **B. Helper Functions**
    - `search_docs`: Search documents and rank them based on cosine similarity of embeddings.
    - `create_embedding`: Generate an embedding for a search phrase.
    - `test_answers`: Test the questions against the embeddings to get relevant answers.
    - `display_rows`: Display specified rows from a DataFrame in a given format.

3. **C. Conduct Testing**
    - Run the `test_answers` function and display the top results.

4. **D. Perform Manual Testing**
    - Conduct manual testing for specific queries and display the top 3 results.

In [None]:
import pandas as pd
from tqdm import tqdm
import openai
import time
import tiktoken
from openai.embeddings_utils import get_embedding, cosine_similarity
import ast
import numpy as np
import matplotlib.pyplot as plt


# Settings
# Enable `progress_apply` on pandas objects and lift every display limit so
# printed DataFrames are never truncated.
tqdm.pandas()
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

#### A. Get questions and embeddings

In [None]:
# Load the ECB Guide paragraphs together with their precomputed embeddings.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
embeddings = pd.read_pickle("ecb_guide_embeddings.pkl")

# Each embedding is stored as a string literal; parse it and convert it to a
# numpy array in a single pass.
embeddings['embedding'] = embeddings['embedding'].apply(
    lambda raw_embedding: np.array(ast.literal_eval(raw_embedding)))

# Load the testing questions.
questions = pd.read_excel("questions_ecb.xlsx")

#### B. Helper functions

In [None]:
def search_docs(df, search_phrase, top=10):
    """
    Rank the rows of a DataFrame by cosine similarity to a search phrase.

    Parameters:
    - df (pd.DataFrame): Must contain an 'embedding' column and a
      'paragraph_number' column. A 'similarity' column is added to it
      (or overwritten) in place.
    - search_phrase (str): Text that is embedded via `create_embedding`.
    - top (int): Number of best-matching rows to return.

    Returns:
    - best_answers (pd.DataFrame): The first `top` rows after sorting by
      'similarity' in descending order. Because of `reset_index()`, the
      rows are renumbered from 0 and the previous index becomes a column.
    - df (pd.DataFrame): The same object that was passed in, now carrying
      the 'similarity' column.
    - first_value: The 'paragraph_number' of the best match, or None when
      no rows were returned.
    """
    search_embedding = create_embedding(search_phrase)
    df["similarity"] = df["embedding"].apply(
        lambda x: cosine_similarity(x, search_embedding))
    best_answers = df.sort_values(by='similarity', ascending=False).reset_index().head(top)

    # With an empty `df` (or top=0) there is no best match; `.iloc[0]` would
    # raise IndexError, so report None instead.
    if len(best_answers) > 0:
        first_value = best_answers['paragraph_number'].iloc[0]
    else:
        first_value = None

    return best_answers, df, first_value

def create_embedding(search_phrase, engine="text-embedding-ada-002"):
    """
    Generate an embedding for a search phrase.

    Parameters:
    - search_phrase (str): The text to embed.
    - engine (str): Name of the embedding model forwarded to `get_embedding`.
      Defaults to the model this notebook has always used.

    Returns:
    - Whatever `get_embedding` returns for the phrase (presumably a list of
      floats - confirm against the installed openai version).
    """
    return get_embedding(search_phrase, engine=engine)

def test_answers(embeddings, questions):
    """
    Run every question through `search_docs` and collect the outcome.

    Parameters:
    - embeddings (pd.DataFrame): Documents to search; passed to `search_docs`
      unchanged (which adds a 'similarity' column to it in place).
    - questions (pd.DataFrame): One row per question, with a 'Question'
      column (the query text) and a 'Number' column (the question id).

    Returns:
    - pd.DataFrame with the columns 'Query', 'question_number',
      'first_value' and 'first_index' (int). When `questions` has no rows,
      an empty DataFrame with these columns is returned.
    """
    result_columns = ['Query', 'question_number', 'first_value', 'first_index']
    results_list = []

    for _, row in questions.iterrows():
        query = row['Question']
        question_number = row['Number']
        best_answers, _, first_value = search_docs(embeddings, query, top=100)

        # Position of the first hit whose paragraph equals `first_value`;
        # 10000 is the sentinel for "not among the returned hits".
        # NOTE(review): `first_value` is the paragraph number of the top hit
        # itself, so this mask matches at position 0 for every question
        # (unless the value is NaN). Presumably the expected paragraph from
        # `questions` was meant to be compared here - confirm which column
        # holds it.
        mask = best_answers['paragraph_number'] == first_value
        first_index = mask.idxmax() if mask.any() else 10000

        results_list.append({
            'Query': query,
            'question_number': question_number,
            'first_value': first_value,
            'first_index': int(first_index)
        })

    # Passing the column names explicitly keeps the schema stable even when
    # no question was processed (otherwise 'first_index' would be missing).
    results_df = pd.DataFrame(results_list, columns=result_columns)

    # Every 'first_index' is already built with int(), so there are no NaN
    # values to fill; only the dtype needs to be pinned.
    results_df['first_index'] = results_df['first_index'].astype(int)

    return results_df

def display_rows(df, row_indices):
    """
    Print the source label and the sentence text of selected rows.

    Rows are printed in the order given, separated by a line of ten dashes.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data; must have the
      columns 'full_label' and 'checked_sentence'
    - row_indices (iterable of int): Positional (`iloc`) indices of the rows
      to display

    """
    # Materialise once so that any iterable (not only a list) has a length.
    row_indices = list(row_indices)
    last_position = len(row_indices) - 1

    for position, row_index in enumerate(row_indices):
        row_data = df.iloc[row_index]

        # The label and the sentence get distinct captions so the two lines
        # can be told apart in the output.
        print(f"Source: {row_data['full_label']}\n")
        print(f"Text: {row_data['checked_sentence']}\n")

        # Print separator if not the last row
        if position < last_position:
            print("-" * 10)

#### C. Conduct testing

In [None]:
# Run every test question against the embeddings and collect the results
test_results = test_answers(embeddings, questions)

# Preview the first rows of the results
test_results.head()

In [None]:
# Create a summary graph
# TODO: not implemented yet - this cell is a placeholder and does nothing.
pass

#### D. Perform manual testing

In [None]:
# Test 1
# Search the loaded `embeddings` DataFrame directly; no `df` variable exists
# before this cell runs, so passing `df` raised a NameError.
query = "What does the term 'initial validation' refer to?"
best_answers, df, first_value = search_docs(embeddings, query, top=3)

print(best_answers)

In [None]:
# Test 2
# Search the loaded `embeddings` DataFrame directly so this cell does not
# depend on a `df` variable left behind by another cell.
query = 'what are reference dates for EAD/CCF modelling?'
best_answers, df, first_value = search_docs(embeddings, query, top=3)

print(best_answers)