In [3]:
from typing import Dict, List, Optional
from io import StringIO
import csv
import requests
import os
import itertools
import tiktoken
import openai
import pandas as pd
from dotenv import load_dotenv
import ollama

# from utilities import num_token_from_messages, memoize_sqlite

from helpers.notebook.embeddings import (
    get_embedding,
    embedding_from_text,
    distances_from_embeddings,
    indices_of_nearest_neighbors_from_distances
)

from helpers.notebook.defaults import (
    DATA_PATH,
    INPUT_FILE,
    OUTPUT_FILE,
    INPUT_PATH,
    EMBEDDING_MODEL,
)


loading data/cache.pkl...
loading data/cache.pkl...


In [4]:
# Load variables from a .env file into the process environment before any
# client is constructed (presumably so openai.OpenAI() can pick up its
# credentials from the environment -- confirm).
load_dotenv()
# Both clients are built with library defaults; no explicit configuration is passed here.
client = openai.OpenAI()
# NOTE(review): the name suggests this Ollama client is meant for embeddings -- confirm against the helpers.
embed_client = ollama.Client()


In [5]:
# Overall size budget for a single model call -- presumably counted in tokens; TODO confirm.
MAX_CONTEXT_WINDOW = 4096
# Part of the window kept free for the model's reply (inferred from the name; same unit as above).
MINIMUM_RESPONSE_SPACE = 1000
# Largest prompt that still leaves MINIMUM_RESPONSE_SPACE available: 4096 - 1000 = 3096.
MAX_PROMPT_SIZE = MAX_CONTEXT_WINDOW - MINIMUM_RESPONSE_SPACE

In [6]:
from pathlib import Path


def load_csv(filename: str, path: str = DATA_PATH):
    """Read a CSV file into a pandas DataFrame.

    ``filename`` is joined onto ``path`` (``DATA_PATH`` unless overridden) and
    the resulting location is passed to ``pd.read_csv`` with default options.

    Args:
        filename: Name of the CSV file, relative to ``path``.
        path: Directory that contains the file.

    Returns:
        Whatever ``pd.read_csv`` produces for that file.
    """
    csv_location = Path(path) / filename
    return pd.read_csv(csv_location)

In [7]:
def wikipedia_api_fetch(
        title:str,
        field:str,
        timeout:float=10.0
) -> str:
    """Fetch a single field of a Wikipedia page via the MediaWiki query API.

    Issues an ``action=query`` / ``prop=extracts`` / ``explaintext`` request
    for ``title`` against the English Wikipedia endpoint and returns ``field``
    from the first page object in the response.

    Args:
        title: Page title, sent as the ``titles`` query parameter.
        field: Key to read from the returned page object (this notebook asks
            for ``'extract'`` and ``'title'``).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the kernel indefinitely.

    Returns:
        The value stored under ``field`` in the page object.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response contains no page object (for example an
            API error payload), or the page object has no ``field`` key.
    """
    base_url = 'https://en.wikipedia.org/w/api.php'

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'titles': title,
        'explaintext': True
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    data = response.json()

    pages = data.get('query', {}).get('pages') if isinstance(data, dict) else None
    if not pages:
        # Previously the raw payload was returned here, which broke the
        # declared ``str`` return type and let API error dicts leak into
        # callers' data. Raise the same exception type as the missing-field
        # case so callers handle one failure mode.
        raise ValueError(f'Unexpected Wikipedia API response for {title!r}: {data}')

    # The API keys pages by page id; a single title yields a single entry.
    page:dict = next(iter(pages.values()))
    if field not in page:
        raise ValueError(f'Could not find {field} for page {page}')
    return page[field]


In [11]:
def build_df_from_wikipedia(df: pd.DataFrame) -> pd.DataFrame:
    """Add Wikipedia text and titles to a frame of page links.

    For every value in the ``Link`` column, ``wikipedia_api_fetch`` is called
    once for the ``'extract'`` field and once for the ``'title'`` field. The
    results are written to new ``page_content`` and ``display_title`` columns.

    Note that ``df`` is modified in place; the same object is returned.

    Args:
        df: Frame with a ``Link`` column of Wikipedia page titles.

    Returns:
        The input frame, now carrying the two additional columns.
    """
    links = df['Link']

    def fetch_column(field: str) -> pd.Series:
        # One API call per link for the requested field.
        return links.apply(lambda link: wikipedia_api_fetch(link, field))

    df['page_content'] = fetch_column('extract')
    df['display_title'] = fetch_column('title')

    return df

In [13]:
# Read f1_2022.csv from the default data directory and enrich it with Wikipedia
# page text and titles. This triggers live HTTP requests (two per row); as the
# cell's last expression, the resulting frame is displayed below.
build_df_from_wikipedia(load_csv('f1_2022.csv'))

Unnamed: 0,Link,page_content,display_title
0,2022_Formula_One_World_Championship,The 2022 FIA Formula One World Championship wa...,2022 Formula One World Championship


In [None]:
def print_recommendations_from_plot(
        strings:list[str],
        plot:str,
        k_nearest_neighbors:int=3,
        model:str=EMBEDDING_MODEL
):
    """Rank ``strings`` by embedding distance to ``plot``.

    Each entry of ``strings`` and the ``plot`` text are embedded with
    ``embedding_from_text`` (taking the ``'embedding'`` key of its result).
    Distances from the query embedding to every string embedding are then
    computed and turned into an index ordering by the imported helpers.

    Args:
        strings: Candidate texts to compare against the query.
        plot: Query text.
        k_nearest_neighbors: Number of indices kept in ``near_k_indexes``.
        model: Embedding model name forwarded to ``embedding_from_text``.

    NOTE(review): the visible body computes ``near_k_indexes`` but neither
    prints nor returns it -- the function looks unfinished; confirm the
    intended output.
    """
    # One embedding per candidate string (one helper call each).
    embeddings = [embedding_from_text(text, model=model)['embedding'] for text in strings]
    # query_embedding = embeddings[index_of_source_strings]
    # The query is embedded separately; it is not taken from `strings`.
    query_embedding = embedding_from_text(plot, model=model)['embedding'] 
    # Distance metric is whatever the helper defaults to -- not visible here.
    distances = distances_from_embeddings(query_embedding, embeddings)
    # distances = [float(distance) for distance in distances]
    # Presumably returns indices ordered from nearest to farthest -- verify in helpers.
    indexes = indices_of_nearest_neighbors_from_distances(distances)
    # NOTE(review): the slice starts at 1, skipping the first-ranked index. That
    # fits the commented-out variant where the query was itself one of `strings`;
    # with a separate `plot` query it may drop the best match -- confirm.
    near_k_indexes = indexes[1:1+k_nearest_neighbors]
    # return [list(CACHE.values())[x]['title'] for x in near_k_indexes]