# imports

In [13]:
from typing import Dict, List, Optional
from io import StringIO
import csv
import requests
import os
import itertools
import tiktoken
import openai
import pandas as pd
from dotenv import load_dotenv
import ollama
import logging
from pathlib import Path

# from utilities import num_token_from_messages, memoize_sqlite

from helpers.notebook.cache import memoize_to_sqlite

from helpers.notebook.embeddings import (
    get_embedding,
    embedding_from_text,
    distances_from_embeddings,
    indices_of_nearest_neighbors_from_distances
)

from helpers.notebook.defaults import (
    DATA_PATH,
    INPUT_FILE,
    OUTPUT_FILE,
    INPUT_PATH,
    EMBEDDING_MODEL,
)


# create api clients

In [14]:
# Load variables from a .env file into the process environment before any
# client is built (presumably this is where the API credentials come from —
# confirm).
load_dotenv()
# OpenAI API client, constructed with default settings.
client = openai.OpenAI()
# Ollama client, constructed with default settings; the name suggests it is
# meant for embedding requests.
embed_client = ollama.Client()


# constants

In [15]:
# Total size of the model context window (presumably counted in tokens —
# confirm against the model actually used).
MAX_CONTEXT_WINDOW = 4096
# Part of the window that is kept free for the model's response.
MINIMUM_RESPONSE_SPACE = 1000
# Upper bound for the prompt: whatever is left after reserving response space.
MAX_PROMPT_SIZE = MAX_CONTEXT_WINDOW - MINIMUM_RESPONSE_SPACE

# functions

## load_csv

In [16]:
def load_csv(filename:str, path:str=DATA_PATH):
    """Read a CSV file into a pandas DataFrame.

    Args:
        filename: Name of the CSV file.
        path: Directory the file lives in; defaults to ``DATA_PATH``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for that file.
    """
    csv_location = Path(path) / filename
    return pd.read_csv(csv_location)

## wikipedia_api_fetch

In [17]:
@memoize_to_sqlite('cache.db')
def wikipedia_api_fetch(
        title:str,
        field:str
) -> str:
    """Fetch one field of an English Wikipedia page via the MediaWiki query API.

    The request asks for the plain-text ``extracts`` property of the page
    named ``title`` and returns the value stored under ``field`` in the first
    page object of the response (e.g. ``'extract'`` or ``'title'``).

    Args:
        title: Page title sent as the ``titles`` query parameter.
        field: Key to read from the returned page object.

    Returns:
        The value of ``field`` for the first page in the response.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer in time.
        ValueError: If the response contains no pages, or the page has no
            ``field`` key (for example when the page does not exist).
    """
    base_url = 'https://en.wikipedia.org/w/api.php'

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'titles': title,
        'explaintext': True
    }

    # Bound the wait so a stalled connection cannot hang the caller forever.
    response = requests.get(base_url, params=params, timeout=30)
    # Surface HTTP failures as exceptions instead of trying to parse an error
    # body (and instead of returning it as if it were a valid result).
    response.raise_for_status()
    data = response.json()

    pages = data.get('query', {}).get('pages')
    if not pages:
        # Previously the raw response dict was returned here, which violated
        # the declared ``str`` return type and leaked into downstream columns.
        raise ValueError(f'Unexpected response for title {title!r}: {data}')

    # The API keys pages by page id; only one title is requested, so the first
    # entry is the page of interest.
    page: dict = next(iter(pages.values()))
    if field not in page:
        raise ValueError(f'Could not find {field} for page {page}')
    return page[field]


## build_df_from_wikipedia

In [18]:
def build_df_from_wikipedia(df:pd.DataFrame) -> pd.DataFrame:
    """Add Wikipedia content columns to ``df`` based on its ``Link`` column.

    For every value in ``df['Link']`` the page is looked up with
    ``wikipedia_api_fetch`` and two columns are written on ``df`` itself:
    ``page_content`` (the ``extract`` field) and ``title`` (the ``title``
    field).

    Args:
        df: Frame with a ``Link`` column; it is modified in place.

    Returns:
        The same ``df`` object, with the two columns added.
    """
    # (target column, API field) pairs, filled in this order.
    column_to_field = (
        ('page_content', 'extract'),
        ('title', 'title'),
    )
    for column, api_field in column_to_field:
        # Bind the field as a default so each lambda keeps its own value.
        df[column] = df['Link'].apply(
            lambda link, wanted=api_field: wikipedia_api_fetch(link, wanted)
        )

    return df

In [None]:
import re


def wikipedia_splitter(contents:str, title:str, split_regexes:list[str], token_limit:int|None=None):
    regex = split_regexes[0]
    sections = re.split(regex, contents)
    
    if not sections[0].strip():
        sections.pop(0)
    else:
        first_section = sections.pop(0)
        

## pseudo main

In [55]:
# Show INFO-level log records, e.g. the cache helper's "found result" messages.
logging.basicConfig(level=logging.INFO)

# Load the input table from the default data directory used by load_csv.
df = load_csv('f1_2022.csv')


In [None]:
# Fetch the Wikipedia extract and title for every row; this writes the
# 'page_content' and 'title' columns onto df itself.
build_df_from_wikipedia(df)

# \s in regex matches any whitespace character (spaces, tabs, line breaks)
# split_regexes = [r'\n=+\s', r'\n+']
# Patterns ordered from coarsest to finest: presumably the ==, === and ====
# heading markers of the plain-text extract, then blank lines, then single
# newlines — confirm against real page content.
split_regexes = [r'\n==\s', r'\n===\s', r'\n====\s', r'\n\n', r'\n']

# sections = []
# for index, row in df.iterrows():
#     page_content = row['page_content']
#     title = row['title']
#     for section in wikipedia_splitter(page_content, title, split_regexes):
#         sections.append(section)
# For now only the first row is used while the splitter is being developed.
title, content = tuple(df.iloc[0][['title', 'page_content']])


INFO:helpers.notebook.cache:found result returning cached value
INFO:helpers.notebook.cache:found result returning cached value


In [None]:
sections = wikipedia_splitter(content, title, split_regexes)

# Preview: the first 20 characters of every non-empty section.
[text[:20] for text in sections if text]

# ignore for now

In [None]:
def print_recommendations_from_plot(
        strings:list[str],
        plot:str,
        k_nearest_neighbors:int=3,
        model:str=EMBEDDING_MODEL
):
    """Rank ``strings`` by embedding distance to ``plot`` (work in progress).

    Each entry of ``strings`` and the query text ``plot`` are embedded with
    ``embedding_from_text``; distances between the query embedding and the
    candidate embeddings are computed and turned into an index ordering, from
    which ``k_nearest_neighbors`` indices are selected.

    NOTE(review): in the visible code the selected indices are neither printed
    nor returned (the only return statement is commented out), so the function
    currently produces no output — confirm whether that is intended.

    Args:
        strings: Candidate texts to compare against the query.
        plot: Query text.
        k_nearest_neighbors: Number of indices to select.
        model: Embedding model name passed to ``embedding_from_text``.
    """
    # One embedding per candidate text; the helper's result is indexed by 'embedding'.
    embeddings = [embedding_from_text(text, model=model)['embedding'] for text in strings]
    # query_embedding = embeddings[index_of_source_strings]
    # The query is embedded on its own rather than taken from `strings`.
    query_embedding = embedding_from_text(plot, model=model)['embedding'] 
    distances = distances_from_embeddings(query_embedding, embeddings)
    # distances = [float(distance) for distance in distances]
    indexes = indices_of_nearest_neighbors_from_distances(distances)
    # NOTE(review): the slice starts at 1, which skips the first index. That
    # looks like a leftover from when the query was itself one of `strings`
    # (see the commented-out line above); with a separate query it presumably
    # drops the best match — confirm.
    near_k_indexes = indexes[1:1+k_nearest_neighbors]
    # return [list(CACHE.values())[x]['title'] for x in near_k_indexes]