# Downloading the Aides-territoires (AT) database

In [1]:
from decouple import Config, RepositoryEnv
import requests
import pandas as pd
import numpy as np

In [17]:
# Small client class that makes it easier to talk to the Aides-territoires (AT) API
class APIClient:
    """Authenticated HTTP client for the Aides-territoires API.

    On construction the client exchanges the given X-AUTH-TOKEN for a bearer
    token and builds a `requests.Session` that sends that bearer token on
    every request.

    Args:
        x_auth_token: API key sent in the `X-AUTH-TOKEN` header of the login call.
        timeout: Timeout in seconds applied to every HTTP call. Without a
            timeout `requests` waits indefinitely on a stalled connection.
    """

    def __init__(self,x_auth_token,timeout=30):
        self.x_auth_token = x_auth_token
        self.timeout = timeout
        self.base_uri = self.get_api_base_url()
        self.bearer_token = self.get_bearer_token()
        self.session = self.create_session()

    def get_api_base_url(self):
        """Return the root URL of the API (with a trailing slash)."""
        return 'https://aides-territoires.beta.gouv.fr/api/'

    def get_bearer_token(self):
        """POST the X-AUTH-TOKEN to the login endpoint and return the bearer token.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        # Derive the login URL from the base URL so the host is defined only once.
        url = f'{self.get_api_base_url()}connexion/'
        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json',
            'X-AUTH-TOKEN': self.x_auth_token
        }
        response = requests.post(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        return response.json()['token']

    def create_session(self):
        """Build a session whose default headers carry the bearer token."""
        session = requests.Session()
        session.headers.update({
            'Authorization': f'Bearer {self.bearer_token}',
            'Accept': 'application/json'
        })
        return session

    def simple_request_endpoint(self, endpoint, method='GET'):
        """Request `endpoint` (a path relative to the base URL) and return the response.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        return self.simple_request_url(f'{self.base_uri}{endpoint}', method=method)

    def simple_request_url(self, url, method='GET'):
        """Request an absolute `url` with the authenticated session and return the response.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = self.session.request(method, url, timeout=self.timeout)
        response.raise_for_status()
        return response

### Collect perimeters data

In [82]:
# Read the API key from the local `.env` file so it is not hard-coded in the notebook.
config = Config(RepositoryEnv('.env'))
at_x_auth_token = config('X_AUTH_TOKEN')
    
# Usage: you need an X-AUTH-TOKEN provided by Aides-territoires.
client = APIClient(at_x_auth_token)

In [83]:
# Download every page of the perimeters endpoint by following the `next` links.
MAX_CONSECUTIVE_ERRORS = 10  # retry budget for a single page

response = client.simple_request_endpoint('perimeters/?page=1&itemsPerPage=100')
data = response.json()
data_collect = list(data['results'])
error_counter = 0
# Continue only while there is a next page AND the retry budget is not exhausted.
# The previous condition (`or error_counter > 10`) never stopped on repeated failures.
while data.get('next') and error_counter <= MAX_CONSECUTIVE_ERRORS:
    url = data['next']
    print(url)
    try:
        response = client.simple_request_url(url)
        page = response.json()
        page_results = page['results']
    except Exception as error:
        # Best-effort retry: `data` is left untouched so the same URL is tried again.
        print(error)
        error_counter += 1
    else:
        # Commit the page only once it has been fully validated, so a malformed
        # payload can never make the loop skip a page.
        data = page
        data_collect.extend(page_results)
        error_counter = 0

# Fail loudly instead of letting later cells export a truncated dataset.
if data.get('next'):
    raise RuntimeError(f"Gave up on {data['next']} after {error_counter} consecutive errors")

https://aides-territoires.beta.gouv.fr/api/perimeters/?page=2&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=3&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=4&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=5&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=6&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=7&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=8&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=9&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=10&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=11&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=12&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perimeters/?page=13&itemsPerPage=100
https://aides-territoires.beta.gouv.fr/api/perim

### Collect aid data

In [None]:
# Build the perimeters table from the collected records, keyed by the "id" field,
# and persist it as CSV.
perimeters_records = pd.DataFrame(data_collect)
data_at_perimeters = perimeters_records.set_index("id")
data_at_perimeters.to_csv('hard-database/perimeters-data.csv')

In [None]:
# Download every page of the aids endpoint by following the `next` links.
MAX_CONSECUTIVE_ERRORS = 10  # retry budget for a single page

response = client.simple_request_endpoint('aids/?page=1&itemsPerPage=100')
data = response.json()
data_collect = list(data['results'])
error_counter = 0
# Continue only while there is a next page AND the retry budget is not exhausted.
# The previous condition (`or error_counter > 10`) never stopped on repeated failures.
while data.get('next') and error_counter <= MAX_CONSECUTIVE_ERRORS:
    url = data['next']
    print(url)
    try:
        response = client.simple_request_url(url)
        page = response.json()
        page_results = page['results']
    except Exception as error:
        # Best-effort retry: `data` is left untouched so the same URL is tried again.
        print(error)
        error_counter += 1
    else:
        # Commit the page only once it has been fully validated, so a malformed
        # payload can never make the loop skip a page.
        data = page
        data_collect.extend(page_results)
        error_counter = 0

# Fail loudly instead of letting later cells export a truncated dataset.
if data.get('next'):
    raise RuntimeError(f"Gave up on {data['next']} after {error_counter} consecutive errors")


https://aides-territoires.beta.gouv.fr/api/aids/?page=2&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=3&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=4&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=5&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=6&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=7&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=8&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=9&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=10&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=11&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=12&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=13&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=14&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=15&it

In [4]:
# All collected aids, keyed by the "id" field.
data_at = pd.DataFrame(data_collect).set_index("id")

# Minimum description length (in characters) for an aid to be kept in the reduced selection.
sub_min_description = 2000

# Full selection: every aid that has a description.
data_at_select_full = data_at.dropna(subset='description')

# Reduced selection: only aids whose description is strictly longer than the threshold.
# NOTE(review): reset_index(drop=True) discards the "id" index, so the reduced CSV
# carries no aid identifier — confirm this is intended.
has_long_description = data_at_select_full['description'].map(len) > sub_min_description
data_at_select = data_at_select_full.loc[has_long_description].reset_index(drop=True)

data_at_select_full.to_csv('hard-database/data_at_select_full.csv')
data_at_select.to_csv('hard-database/data_at_select.csv')

# Adding detail with AI

In [5]:
from decouple import Config, RepositoryEnv
from markdownify import markdownify as md
import json

from ollama_interaction import generate_ollama_request,generate_ollama_embeddings

In [6]:
# Load the Ollama connection settings from the local `.env` file.
config = Config(RepositoryEnv('.env'))

# URL and bearer token passed to the Ollama helper calls further down.
ollama_api_url = config('OLLAMA_API_URL')
ollama_bearer_token = config('OLLAMA_BEARER_TOKEN')

In [7]:
def converte_to_md(html_text):
    """Convert an HTML fragment to Markdown.

    Args:
        html_text: HTML source, or a missing value (None, empty string, float NaN).

    Returns:
        The Markdown text produced by markdownify, or None when there is
        nothing to convert.
    """
    import math

    if html_text is None:
        return None
    # pandas marks missing cells with float NaN, which is truthy and would
    # otherwise be handed to markdownify as if it were HTML.
    if isinstance(html_text, float) and math.isnan(html_text):
        return None
    if not html_text:
        return None
    return md(html_text)

In [8]:
# Add a Markdown version of each HTML column as "<column>_md".
for html_column in ('description', 'eligibility'):
    data_at[f'{html_column}_md'] = data_at[html_column].apply(converte_to_md)

  soup = BeautifulSoup(html, 'html.parser')


In [9]:
def check_text_viability(text:str,min_char:int=5)->bool:
    """Tell whether `text` is long enough to be worth processing.

    Args:
        text: Candidate text; any value is accepted, including None or NaN.
        min_char: Length that `text` must strictly exceed.

    Returns:
        True when `text` is truthy and `len(text) > min_char`; False otherwise,
        including when `text` has no truth value or no length.
    """
    try:
        return bool(text) and len(text) > min_char
    except Exception:
        # Values without a length or truth value (NaN, numbers, ...) are not viable.
        # `Exception` rather than a bare `except` so Ctrl-C still interrupts the kernel.
        return False

In [10]:
def gen_context_len(text:str,prompt_system:str,ollama_api_url:str,ollama_bearer_token:str,model_options:dict,delta_token:int=0,request_options:"dict | None"=None)->"int | None":
    """Count the tokens of `text` by issuing a minimal Ollama generation request.

    Args:
        text: Text to measure.
        prompt_system: System prompt sent with the request.
        ollama_api_url: URL forwarded to `generate_ollama_request`.
        ollama_bearer_token: Bearer token forwarded to `generate_ollama_request`.
        model_options: Model options forwarded to `generate_ollama_request`.
        delta_token: Number of tokens subtracted from the measured context length.
        request_options: Extra request options. `num_ctx` defaults to 16384 and
            `num_predict` to 1 when absent. The dict passed in is not modified.

    Returns:
        `len(response['context']) - delta_token` for a viable text, 0 when
        `check_text_viability` rejects the text, None when the request fails.
    """
    # Work on a copy: the previous version mutated a shared `{}` default
    # (and any dict supplied by the caller).
    options = dict(request_options) if request_options else {}
    options.setdefault("num_ctx", 16384)
    options.setdefault("num_predict", 1)

    try:
        if not check_text_viability(text):
            return 0
        response = generate_ollama_request(
            prompt_system=prompt_system,
            response_format=None,
            prompt_user=f"{text}",
            ollama_api_url=ollama_api_url,
            bearer_token=ollama_bearer_token,
            model_options=model_options,
            request_options=options,
            seed=0,
        )
        return len(response['context']) - delta_token
    except Exception:
        # Best effort: a failed request yields None so a long batch keeps going.
        # `Exception` rather than a bare `except` so Ctrl-C still interrupts the kernel.
        return None

It is a bit wasteful to issue a generation request just to count tokens, but Ollama does not currently expose a tokenize endpoint, so we run a dummy 1-token generation and read the length of the returned context.

In [11]:
# Options forwarded to every token-counting request.
model_options = {
    "best_context":64000,
    "max_context":128000,
    "top_k": 20,
    "top_p": 0.9,
    "temperature": 0.8,
    "repeat_penalty": 1.2,
    "presence_penalty": 1.5,
    "frequency_penalty": 1.0,
}
model = "llama3.2:1b"

# Empty prompts: only the length of the text itself is measured.
prompt_system = ""
prompt_user = ""

# One result list per measured column, filled row by row.
token_numb_description_md_list = []
token_numb_description_list = []
token_numb_eligibility_md_list = []
token_numb_eligibility_list = []

# Columns are listed in the order they are measured for each row.
token_lists_by_column = {
    'description_md': token_numb_description_md_list,
    'description': token_numb_description_list,
    'eligibility_md': token_numb_eligibility_md_list,
    'eligibility': token_numb_eligibility_list,
}

for _, aid_row in data_at.iterrows():
    for column_name, token_list in token_lists_by_column.items():
        token_list.append(
            gen_context_len(aid_row[column_name],prompt_system,ollama_api_url,ollama_bearer_token,model_options)
        )
    

In [12]:
# Attach the per-row token counts to the frame, one column per measured text.
token_count_columns = {
    'token_numb_description_md': token_numb_description_md_list,
    'token_numb_description': token_numb_description_list,
    'token_numb_eligibility_md': token_numb_eligibility_md_list,
    'token_numb_eligibility': token_numb_eligibility_list,
}
for column_name, token_counts in token_count_columns.items():
    data_at[column_name] = token_counts

In [13]:
# Persist the frame, including the Markdown and token-count columns added above.
# NOTE(review): the file name says "select" but `data_at` is the unfiltered frame — confirm this is intended.
data_at.to_csv('hard-database/data_at_select_ai.csv')

# Embedding generation

In [14]:
# Request one embedding per Markdown description and store it in a new column.
# NOTE(review): `description_md` is None for aids without a description — confirm that
# generate_ollama_embeddings accepts a None prompt.
embedding_list = [
    generate_ollama_embeddings(prompt=description_md, ollama_api_url=ollama_api_url)['embedding']
    for description_md in data_at['description_md']
]
data_at['embedding_md'] = embedding_list

In [15]:
# Persist the frame, now including the `embedding_md` column.
# NOTE(review): presumably each embedding is a list of floats, which CSV stores as text — confirm how it is read back.
data_at.to_csv('hard-database/data_at_select_ai_emb.csv')

# Graph extraction

In [16]:
import plotly.express as px

# One scatter plot per text field: token count of the Markdown version (x)
# against token count of the original HTML (y), written as a standalone HTML file.
for field in ('eligibility', 'description'):
    figure = px.scatter(
        data_at,
        x=f'token_numb_{field}_md',
        y=f'token_numb_{field}',
        title=f'Number of token for sub {field} with markdown conversion vs html original',
    )
    figure.write_html(f'graph/token_count_{field}.html')