# Downloading the Aides-territoires (AT) database

In [1]:
from typing import Optional

import numpy as np
import pandas as pd
import requests
from decouple import Config, RepositoryEnv

In [2]:
# Small client class that makes talking to the Aides-territoires (AT) API simpler.
class APIClient:
    """Authenticated HTTP client for the Aides-territoires API.

    On construction the client POSTs the X-AUTH-TOKEN to the ``connexion/``
    endpoint to obtain a bearer token, then builds a ``requests.Session``
    that sends this bearer token with every request.

    Args:
        x_auth_token: API token sent in the ``X-AUTH-TOKEN`` header of the
            login request.
        timeout: Timeout in seconds passed to every HTTP call made by this
            client. Defaults to ``DEFAULT_TIMEOUT``.

    Raises:
        requests.HTTPError: If the login request returns an error status.
    """

    # Applied to every HTTP call so that a stalled connection raises an
    # exception instead of blocking the notebook indefinitely.
    DEFAULT_TIMEOUT = 30

    def __init__(self, x_auth_token, timeout=DEFAULT_TIMEOUT):
        self.x_auth_token = x_auth_token
        self.timeout = timeout
        self.base_uri = self.get_api_base_url()
        self.bearer_token = self.get_bearer_token()
        self.session = self.create_session()

    def get_api_base_url(self):
        """Return the root URL of the API (with a trailing slash)."""
        return 'https://aides-territoires.beta.gouv.fr/api/'

    def get_bearer_token(self):
        """Exchange the X-AUTH-TOKEN for a bearer token and return it.

        Raises:
            requests.HTTPError: If the login endpoint returns an error status.
        """
        # Derived from base_uri so the login URL and the data URLs always
        # point at the same host.
        url = f'{self.base_uri}connexion/'
        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json',
            'X-AUTH-TOKEN': self.x_auth_token
        }
        response = requests.post(url, headers=headers, timeout=self.timeout)
        response.raise_for_status()
        return response.json()['token']

    def create_session(self):
        """Build a session whose default headers carry the bearer token."""
        session = requests.Session()
        session.headers.update({
            'Authorization': f'Bearer {self.bearer_token}',
            'Accept': 'application/json'
        })
        return session

    def simple_request_endpoint(self, endpoint, method='GET'):
        """Request ``endpoint`` relative to the API base URL.

        Args:
            endpoint: Path (and query string) appended to ``base_uri``.
            method: HTTP verb, ``'GET'`` by default.

        Returns:
            The ``requests.Response`` object.

        Raises:
            requests.HTTPError: If the response has an error status.
        """
        url = f'{self.base_uri}{endpoint}'
        return self.simple_request_url(url, method=method)

    def simple_request_url(self, url, method='GET'):
        """Request an absolute ``url`` (e.g. a pagination ``next`` link).

        Args:
            url: Full URL to request.
            method: HTTP verb, ``'GET'`` by default.

        Returns:
            The ``requests.Response`` object.

        Raises:
            requests.HTTPError: If the response has an error status.
        """
        response = self.session.request(method, url, timeout=self.timeout)
        response.raise_for_status()
        return response

In [None]:
# Read the API credential from the local .env file.
config = Config(RepositoryEnv('.env'))
at_x_auth_token = config('X_AUTH_TOKEN')

# Usage: you need an x_auth_token provided by Aides-territoires.
client = APIClient(at_x_auth_token)

# Number of consecutive failed page requests tolerated before giving up.
MAX_CONSECUTIVE_ERRORS = 10

response = client.simple_request_endpoint('aids/?page=1&itemsPerPage=30')
data = response.json()
# Copy so that extending the collection never mutates the first page payload.
data_collect = list(data['results'])
error_counter = 0
# Follow the 'next' links until the last page. `data` is only replaced once a
# page has been fetched and parsed completely, so a failed page is retried on
# the next iteration; the error counter bounds those retries.
while data['next'] and error_counter <= MAX_CONSECUTIVE_ERRORS:
    try:
        url = data['next']
        print(url)
        response = client.simple_request_url(url)
        page = response.json()
        data_collect.extend(page['results'])
        data = page
        error_counter = 0
    except Exception as error:
        print(error)
        error_counter += 1

# Make an incomplete download visible instead of silently keeping partial data.
if data['next']:
    print(
        f"Download stopped after {error_counter} consecutive errors: "
        f"{len(data_collect)} records collected, next page was {data['next']}"
    )


https://aides-territoires.beta.gouv.fr/api/aids/?page=2&itemsPerPage=30
https://aides-territoires.beta.gouv.fr/api/aids/?page=3&itemsPerPage=30


KeyboardInterrupt: 

In [16]:
# Aid table built from the downloaded records, indexed by the "id" field.
data_at = pd.DataFrame(data_collect).set_index("id")

# Full selection: every aid that has a description.
data_at_select_full = data_at.dropna(subset='description')

# Character threshold a description must exceed to enter the reduced selection.
sub_min_description = 2000

# Reduced selection: descriptions strictly longer than the threshold.
# The index is renumbered from 0, so the "id" index is not kept in this frame.
long_description_mask = data_at_select_full['description'].apply(len) > sub_min_description
data_at_select = data_at_select_full.loc[long_description_mask].reset_index(drop=True)

# Persist both selections.
data_at_select_full.to_csv('hard-database/data_at_select_full.csv')
data_at_select.to_csv('hard-database/data_at_select.csv')

# Adding detail with AI

In [19]:
from decouple import Config, RepositoryEnv
from markdownify import markdownify as md
import json

from ollama_interaction import embeding_ollama_request,generate_ollama_request

In [42]:
# Load the Ollama connection settings (API URL and bearer token) from .env.
config = Config(RepositoryEnv('.env'))

ollama_api_url, ollama_bearer_token = (
    config('OLLAMA_API_URL'),
    config('OLLAMA_BEARER_TOKEN'),
)

In [35]:
def converte_to_md(html_text):
    """Convert an HTML fragment to Markdown.

    Args:
        html_text: HTML source as a string. Missing values (``None``, NaN)
            and empty strings are accepted.

    Returns:
        The Markdown text produced by ``markdownify``, or ``None`` when
        ``html_text`` is empty or is not a string.
    """
    # NaN is truthy, so a plain truthiness test would hand it to md();
    # only non-empty strings are converted.
    if not isinstance(html_text, str) or not html_text:
        return None
    return md(html_text)

In [36]:
# Add a Markdown version of each HTML text column next to the original.
for md_column, html_column in (
    ('description_md', 'description'),
    ('eligibility_md', 'eligibility'),
):
    data_at[md_column] = data_at[html_column].apply(converte_to_md)

  soup = BeautifulSoup(html, 'html.parser')


In [89]:
def check_text_viability(text: str, min_char: int = 5) -> bool:
    """Tell whether ``text`` is usable, i.e. non-empty and long enough.

    Args:
        text: Value to check. Non-string values (``None``, NaN, numbers)
            are accepted and reported as not viable.
        min_char: Length that ``text`` must strictly exceed.

    Returns:
        ``True`` when ``text`` is truthy and ``len(text) > min_char``,
        ``False`` otherwise, including when the check itself fails
        (for example ``len()`` on a float).
    """
    try:
        return bool(text) and len(text) > min_char
    except Exception:
        # Best effort: an uncheckable value is simply not viable.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        return False

In [1]:
def gen_context_len(text:str,prompt_system:str,ollama_api_url:str,ollama_bearer_token:str,model_options:dict,delta_token:int=0,request_options:Optional[dict]=None)->Optional[int]:
    """Measure the context length of ``text`` via a dummy generation request.

    The text is sent as the user prompt of a generation request and the
    length of the ``context`` field of the response is used as the count.

    Args:
        text: Text to measure; sent as the user prompt.
        prompt_system: System prompt sent with the request.
        ollama_api_url: URL forwarded to ``generate_ollama_request``.
        ollama_bearer_token: Bearer token forwarded to ``generate_ollama_request``.
        model_options: Model options forwarded to ``generate_ollama_request``.
        delta_token: Value subtracted from the measured length
            (presumably the fixed prompt overhead; confirm with callers).
        request_options: Extra request options. ``num_ctx`` defaults to
            16384 and ``num_predict`` to 1 when absent. The dict passed in
            is never modified.

    Returns:
        ``len(response['context']) - delta_token`` for viable text, ``0``
        when the text is not viable, or ``None`` when the request fails.
    """
    # Work on a copy: neither the caller's dict nor a shared default may be
    # mutated by filling in the fallback values.
    options = dict(request_options) if request_options else {}
    options.setdefault("num_ctx", 16384)
    options.setdefault("num_predict", 1)

    prompt_user = f"{text}"
    try:
        if not check_text_viability(text):
            return 0
        response = generate_ollama_request(
            prompt_system=prompt_system,
            response_format=None,
            prompt_user=prompt_user,
            ollama_api_url=ollama_api_url,
            bearer_token=ollama_bearer_token,
            model_options=model_options,
            request_options=options,
            seed=0,
        )
        return len(response['context']) - delta_token
    except Exception:
        # Best effort: a failed request yields None so that a batch run can
        # continue. KeyboardInterrupt is not caught, so the run stays stoppable.
        return None

Making a generation request just to count tokens is a bit wasteful, but Ollama does not currently expose a tokenize endpoint, so we run a dummy generation of 1 token and read the length of the returned context.

In [76]:
# Options forwarded with every token-counting request.
model_options = {
    "best_context":64000,
    "max_context":128000,
    "top_k": 20,
    "top_p": 0.9,
    "temperature": 0.8,
    "repeat_penalty": 1.2,
    "presence_penalty": 1.5,
    "frequency_penalty": 1.0,
}
model = "llama3.2:1b"

prompt_system = "Tu es une IA"
prompt_user = ""

# One result list per measured column, filled in row order.
token_numb_description_md_list = []
token_numb_description_list = []
token_numb_eligibility_md_list = []
token_numb_eligibility_list = []

# Columns are measured in this order for every row; each count goes to the
# list paired with its column.
token_lists_by_column = {
    'description_md': token_numb_description_md_list,
    'description': token_numb_description_list,
    'eligibility_md': token_numb_eligibility_md_list,
    'eligibility': token_numb_eligibility_list,
}

for _, aid_row in data_at.iterrows():
    print('------')
    for column_name, token_list in token_lists_by_column.items():
        token_count = gen_context_len(
            aid_row[column_name],
            prompt_system,
            ollama_api_url,
            ollama_bearer_token,
            model_options,
        )
        token_list.append(token_count)
        print(token_count)
    

------
501
610
380
444
------
1320
1595
1047
1306
------
548
1203
479
882
------
658
728
0
0
------
1187
1342
0
0
------
410
427
400
479
------
467
562
148
167
------
278
325
612
711
------
322
358
421
498
------
676
1292
0
0
------
221
267
82
95
------
621
766
747
865
------
780
916
747
865
------
205
246
82
95
------
230
279
82
95
------
367
681
87
99
------
243
297
747
865
------
219
272
87
99
------
364
443
87
99
------
210
275
87
99
------
407
500
89
101
------
504
613
90
107
------
268
321
87
99
------
258
312
87
99
------
233
288
87
99
------
1575
2165
1760
2330
------
1269
2146
2077
2613
------
937
1062
182
232
------
322
388
165
190
------
1216
1326
313
340
------
323
355
0
0
------
533
665
1089
1326
------
473
558
1024
1195
------
371
419
1024
1195
------
706
819
179
230
------
489
561
1024
1195
------
452
512
1024
1195
------
481
574
1024
1195
------
450
522
1024
1195
------
430
510
1024
1195
------
674
766
179
231
------
1214
1441
180
232
------
1023
1210
182
234
------
94


In [77]:
# Attach the collected token counts to the aid table, one column per measured field.
token_count_columns = {
    'token_numb_description_md': token_numb_description_md_list,
    'token_numb_description': token_numb_description_list,
    'token_numb_eligibility_md': token_numb_eligibility_md_list,
    'token_numb_eligibility': token_numb_eligibility_list,
}
for column_name, token_counts in token_count_columns.items():
    data_at[column_name] = token_counts

In [97]:
import plotly.express as px

# One scatter plot per text field: Markdown token count (x) against the
# original HTML token count (y), each written to its own HTML file.
scatter_specs = (
    (
        'token_numb_eligibility_md',
        'token_numb_eligibility',
        'Number of token for sub eligibility with markdown conversion vs html original',
        'graph/token_count_eligibility.html',
    ),
    (
        'token_numb_description_md',
        'token_numb_description',
        'Number of token for sub description with markdown conversion vs html original',
        'graph/token_count_description.html',
    ),
)

for x_column, y_column, figure_title, output_path in scatter_specs:
    figure = px.scatter(data_at, x=x_column, y=y_column, title=figure_title)
    figure.write_html(output_path)

In [99]:
# Save the aid table, now including the Markdown and token-count columns.
ai_csv_path = 'hard-database/data_at_select_ai.csv'
data_at.to_csv(ai_csv_path)