# Import

In [1]:
from decouple import Config, RepositoryEnv
from ollama_interaction import embeding_ollama_request,generate_ollama_request

import pandas as pd
import json
import time

## Import des données sur les modèles
Nous importons les modèles servant de base pour VApp. Par défaut, les modèles sont quantifiés en int4.

In [2]:
# Context-size options are based on the RULER benchmark (https://github.com/NVIDIA/RULER).
# When a model is not listed there, the values of the nearest listed model are used.
#
# "best context" is based on the effective length;
# "max context" is based on the claimed length.

# JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
with open('model-data.json', 'r', encoding='utf-8') as file:
    model_data = json.load(file)

# Iterating a dict yields its keys directly; no intermediate list is needed.
for item in model_data:
    print('model :',item)

model : mistral-nemo:latest
model : mistral-small:latest
model : qwen2.5:14b
model : qwen2.5:32b
model : llama3.2:1b
model : llama3.1:latest


In [3]:
# Settings are read from the local `.env` file through decouple.
config = Config(RepositoryEnv('.env'))

# Resolve the three settings in order: the Ollama endpoint, its bearer token,
# and the GPU label that is stored with the results and in the export file name.
ollama_api_url, ollama_bearer_token, gpu_model = (
    config(setting_name)
    for setting_name in ('OLLAMA_API_URL', 'OLLAMA_BEARER_TOKEN', 'OLLAMA_GPU')
)

# Scoring generation

## Import de la base de données de travail
Import de la base de données générée dans download-data-base-at.ipynb.

In [4]:
# Load the aides table indexed by `id` and keep only the rows whose description
# token count lies strictly between 500 and 5000.
data = (
    pd.read_csv("hard-database/data_at_select_ai.csv", index_col="id")
    .query("500 < token_numb_description < 5000")
)

In [5]:
# Load a sample of project descriptions.
# The descriptions contain accented characters, so the encoding is stated
# explicitly instead of relying on the platform default.
with open("project-description-sample.json", 'r', encoding='utf-8') as file:
    project_descrpition_list = json.load(file)

## Usage de la fonction gen_prompt_aide_scoring

### Exemple d'usage
Pour générer le scoring, un petit LLM comme llama3.2:1b semble suffisant. Des benchmarks devront être réalisés avec des modèles plus gros mais moins rapides.

In [6]:
# Function available under ./prompt_script/gen_prompt_aide_scoring.py
from prompt_script.gen_prompt_aide_scoring import gen_prompt_aide_scoring

In [7]:
# Scores every aide in `data` against every sample project description.
#
# For each (project, aide) pair the model is queried with seeds 1, 2, 3, ... until
# `seed_number` integer scores have been collected or `max_retry` consecutive
# attempts have failed. Valid scores are clamped to [score_min, score_max] and summed.
# One enriched copy of the aide row is appended to `row_list` per pair.

seed_number = 5   # number of valid scores to collect per (project, aide) pair
max_retry = 3     # consecutive failed attempts tolerated before giving up on a pair
row_list = []

# Score bounds: sent to the prompt builder AND used to clamp the model's answers,
# so both always agree.
score_max = 5
score_min = -5

model = "llama3.1:latest"
model_options = model_data[model]

score_sub_request_options = {
    "num_ctx": 16384,
    "num_predict": 2
}

# Answers that could not be parsed as an integer score.
error_list = []

request_options = score_sub_request_options

for project_descrpition_key, project_description in project_descrpition_list.items():
    print('-------------------------')
    print('starting : ',project_description[:80])
    starting_project_time = time.time()
    for _, row in data.iterrows():
        aide_description = row['description_md']
        prompt_system, prompt_user = gen_prompt_aide_scoring(
            aide_description,
            project_description,
            max_score=score_max,
            min_score=score_min,
        )

        score_sub = 0
        seed = 0
        scoring_made = 0
        retry = 0

        start_requesting_score = time.time()
        while scoring_made < seed_number and retry < max_retry:
            seed += 1
            response = generate_ollama_request(
                prompt_system=prompt_system,
                response_format=None,
                prompt_user=prompt_user,
                ollama_api_url=ollama_api_url,
                bearer_token=ollama_bearer_token,
                model_options=model_options,
                request_options=request_options,
                seed=seed,
            )
            if not response:
                # An empty response counts as a failed attempt; otherwise neither
                # counter advances and the loop would never terminate.
                retry += 1
                continue

            response_filtred = None
            try:
                response_filtred = response['response'].replace(' ','').replace('\n','')
                score_seed = int(response_filtred)
            except (KeyError, TypeError, AttributeError, ValueError):
                retry += 1
                # Record the cleaned text when it exists, else the raw response
                # (never a stale value left over from a previous attempt).
                error_list.append(response if response_filtred is None else response_filtred)
                continue

            # Clamp out-of-range answers to the bounds given in the prompt.
            score_sub += max(score_min, min(score_max, score_seed))
            retry = 0  # only *consecutive* failures count against max_retry
            scoring_made += 1
        end_requesting_score = time.time()
        request_time_total = end_requesting_score - start_requesting_score

        # Work on a copy so the Series yielded by iterrows() is not mutated.
        scored_row = row.copy()
        scored_row['project_description'] = project_description
        scored_row['project_score'] = score_sub
        scored_row['scoring_made'] = scoring_made
        scored_row['scoring_error'] = seed - scoring_made
        scored_row['request_time_total'] = request_time_total
        # `seed` equals the number of requests sent; guard the case where none was sent.
        scored_row['request_time_single'] = request_time_total / seed if seed else 0.0
        scored_row['gpu'] = gpu_model
        row_list.append(scored_row)
    end_project_time = time.time()
    print(f"project needed {(end_project_time - starting_project_time)/60}min to generate score for {len(data)} sub with {seed_number} seed")
    # Averages are undefined when there is nothing to score.
    if len(data) and seed_number:
        print(f"averaging {(end_project_time - starting_project_time)/len(data) :.3}s per sub")
        print(f"averaging {(end_project_time - starting_project_time)/(len(data)*seed_number) :.3}s per sub indexed on seed_number")


-------------------------
starting :  Revitalisation d'une zone humide
project needed 19.716583061218262min to generate score for 1432 sub with 5 seed
averaging 0.826s per sub
averaging 0.165s per sub indexed on seed_number
-------------------------
starting :  Entretient d'un vieux moulin
project needed 19.54666873613993min to generate score for 1432 sub with 5 seed
averaging 0.819s per sub
averaging 0.164s per sub indexed on seed_number
-------------------------
starting :  Réhabilitation d'une ancienne école en lieu dédié à la santé.
project needed 19.704740301767984min to generate score for 1432 sub with 5 seed
averaging 0.826s per sub
averaging 0.165s per sub indexed on seed_number
-------------------------
starting :  Voir fiche action PVD /ORT n°21 Après une première phase d’aménagement, la commu
project needed 19.741469343503315min to generate score for 1432 sub with 5 seed
averaging 0.827s per sub
averaging 0.165s per sub indexed on seed_number
-------------------------
starti

In [8]:
def normalize_score(project_score:int,scoring_made:int,score_max:int=5,score_min:int=-5)->"tuple[float, float]":
    """Average a summed score and rescale it with the given score bounds.

    Args:
        project_score: Sum of the individual scores. The notebook also passes a
            pandas Series here; the arithmetic is element-wise in that case.
        scoring_made: Number of individual scores that were summed (scalar or Series).
        score_max: Upper bound of a single score.
        score_min: Lower bound of a single score.

    Returns:
        A pair ``(corrected_normalize_score, corrected_project_score)`` where
        ``corrected_project_score`` is ``project_score / scoring_made`` and
        ``corrected_normalize_score`` maps that mean linearly so that
        ``score_min`` gives 0 and ``score_max`` gives 1.
        When a scalar ``scoring_made`` is 0 the mean is undefined and both
        values are NaN.
    """
    # Plain Python numbers raise ZeroDivisionError on division by zero;
    # return NaN for the undefined mean instead of crashing.
    if isinstance(scoring_made, (int, float)) and scoring_made == 0:
        undefined = float('nan')
        return undefined, undefined

    corrected_project_score = project_score/scoring_made

    corrected_normalize_score = (corrected_project_score-score_min)/(score_max-score_min)

    return corrected_normalize_score, corrected_project_score

In [9]:
# Assemble one DataFrame from the per-pair rows collected during scoring.
data_project_score = pd.DataFrame(row_list)

# Mean score per pair and its rescaled counterpart, computed column-wise.
corrected_normalize_score, corrected_project_score = normalize_score(
    project_score=data_project_score['project_score'],
    scoring_made=data_project_score['scoring_made'],
)

# Attach both derived columns, normalized score first.
data_project_score = data_project_score.assign(
    corrected_normalize_score=corrected_normalize_score,
    corrected_project_score=corrected_project_score,
)

data_project_score.head()

Unnamed: 0,slug,url,name,name_initial,short_title,financers,financers_full,instructors,instructors_full,programs,...,token_numb_eligibility,project_description,project_score,scoring_made,scoring_error,request_time_total,request_time_single,gpu,corrected_normalize_score,corrected_project_score
162955,appel-a-projets-pedagogiques-culture-cheval-an...,/aides/appel-a-projets-pedagogiques-culture-ch...,Développer des projets pédagogiques en lien av...,Appel à projets pédagogiques Culture Cheval – ...,,['Conseil départemental de la Manche'],"[{'id': 164, 'name': 'Conseil départemental de...",[],[],[],...,444,Revitalisation d'une zone humide,3,5,0,3.044942,0.608988,H100,0.56,0.6
162956,appel-a-manifestation-dinteret-pour-loccupatio...,/aides/appel-a-manifestation-dinteret-pour-loc...,Candidater à l'appel à manifestation d’intérêt...,Appel à manifestation d’intérêt pour l’occupat...,,"[""Conseil régional de Provence-Alpes-Côte d'Az...","[{'id': 93, 'name': ""Conseil régional de Prove...",[],[],[],...,1306,Revitalisation d'une zone humide,2,5,0,0.837005,0.167401,H100,0.54,0.4
162957,passeurs-dimages-en-bourgogne-franche-comte,/aides/passeurs-dimages-en-bourgogne-franche-c...,Mener des projets d'éducation à l'image sur le...,Passeurs d'images,,['Ministère de la Culture'],"[{'id': 96, 'name': 'Ministère de la Culture',...",[],[],[],...,882,Revitalisation d'une zone humide,16,5,0,0.790148,0.15803,H100,0.82,3.2
162965,ecoconception-textile-dhabillement-texhabi,/aides/ecoconception-textile-dhabillement-texh...,Soutenir les projets d’écoconception textile e...,Écoconception textile d'habillement - TEXHABI,,['ADEME'],"[{'id': 22, 'name': 'ADEME', 'logo': 'https://...",[],[],[],...,0,Revitalisation d'une zone humide,-3,5,0,0.789822,0.157964,H100,0.44,-0.6
162966,aqacia-2024-gerer-les-pollutions-a-lozone-et-s...,/aides/aqacia-2024-gerer-les-pollutions-a-lozo...,Gérer les pollutions à l’ozone et sectorielles,AQACIA 2024 - Gérer les pollutions à l’ozone e...,,['ADEME'],"[{'id': 22, 'name': 'ADEME', 'logo': 'https://...",[],[],[],...,0,Revitalisation d'une zone humide,0,5,0,0.843217,0.168643,H100,0.5,0.0


In [10]:
# Write the scored table twice: once under a GPU-specific name and once under
# the generic name.
for csv_path in (
    f"hard-database/data_project_scoring_gpu_{gpu_model}.csv",
    "hard-database/data_project_scoring.csv",
):
    data_project_score.to_csv(csv_path)