In [88]:
# Defining search parameters (LLM used, prompt template).
# The formatting of the documents is also a tunable knob, I'd say. That's why it is not taken from the chain.py file but copied here again: we can experiment with the formatting to see whether a specific format actually helps our LLM. For now it lives in a separate cell.

import model_definitions
import prompt_templates
from langchain_community.llms.ollama import Ollama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate



# Name of the Ollama model to query.
MODEL = "mistral" # can be replaced by grid search later
# Prompt template text, taken from prompt_templates.v005.
prompt_template = prompt_templates.v005 #can be replaced by GS later
# Path of the JSON file that load_inputs() opens.
file_name = "test_structure.json"
# Output parser built around ListPlayerResponse; its format instructions are injected into the prompt.
parser = JsonOutputParser(pydantic_object=model_definitions.ListPlayerResponse)
# PromptTemplate declaring the three placeholders the template expects.
prompt_for_llm = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question", "format_instructions"],
)

# NOTE(review): format="json" presumably asks Ollama to emit JSON only — confirm against the Ollama wrapper docs.
model = Ollama(model=MODEL, format="json")

In [76]:
from langchain_core.documents import Document

def format_documents(docs: "Iterable[Union[Document, dict]]") -> str:
    """Group report texts by player and render them as one context string.

    Each document contributes its ``page_content`` to the player identified by
    ``metadata['player_transfermarkt_id']``. Players appear in the order their
    first document is seen; reports keep their input order.

    Args:
        docs: Documents to format. Each item may be a plain dict with
            ``page_content`` / ``metadata`` keys (the shape stored in the JSON
            inputs file) or an object exposing ``page_content`` and
            ``metadata`` attributes, such as a langchain ``Document``.

    Returns:
        One section per player of the form
        ``"Player ID: <id>\\nReport 1: ...\\nReport 2: ...\\n###"``, with
        sections separated by a blank line. An empty input yields ``""``.

    Raises:
        KeyError: If a document has no ``page_content`` (dict form) or its
            metadata lacks ``player_transfermarkt_id``.
    """
    # Plain dicts preserve insertion order, so no defaultdict import is needed
    # and the cell does not depend on imports made in other cells.
    reports_by_player = {}
    for doc in docs:
        if isinstance(doc, dict):
            # Raw JSON form: read the fields directly.
            page_content = doc["page_content"]
            metadata = doc.get("metadata", {})
        else:
            # Already a document-like object; unpacking it with ** would raise.
            page_content = doc.page_content
            metadata = doc.metadata
        player_id = metadata["player_transfermarkt_id"]
        reports_by_player.setdefault(player_id, []).append(page_content)

    formatted_reports = []
    for player_id, reports in reports_by_player.items():
        lines = [f"Player ID: {player_id}"]
        lines.extend(f"Report {i}: {report}" for i, report in enumerate(reports, 1))
        # "###" closes each player's section.
        lines.append("###")
        formatted_reports.append("\n".join(lines))

    # Sections are separated by one blank line.
    return "\n\n".join(formatted_reports)

In [49]:
# Define model

from dataclasses import dataclass
from langchain_core.documents import Document
from typing import List

@dataclass
class QueryAndRetrievedDocuments:
    """A single query together with the documents retrieved for it."""

    # The query text.
    query: str
    # The documents retrieved for that query.
    retrieved_documents: List[Document]
            

@dataclass
class DataModel:
    """Container holding a list of QueryAndRetrievedDocuments entries."""
    # All query / retrieved-documents pairs.
    data: List[QueryAndRetrievedDocuments]

In [78]:
from collections import defaultdict
import json

# Create the context
def load_inputs(path: "str | None" = None) -> list:
    """Read the evaluation inputs from a JSON file.

    Args:
        path: File to read. When omitted, the module-level ``file_name`` is
            used, which keeps existing no-argument calls working.

    Returns:
        The parsed JSON exactly as ``json.load`` produces it. No conversion to
        ``DataModel`` is performed; callers receive plain lists and dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    source = file_name if path is None else path
    # Explicit encoding so the result does not depend on the platform locale.
    with open(source, "r", encoding="utf-8") as file:
        return json.load(file)
    

In [79]:
# load_inputs() hands back the parsed JSON as-is (a list of dicts), not a DataModel instance.
inputs: list = load_inputs()
# Peek at the first entry only; guarded so an empty inputs file does not raise IndexError.
print(inputs[0] if inputs else "no inputs loaded")

{'query': 'my first query', 'retrieved_documents': [{'page_content': 'First page content!', 'metadata': {'id': 1, 'player_id': '12', 'player_transfermarkt_id': '123', 'scout_id': '0', 'grade_rating': 9.5, 'grade_potential': 10.0, 'main_position': 'centralmidfield', 'played_position': 'centralmidfield'}}, {'page_content': 'Second page content!', 'metadata': {'id': 1, 'player_id': '12', 'player_transfermarkt_id': '123', 'scout_id': '0', 'grade_rating': 9.5, 'grade_potential': 10.0, 'main_position': 'centralmidfield', 'played_position': 'centralmidfield'}}]}


In [52]:
# Debug dump of everything held in `inputs`.
print(inputs)

[{'query': 'my first query', 'retrieved_documents': [{'page_content': 'First page content!', 'metadata': {'id': 1, 'player_id': '12', 'player_transfermarkt_id': '123', 'scout_id': '0', 'grade_rating': 9.5, 'grade_potential': 10.0, 'main_position': 'centralmidfield', 'played_position': 'centralmidfield'}}]}]


In [107]:
from model_definitions import ListPlayerResponse
from evaluate import load

# For every input, format the retrieved documents into the context string passed to the LLM,
# then call the LLM.
# That gives us an input / LLM response pair.
# Compute all metrics on those pairs
# and print the result.
#

def get_reports_from_context(query_and_retrieved_doc: dict, player_id: str) -> str:
    """Collect the report texts that belong to one player.

    Args:
        query_and_retrieved_doc: Mapping with a ``'retrieved_documents'`` list
            whose items are dicts carrying ``'page_content'`` and
            ``'metadata'`` (the raw JSON shape, not a dataclass instance).
        player_id: Id to match against each document's
            ``metadata['player_transfermarkt_id']``.

    Returns:
        The matching reports concatenated as ``"Report:<text>\\n"`` pieces in
        document order, or ``""`` when no document matches.
    """
    # Compare as strings so an integer id in the metadata still matches the
    # stringified id the caller passes in.
    wanted_id = str(player_id)
    matching_reports = [
        "Report:" + doc['page_content'] + "\n"
        for doc in query_and_retrieved_doc['retrieved_documents']
        if str(doc['metadata']['player_transfermarkt_id']) == wanted_id
    ]
    return "".join(matching_reports)


# Load the metric and build the format instructions once; neither depends on the individual input.
bertscore = load("bertscore")
format_instructions = parser.get_format_instructions()

# One entry per (query, player) comparison so the scores can be inspected after the loop.
bertscore_metrics = []

for singleInput in inputs:
    actual_instance_of_input = QueryAndRetrievedDocuments(**singleInput)
    formatted_context_string = format_documents(actual_instance_of_input.retrieved_documents)

    prompt_injection = {"context": formatted_context_string, "question": actual_instance_of_input.query, "format_instructions": format_instructions}
    # Use a separate name so the PromptTemplate bound to prompt_for_llm is not overwritten by a plain string.
    prompt_text = prompt_template.format(**prompt_injection)
    print(prompt_text)

    model_answer = model.invoke(prompt_text)
    print(model_answer)

    model_json_answer = json.loads(model_answer)
    print(model_json_answer)

    player_response = ListPlayerResponse(**model_json_answer)

    # Metrics: compare each player's summary with the reports supplied for that player.
    for player in player_response.list:
        model_summary = player.report_summary
        # The raw dict is passed because get_reports_from_context subscripts its argument.
        context_reports = get_reports_from_context(singleInput, str(player.player_id))
        print("comparison now:")
        print("model_summary: \n\t",model_summary)
        print("initial reports: \n\t",context_reports)

        # bertscore
        predictions = [model_summary]
        references = [context_reports]
        # Another model such as "roberta-large" scores better but is larger (distilbert takes 268MB vs 1.4GB for roberta-large).
        score = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")
        bertscore_metrics.append({"query": actual_instance_of_input.query, "player_id": player.player_id, "bertscore": score})
        print("bert score: ", score)
        



You are an assistant in football (soccer) scouting.
    Use the following information to provide a concise answer to the question enclosed in <question> tags.
    Dont make up anything that you dont see from the context.
    
    <context>
    Player ID: 123
Report 1: First page content!
Report 2: Second page content!
###
    </context>

    <question>
    my first query
    </question>

   The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"PlayerResponse": {"properties": {"player_id": {"description": "ID of the player", "title": "Player Id", "type": "integer"}, "report_summary": {

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

bert score:  {'precision': [0.9427976608276367], 'recall': [0.8977413773536682], 'f1': [0.9197180271148682], 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.41.2)'}
