In [None]:
import json
import re

In [None]:
def load_data(file_path):
    """Parse the JSON document stored at ``file_path``.

    The file is opened as UTF-8 text.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON content (whatever ``json`` produces for it).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def calculate_precision_recall(true_positives, false_positives, false_negatives):
    """Compute precision and recall from raw counts.

    Args:
        true_positives: Number of correct predictions.
        false_positives: Number of predictions with no matching truth.
        false_negatives: Number of truths with no matching prediction.

    Returns:
        A ``(precision, recall)`` tuple. A metric whose denominator is not
        positive is reported as ``0`` instead of dividing by zero.
    """
    predicted_total = true_positives + false_positives
    expected_total = true_positives + false_negatives

    if predicted_total > 0:
        precision = true_positives / predicted_total
    else:
        precision = 0

    if expected_total > 0:
        recall = true_positives / expected_total
    else:
        recall = 0

    return precision, recall

# Compiled once; matched case-insensitively against the start of a DOI string.
_DOI_PATTERN = re.compile(r"10\.\d{4,9}/[-._;()/:A-Z0-9]+", re.I)


def evaluate_data(validation_data, extracted_data):
    """Score extracted records against validation records.

    Args:
        validation_data: List of dicts with the keys 'Food', 'Phenotype',
            'Target_population', 'Citations' and 'DOI'.
        extracted_data: List of dicts with the same keys; missing keys are
            tolerated on this side.

    Returns:
        A dict mapping 'Food', 'Phenotype', 'Target Population' and
        'Citations' to a ``(precision, recall)`` tuple.

    Raises:
        KeyError: If a validation record lacks one of the required keys.
    """
    total_foods = 1
    food_matches = 0
    phenotype_tp, phenotype_fp, phenotype_fn = 0, 0, 0
    target_population_tp, target_population_fp, target_population_fn = 0, 0, 0
    citations_tp, citations_fp, citations_fn = 0, 0, 0

    # All validation entries are assumed to share the same Food item.
    # `all()` over an empty list is True, so an empty extraction must be
    # excluded explicitly or it would be scored as a perfect food match.
    if validation_data and extracted_data:
        validation_food = validation_data[0]['Food']
        if all(item.get('Food') == validation_food for item in extracted_data):
            food_matches = 1

    for val_item in validation_data:
        # Only the first extracted entry with the same phenotype is compared.
        ext_item = next(
            (
                candidate
                for candidate in extracted_data
                if candidate.get('Phenotype') == val_item['Phenotype']
            ),
            None,
        )
        if ext_item is None:
            phenotype_fn += 1
            continue

        phenotype_tp += 1
        # Integers are immutable, so the running totals have to be re-bound
        # from the return value; passing them in alone changes nothing.
        citations_tp, citations_fp, citations_fn = validate_citations(
            val_item, ext_item, citations_tp, citations_fp, citations_fn
        )
        if val_item['Target_population'] == ext_item.get('Target_population'):
            target_population_tp += 1
        else:
            # NOTE(review): target_population_fn is never incremented, so
            # recall here is 1.0 whenever at least one entry matches.
            # Confirm whether a mismatch should also count as a miss.
            target_population_fp += 1

    # Extracted phenotypes that do not occur in the validation data at all
    # are false positives.
    validation_phenotypes = [item['Phenotype'] for item in validation_data]
    phenotype_fp = sum(
        1
        for item in extracted_data
        if item.get('Phenotype') not in validation_phenotypes
    )

    phenotype_precision, phenotype_recall = calculate_precision_recall(phenotype_tp, phenotype_fp, phenotype_fn)
    target_population_precision, target_population_recall = calculate_precision_recall(target_population_tp, target_population_fp, target_population_fn)
    citations_precision, citations_recall = calculate_precision_recall(citations_tp, citations_fp, citations_fn)

    return {
        'Food': (food_matches / total_foods, food_matches / total_foods),  # Precision and recall are the same in this case
        'Phenotype': (phenotype_precision, phenotype_recall),
        'Target Population': (target_population_precision, target_population_recall),
        'Citations': (citations_precision, citations_recall)
    }


def _reference_pairs(doi_field, citation_field):
    """Split two ', '-separated fields into aligned (doi, citation) pairs."""
    dois = doi_field.split(', ')
    citations = citation_field.split(', ')
    # Pad the shorter list so that a reference lacking a DOI (or a citation
    # text) is still evaluated instead of being dropped by zip().
    width = max(len(dois), len(citations))
    dois += [''] * (width - len(dois))
    citations += [''] * (width - len(citations))
    # ''.split(', ') yields [''], so fully blank pairs carry no reference.
    return [(doi, citation) for doi, citation in zip(dois, citations) if doi or citation]


def _is_reference_matched(doi, citation, other_dois, other_citations):
    """Match by DOI when it is well-formed, otherwise by citation text."""
    if _DOI_PATTERN.match(doi):
        return doi in other_dois
    return citation in other_citations


def validate_citations(val_item, ext_item, tp, fp, fn):
    """Compare the references of one validation/extraction pair.

    A reference is identified by its DOI when the DOI is well-formed and by
    its citation text otherwise; the same rule is applied in both directions.

    Args:
        val_item: Validation record with ', '-separated 'Citations' and
            'DOI' strings.
        ext_item: Extracted record; 'Citations' and 'DOI' may be missing.
        tp: Running count of true positives.
        fp: Running count of false positives.
        fn: Running count of false negatives.

    Returns:
        The updated ``(tp, fp, fn)`` tuple. The arguments are plain integers
        and cannot be updated in place, so callers must use this value.

    Raises:
        KeyError: If ``val_item`` lacks 'Citations' or 'DOI'.
    """
    val_refs = _reference_pairs(val_item['DOI'], val_item['Citations'])
    ext_refs = _reference_pairs(
        ext_item.get('DOI') or '', ext_item.get('Citations') or ''
    )

    val_dois = {doi for doi, _ in val_refs if doi}
    val_citations = {citation for _, citation in val_refs if citation}
    ext_dois = {doi for doi, _ in ext_refs if doi}
    ext_citations = {citation for _, citation in ext_refs if citation}

    # Expected references: found in the extraction -> TP, otherwise FN.
    for doi, citation in val_refs:
        if _is_reference_matched(doi, citation, ext_dois, ext_citations):
            tp += 1
        else:
            fn += 1

    # Extracted references with no counterpart in the validation data -> FP.
    for doi, citation in ext_refs:
        if not _is_reference_matched(doi, citation, val_dois, val_citations):
            fp += 1

    return tp, fp, fn

def main(validation_file='path_to_validation_data.json',
         extracted_file='path_to_extracted_data.json'):
    """Load both data sets, evaluate them and print the metrics.

    Args:
        validation_file: Path of the JSON file holding the validation data.
        extracted_file: Path of the JSON file holding the extracted data.

    Prints one line per evaluated field with its precision and recall,
    formatted to four decimals.
    """
    validation_data = load_data(validation_file)
    extracted_data = load_data(extracted_file)
    results = evaluate_data(validation_data, extracted_data)
    for key, (precision, recall) in results.items():
        print(f'{key}: Precision = {precision:.4f}, Recall = {recall:.4f}')


# Guarded so that importing this module does not trigger file reads; the
# evaluation still runs when executed as a script or in a notebook cell.
if __name__ == '__main__':
    main()