In [37]:
import json
import re
import torch
from transformers import AutoModel, AutoTokenizer
from scipy.spatial.distance import cosine
import os

In [9]:
def get_doi(citations_data, citation_key):
    citation_info = citations_data.get(citation_key, {})
    return citation_info.get('doi', 'DOI not found')

def get_pmid(citations_data, citation_key):
    citation_info = citations_data.get(citation_key, {})
    return citation_info.get('pmid', 'PMID not found')
def check_text(text):
    doi_pattern = r'^10\.\d{4,9}/[-._;()/:A-Za-z0-9]+$'

    if re.match(doi_pattern, text, re.I):
        return "DOI"

    elif text.startswith("PMID "):
        return "PMID"

    else:
        return "None of the patterns matched"
def extract_pmid(text):
    match = re.search(r'PMID (\d+)', text)
    if match:
        return match.group(1)  # Return the captured group of digits
    return 'PMID not found'
    
def evaluate_data(validation_data, extracted_data,reference_data, model, tokenizer,extracted_file):
    results = {
        'Food': {'TP': 0, 'FP': 0, 'FN': 0},
        'Phenotype': {'TP': 0, 'FP': 0, 'FN': 0},
        'Target Population': {'TP': 0, 'FP': 0, 'FN': 0},
        'Citations': {'TP': 0, 'FP': 0, 'FN': 0}
    }
    if extracted_data:
        if extracted_data[0]['Food'].lower() == validation_data[0]['Food'].lower():
            results['Food']['TP'] += 1  # All entries have correct Food
        else:
            results['Food']['FN'] += 1 
           
            
            # print(validation_data[-1]['Food'])

        primary_threshold = 0.75
        for val_item in validation_data:
            best_similarity = 0
            best_match = None

            for ext_item in extracted_data:
                phenotype_similarity = calculate_similarity(val_item['Phenotype'], ext_item['Phenotype'], model, tokenizer)
                if phenotype_similarity > best_similarity:
                    best_similarity = phenotype_similarity
                    best_match = ext_item

            if best_similarity > primary_threshold:
                results['Phenotype']['TP'] += 1
                # Proceed to compare other fields only if the best phenotype match is strong enough
                target_pop_similarity = calculate_similarity(val_item.get('Target Population', ''), best_match.get('Target Population', ''), model, tokenizer)
                if target_pop_similarity > primary_threshold:
                    results['Target Population']['TP'] += 1
                else:
                    results['Target Population']['FP'] += 1

                val_citations = val_item.get('Citations', '').split(',')
                val_dois = val_item.get('DOI', '').split(',')
                ext_citations = best_match.get('Citations', '').split(';')
                ext_dois = [get_doi(reference_data, citation_key) for citation_key in ext_citations]
                ext_pmids = [get_pmid(reference_data, citation_key) for citation_key in ext_citations]

                for citation , doi in zip(val_citations, val_dois):
                    match_found_cite = False
                    if check_text(doi)== 'DOI':
                        for doi_ext in ext_dois:
                            if doi_ext:
                                if doi.lower() == doi_ext.lower():
                                    match_found_cite = True
                                    results['Citations']['TP'] += 1
                                    break
                        if not match_found_cite:
                            for ext_citation in ext_citations:
                                citation_similarity = calculate_similarity(citation, ext_citation, model, tokenizer)
                                if citation_similarity > 0.9:
                                    match_found_cite = True
                                    results['Citations']['TP'] += 1
                                    break
                                
                    elif check_text(doi)== 'PMID':
                        for pmid_ext in ext_pmids:
                            if extract_pmid(doi) == pmid_ext:
                                match_found_cite = True
                                results['Citations']['TP'] += 1
                                break
                        if not match_found_cite:
                            for ext_citation in ext_citations:
                                citation_similarity = calculate_similarity(citation, ext_citation, model, tokenizer)
                                if citation_similarity > 0.9:
                                    match_found_cite = True
                                    results['Citations']['TP'] += 1
                                    break
                    else:
                        for ext_citation in ext_citations:
                            citation_similarity = calculate_similarity(citation, ext_citation, model, tokenizer)
                            if citation_similarity > 0.9:
                                match_found_cite = True
                                results['Citations']['TP'] += 1
                                break
                    if not match_found_cite:
                        results['Citations']['FN'] += 1

            else:
                results['Phenotype']['FN'] += 1
    return results


def calculate_similarity(text1, text2, model, tokenizer):
    encoded_input1 = tokenizer(text1, return_tensors='pt', padding=True, truncation=True)
    encoded_input2 = tokenizer(text2, return_tensors='pt', padding=True, truncation=True)
    
    with torch.no_grad():
        model_output1 = model(**encoded_input1)
        model_output2 = model(**encoded_input2)
    
    embeddings1 = model_output1.last_hidden_state.mean(dim=1)
    embeddings2 = model_output2.last_hidden_state.mean(dim=1)
    
    # Calculate cosine similarity
    cosine_sim = 1 - cosine(embeddings1.squeeze().numpy(), embeddings2.squeeze().numpy())
    return cosine_sim

def calculate_precision_recall(true_positives, false_positives, false_negatives):
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    return precision, recall

def evaluate_directory(input_directory, model, tokenizer):
    aggregate_results = {
        'Food': {'TP': 0, 'FP': 0, 'FN': 0},
        'Phenotype': {'TP': 0, 'FP': 0, 'FN': 0},
        'Target Population': {'TP': 0, 'FP': 0, 'FN': 0},
        'Citations': {'TP': 0, 'FP': 0, 'FN': 0}
    }
    
    for root, dirs, files in os.walk(input_directory):
        validation_file = os.path.join(root, 'data.json')
        extracted_file = os.path.join(root, 'claims', 'combined_data.json')
        reference_file = os.path.join(root,'claims', 'citation_references.json')
        
        if os.path.exists(validation_file) and os.path.exists(extracted_file):
            count += 1
            validation_data = load_data(validation_file)
            extracted_data = load_data(extracted_file)
            reference_data = load_data(reference_file)
            results = evaluate_data(validation_data, extracted_data, reference_data, model, tokenizer,extracted_file)
            
            for category in aggregate_results:
                aggregate_results[category]['TP'] += results[category]['TP']
                aggregate_results[category]['FP'] += results[category]['FP']
                aggregate_results[category]['FN'] += results[category]['FN']

    # Calculate overall precision and recall for each category
    final_results = {category: calculate_precision_recall(aggregate_results[category]['TP'],
                                                          aggregate_results[category]['FP'],
                                                          aggregate_results[category]['FN'])
                     for category in aggregate_results}

    return final_results
def load_data(file_path):
    import json
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def get_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return model, tokenizer
def main():
    input_directory = 'RootDirectory'
    model_name = "allenai/scibert_scivocab_uncased"
    model, tokenizer = get_model_and_tokenizer(model_name)
    results = evaluate_directory(input_directory, model, tokenizer)
    for category, metrics in results.items():
        precision, recall = metrics
        print(f'{category} - Precision: {precision:.4f}, Recall: {recall:.4f}')

if __name__ == "__main__":
    main()


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


***********Phenotype mismatch in file************
/Users/AliTarik/Documents/LastAttempt/2010_1759/claims/combined_data.json
***********Food mismatch in file************
/Users/AliTarik/Documents/LastAttempt/2011_2062/claims/combined_data.json
***********Phenotype mismatch in file************
/Users/AliTarik/Documents/LastAttempt/2011_2205/claims/combined_data.json
***********Food mismatch in file************
/Users/AliTarik/Documents/LastAttempt/2010_1734/claims/combined_data.json
***********Food mismatch in file************
/Users/AliTarik/Documents/LastAttempt/2011_2211/claims/combined_data.json
***********No extracted data************
/Users/AliTarik/Documents/LastAttempt/2011_2226/claims/combined_data.json
***********No extracted data************
/Users/AliTarik/Documents/LastAttempt/2011_2040/claims/combined_data.json
***********No extracted data************
/Users/AliTarik/Documents/LastAttempt/2011_2071/claims/combined_data.json
***********Food mismatch in file************
/User