In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from torchvision.transforms import Compose, Resize, ToTensor, CenterCrop


import numpy as np

from typing import List
from typing import Tuple

import tempfile
from pathlib import Path


from IPython.display import display
from IPython.display import Markdown

from health_multimodal.common.visualization import plot_phrase_grounding_similarity_map
from health_multimodal.text import get_bert_inference
from health_multimodal.text.utils import BertEncoderType
from health_multimodal.image import get_image_inference
from health_multimodal.image.utils import ImageModelType
from health_multimodal.vlp import ImageTextInferenceEngine
from health_multimodal.image.data.io import load_image
from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference

import os
import glob
from PIL import Image
import copy


import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report


import gc


import re
import sys

from transformers import AutoTokenizer, AutoModelForCausalLM


Data Pre-Processing

In [None]:
#Report Processing with Mistral-7B Base Code

device = "cuda:0" if torch.cuda.is_available() else "cpu"

index = 13
indexStart = 5500 * (index - 1)
indexEnd = 5500 * (index*5)



tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", use_auth_token="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")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", use_auth_token="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")


paFiles = np.load('/home/csgrad/byalavar/medicalXAI/paFiles.npy', allow_pickle=True)
paFilesPart = paFiles[indexStart:indexEnd]
model=model.half()
model.to(device)

def extractConcepts(model,text):
    messages = [
    {"role":"user","content":"Given a radiology report extract fine grained atomic concepts from it and make seperate sentences for each concept. Each sentence should completely be described in a single concept. Report :  The cardiac, mediastinal and hilar contours are normal. Pulmonary vasculature is normal.  Lungs are clear. No pleural effusion or pneumothorax is present. Multiple clips are again seen projecting over the left breast.  Remote left-sided rib fractures are also re- demonstrated."},
    {"role": "assistant", "content": "Cardiac contours are normal. Mediastinal contours are normal.  Hilar contours are normal.  Pulmonary vasculature is normal.  Lungs are clear.  No pleural effusion is present.  No pneumothorax is present.  Multiple clips are seen over the left breast.  Remote left-sided rib fractures."},
    {"role": "user", "content": "Do the same for this report:" + text}]

    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

    model_inputs = encodeds.to(device)
 
    generated_ids = model.generate(model_inputs, max_new_tokens=300, do_sample=False,pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    final_response = decoded[0].split("assistant:")[-1].strip()
 

    sentences = final_response.split("[/INST]")

    return sentences[2]




def extract_findings(model, file_paths, output_dir):
    # Create the output directory if it doesn't exist
    #os.makedirs(output_dir, exist_ok=True)
    count = 0
    for file_path in file_paths:
        
        
        #print(file_path)    
        if file_path.endswith('.txt'):
            try:
                with open(file_path, 'r') as f:
                    content = f.read()
                    findings = re.search('FINDINGS:(.*?)(?:IMPRESSION:|$)', content, re.DOTALL)
                    if findings:
                        findings_text = findings.group(1).strip()
                        output_file = os.path.join(output_dir, f'{os.path.splitext(os.path.basename(file_path))[0]}_findings.txt')
                        
                        if os.path.exists(output_file):
                            count += 1
                            continue

                        output = extractConcepts(model,findings_text)
                        output_file = os.path.join(output_dir, f'{os.path.splitext(os.path.basename(file_path))[0]}_findings.txt')
                        
                        # Create the output file and write the output
                        with open(output_file, 'w') as out_f:
                            out_f.write(output)
                        count += 1
                        print(count,output_file)
            except FileNotFoundError:
                print(f"File {file_path} not found.")
        else:
            print(f"File {file_path} is not a text file.")
    print(f"Processed {count} files.")   
# Call the function with the input directory and output directory
extract_findings(model,paFilesPart, '/data/bharat/XAI/txt_files_concepts2')

In [2]:

splitPath = '/home/csgrad/byalavar/medicalXAI/mimic-cxr-2.0.0-split.csv'
labelsPath = '/home/easgrad/ajanrao/t2i/datasets/mimic/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert_original.csv'
conceptFilePath = '/data/bharat/XAI/txt_files_concepts2'
xRayFilePath = '/home/easgrad/ajanrao/t2i/datasets/mimic/physionet.org/files/mimic-cxr-jpg/2.0.0/files'

#Get the splits
df = pd.read_csv(splitPath)

paFiles = np.load('paFiles.npy', allow_pickle=True)       
paFilesTemp = np.load('paFilesTemp.npy', allow_pickle=True)       

paFiles = np.append(paFiles, paFilesTemp)
print(len(paFiles))
#df_train = df[df['split'] == 'train']

df_train = df

subject_ids_in_files = [os.path.basename(path).split('_')[0] for path in paFiles]
df_train['study_id'] = 's' + df_train['study_id'].astype(str) + '.txt'

df_filtered = df_train[df_train['study_id'].isin(subject_ids_in_files)]

df_filtered['study_id'] = df_filtered['study_id'].str.replace('.txt', '')
df_filtered['study_id'] = df_filtered['study_id'].str.replace('s', '')
totalSplit = list(df_filtered['study_id'])

print(len(totalSplit))
#Labels
dfLabels = pd.read_csv(labelsPath)

disease_columns = list(dfLabels.columns)[2:]

for column in disease_columns:
    dfLabels[column] = dfLabels[column].replace({-1: 0, 'missing': 0})
    
dfLabels = dfLabels.fillna(0)
dfLabels['study_id'] = dfLabels['study_id'].astype(str)
dfLabelsTotal = dfLabels[dfLabels['study_id'].isin(totalSplit)]
dfLabelsTotal = dfLabelsTotal[~dfLabelsTotal['subject_id'].astype(str).str.startswith('19')]

#dfLabelsTrain = dfLabelsTrain[dfLabelsTrain['Pleural Effusion'].astype(int) == 1]

111176


  df_filtered['study_id'] = df_filtered['study_id'].str.replace('.txt', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['study_id'] = df_filtered['study_id'].str.replace('.txt', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['study_id'] = df_filtered['study_id'].str.replace('s', '')


125131


In [3]:
def read_text(study_id, folder_path):
    file_path = os.path.join(folder_path, f's{study_id}_findings.txt')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            return file.read().strip()
    else:
        return None

dfLabelsTotal['findings'] = dfLabelsTotal['study_id'].apply(lambda x: read_text(x, conceptFilePath))

dfLabelsTotal = dfLabelsTotal[dfLabelsTotal['findings'].notnull()].reset_index(drop=True)


def get_jpg_files(row, root_dir):
   
    subject_id = row['subject_id']
    study_id = row['study_id']

  
    dir_path = os.path.join(root_dir, f'p{str(subject_id)[:2]}', f'p{subject_id}', f's{study_id}')

 
    jpg_files = glob.glob(os.path.join(dir_path, '*.jpg'))

 
    if jpg_files:
        return jpg_files[0]
    else:
        return None


dfLabelsTotal['path'] = dfLabelsTotal.apply(lambda row: get_jpg_files(row, xRayFilePath), axis=1)

In [None]:
dfLabelsTotal

In [5]:

df_zero = dfLabelsTotal[dfLabelsTotal['Pleural Effusion'] == 0]


df_sample = df_zero.sample(n=30000, random_state=1)


study_id_list = df_sample['study_id'].tolist()
study_id_list = ['s' + study_id +'_findings.txt'  for study_id in study_id_list]


print(study_id_list)


df_one = dfLabelsTotal[dfLabelsTotal['Pleural Effusion'] == 1]

df_sample = df_one.sample(n=12000, random_state=1)


study_id_list1 = df_sample['study_id'].tolist()
study_id_list1 = ['s' + study_id +'_findings.txt'  for study_id in study_id_list1]


print(study_id_list1)

study_id_list.extend(study_id_list1)
print(len(study_id_list))
import os
import shutil



src_dir = conceptFilePath
dst_dir = "/data/bharat/XAI/txt_files_concepts2PE"


os.makedirs(dst_dir, exist_ok=True)
count = 0

for filename in os.listdir(src_dir):

    if filename in study_id_list:
        count = count + 1
        shutil.copy(os.path.join(src_dir, filename), dst_dir)

print(count)
np.save('study_id_list.npy', study_id_list)

['s51470787_findings.txt', 's55555749_findings.txt', 's53315108_findings.txt', 's54187163_findings.txt', 's59383121_findings.txt', 's52543824_findings.txt', 's54313424_findings.txt', 's59462228_findings.txt', 's53998193_findings.txt', 's55826281_findings.txt', 's50147779_findings.txt', 's52918145_findings.txt', 's55144009_findings.txt', 's57985371_findings.txt', 's55302510_findings.txt', 's53543899_findings.txt', 's50223611_findings.txt', 's51187729_findings.txt', 's54089740_findings.txt', 's52245997_findings.txt', 's56862374_findings.txt', 's55808600_findings.txt', 's56965256_findings.txt', 's58660216_findings.txt', 's54566940_findings.txt', 's53093891_findings.txt', 's53961763_findings.txt', 's55260199_findings.txt', 's58645919_findings.txt', 's52859232_findings.txt', 's52999158_findings.txt', 's57438283_findings.txt', 's51169502_findings.txt', 's58342026_findings.txt', 's53738211_findings.txt', 's51499550_findings.txt', 's52260003_findings.txt', 's54002232_findings.txt', 's55733655_

In [6]:
study_id_list = np.load('study_id_list.npy', allow_pickle=True)

In [7]:
len(study_id_list)

42000

In [8]:
import re

def process_files(input_dir):
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r') as f:
                    content = f.read()

                # Remove numbers like 1), 2), etc.
                content = re.sub(r'\d+\)', '', content)
                content = re.sub(r'\d+\.', '', content)
                # Break text into multiple lines using "."
                content = content.replace('. ', '.\n')

                # Remove "</s>"
                content = content.replace('</s>', '')

                # Remove extra space at the start of the file
                content = content.lstrip()

                lines = content.split('\n')
                processed_lines = []
                for line in lines:
                    line = line.strip()
                    if line == '' or '*' in line or '_' in line or ']]' in line or 'a.m' in line or 'p.m' in line:
                        continue
                    if line.endswith('is') or line.endswith('is ') or re.search(r'\[.*\]', line) or ('(' in line and ')' in line and 'previous' in line) or 'concept' in line or 'Concept' in line:
                        continue
                    words = line.split()
                    if len(words) <= 2 or len(words) > 35:
                        continue
                    line = line.replace('sentence', '').replace('-', '').replace('concept', '').replace('Sentence', '')
                    processed_lines.append(line)
                content = '\n'.join(processed_lines)

                # Write the processed content back to the file
                with open(file_path, 'w') as f:
                    f.write(content)

# Call the function with the path to your folder
process_files('/data/bharat/XAI/txt_files_concepts2PE')

In [9]:
import os


dir_path = '/data/bharat/XAI/txt_files_concepts2PE'

# Get a list of all files in the directory
files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f))]

print(len(files))

42000


In [10]:
len(files)

42000

In [None]:
files

In [12]:
files = [f.replace('_findings.txt','').replace('s','') for f in files]

In [None]:


dfFinal= dfLabelsTotal[dfLabelsTotal['study_id'].isin(files)]
dfFinal = dfFinal.reset_index(drop=True)
print(dfFinal)

In [None]:
dfFinal

In [None]:
def read_text(study_id, folder_path):
    file_path = os.path.join(folder_path, f's{study_id}_findings.txt')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            return file.read().strip()
    else:
        return None

dfFinal['findings'] = dfFinal['study_id'].apply(lambda x: read_text(x, '/data/bharat/XAI/txt_files_concepts2PE'))
#dfFinal = dfFinal.dropna(subset=['findings'])

print(dfFinal)

In [17]:
dfFinal.to_csv('dfFinal.csv', index=False)

In [3]:
dfFinal = pd.read_csv('dfFinal.csv')
dfFinal = dfFinal.dropna(subset=['findings'])
dfFinal = dfFinal.reset_index(drop=True)

In [4]:
nan_count = dfFinal.isna().sum()

print(nan_count)

subject_id                    0
study_id                      0
Atelectasis                   0
Cardiomegaly                  0
Consolidation                 0
Edema                         0
Enlarged Cardiomediastinum    0
Fracture                      0
Lung Lesion                   0
Lung Opacity                  0
No Finding                    0
Pleural Effusion              0
Pleural Other                 0
Pneumonia                     0
Pneumothorax                  0
Support Devices               0
findings                      0
path                          0
dtype: int64


In [5]:
dfFinal

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,findings,path
0,10000032,53911762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Single frontal view of the chest is provided.\...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
1,10000032,56699142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Lungs are clear of focal consolidation.\nLungs...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
2,10000935,50578979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,Lung volumes are low.\nMultiple small pulmonar...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
3,10000935,58219844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lung volumes are low.\nThis results in crowdin...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
4,10000980,58206436,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Lung volumes have decreased since the last stu...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41978,18998395,59410361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Lungs are clear without consolidation.\nLungs ...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
41979,18998535,55876437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Central pulmonary vascular engorgement is pres...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
41980,18998535,56886868,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Right PICC tip is located in the midSuperior V...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...
41981,18998596,58312279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Cardiomediastinal contours are normal.\nHilar ...,/home/easgrad/ajanrao/t2i/datasets/mimic/physi...


Concept Stats

In [6]:

conceptList = dfFinal['findings'].tolist()

In [7]:
conceptLists = [i.split('\n') for i in conceptList]

In [8]:
totalCount = 0

for i in conceptLists:
    totalCount = totalCount + len(i)

print(totalCount)

331495


In [9]:
from collections import Counter

def find_top_100_recurring_strings(lists_of_lists):
    # Flatten the list of lists into a single list
    all_strings = [item for sublist in lists_of_lists for item in sublist]

    string_counts = Counter(all_strings)

    top_100_strings = string_counts.most_common(200)
    # Extract just the strings from the top 100
    top_100_strings_only = [string for string, count in top_100_strings]
    return top_100_strings_only


topRecLists = find_top_100_recurring_strings(conceptLists)

In [10]:
def count_lists_with_queries(query_strings, lists_of_lists):

    count = 0

    for ref_list in lists_of_lists:

        matches = sum(1 for query in query_strings if query in ref_list)

        if matches >= 2:
            count += 1
    return count


count_lists_with_queries(topRecLists,conceptLists)

19976

In [None]:
def find_indices_with_queries(query_strings, lists_of_lists):
  
    matching_indices = []

    for index, ref_list in enumerate(lists_of_lists):

        matches = sum(1 for query in query_strings if query in ref_list)

        if matches >= 2:
            matching_indices.append(index)
    return matching_indices


matching_indices = find_indices_with_queries(topRecLists, conceptLists)
print(matching_indices)

In [12]:
topRecLists

['No pneumothorax is present.',
 'No pleural effusion is present.',
 'Lung volumes are low.',
 'Heart size is normal.',
 'Lungs are clear.',
 'No pneumothorax is detected.',
 'No focal consolidation is present.',
 'Cardiomediastinal silhouette is normal.',
 'No pulmonary edema is present.',
 'Mediastinal contours are normal.',
 'Hilar contours are normal.',
 'No pleural effusions are present.',
 'No large pleural effusion is present.',
 'No pneumothorax is identified.',
 'Bony structures are intact.',
 'Cardiomediastinal silhouette is unchanged.',
 'Cardiomediastinal silhouette is stable.',
 'Cardiomediastinal contours are stable.',
 'Pulmonary vasculature is normal.',
 'Hilar contours are unremarkable.',
 'Mediastinal contours are unremarkable.',
 'Right lung is clear.',
 'Cardiac silhouette is enlarged.',
 'Bilateral pleural effusions are present.',
 'Bibasilar atelectasis is present.',
 'Cardiomediastinal contours are normal.',
 'Small bilateral pleural effusions are present.',
 'Lo

In [13]:
len(topRecLists)

200

In [14]:
unique_concepts = topRecLists

with open('unique_concepts_filtered.txt', 'w') as f:

    for line in unique_concepts:
        f.write(line + '\n')

In [15]:

def filter_dataframe_and_findings(df, topRecLists, matching_indices):

    filtered_df = df.iloc[matching_indices].copy(deep=True)
    

    for index, row in filtered_df.iterrows():
 
        findings_list = row['findings'].split("\n")

        
        filtered_findings = [word for word in findings_list if word in topRecLists]

        filtered_df.at[index, 'findings'] = '\n'.join(filtered_findings,)
        
    
    return filtered_df


filtered_df = filter_dataframe_and_findings(dfFinal, topRecLists, matching_indices)

In [None]:
filtered_df

In [17]:
count = filtered_df['Pleural Effusion'].value_counts()
print(count)

0.0    15878
1.0     4098
Name: Pleural Effusion, dtype: int64


In [18]:
filtered_df.reset_index(drop=True, inplace=True)

In [19]:
filtered_df_copy = filtered_df.copy(deep=True)

In [20]:
ref_replacement_dict={'No focal consolidation':'No focal consolidation is present.',
                      'No pleural effusion':'No pleural effusion is present.',
                      'Cardiomediastinal silhouette':'Cardiomediastinal silhouette is normal.',
                      'No pneumothorax':'No pneumothorax is present.',
                      'Hilar contours':'Hilar contours are normal.',
                      'Mediastinal':'Cardiomediastinal silhouette is normal.',
                      'No pneumonia':'No pneumonia is present.',}

In [None]:
# Assuming `df` is your DataFrame and `column_name` is the name of the column containing the concepts
# `ref_replacement_dict` is your reference and replacement dictionary

for index, row in filtered_df_copy.iterrows():
    # Split the string by '\n' to get individual concepts
    concepts = row['findings'].split('\n')
    # Replace each concept if it exists in the reference dictionary
    print(concepts)
    replaced_concepts = [next((ref_replacement_dict[key] for key in ref_replacement_dict if key.lower() in concept.lower()), concept) for concept in concepts]
    # Join the concepts back together with '\n' and update the DataFrame
    filtered_df_copy.at[index, 'findings'] = '\n'.join(replaced_concepts)

In [22]:
filtered_df_copy.iloc[0]['findings']

'No focal consolidation is present.\nNo pleural effusion is present.\nNo pneumothorax is present.\nCardiomediastinal silhouette is normal.'

In [23]:
conceptList = filtered_df_copy['findings'].tolist()
conceptLists = [i.split('\n') for i in conceptList]

topRecLists = set(item for sublist in conceptLists for item in sublist)

In [24]:
len(topRecLists)

147

In [24]:

def get_unique_concepts(input_dir):
    unique_concepts = {}

    count = 0
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r') as f:
                    lines = f.readlines()
                    count = count + len(lines)
                    for line in lines:
                        unique_concepts[line.strip()] = None
    print(count)


    unique_concepts = list(unique_concepts.keys())

    return unique_concepts


unique_concepts = get_unique_concepts('/data/bharat/XAI/txt_files_concepts2PE')



with open('unique_concepts.txt', 'w') as f:

    for line in unique_concepts:
        f.write(line + '\n')



331495


In [25]:
with open('unique_concepts.txt', 'r') as f:

    unique_concepts = f.readlines()
unique_concepts = [concept.strip() for concept in unique_concepts]


In [26]:
len(unique_concepts)

170941

In [27]:
unique_concepts = topRecLists

with open('unique_concepts_filtered.txt', 'w') as f:
    # Write each line to the file
    for line in unique_concepts:
        f.write(line + '\n')

unique_concepts = [concept.strip() for concept in unique_concepts]

In [28]:
#Load BERT and Image Models

text_inference = get_bert_inference(BertEncoderType.CXR_BERT)
image_inference = get_image_inference(ImageModelType.BIOVIL)

image_text_inference = ImageTextInferenceEngine(
    image_inference_engine=image_inference,
    text_inference_engine=text_inference,
)

tempConcepts = copy.deepcopy(unique_concepts)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
embeddings = torch.zeros((len(tempConcepts), 128))
embeddings = embeddings.to(device)
image_text_inference.to(device)

batch_size = 1024

#Get BERT Embeddings for the concepts
for i in range(0, len(tempConcepts), batch_size):
    batch_concepts = tempConcepts[i:i+batch_size]
    batch_embeddings = image_text_inference.text_inference_engine.get_embeddings_from_prompt(batch_concepts)
    embeddings[i:i+batch_size] = batch_embeddings


#embeddings = embeddings.half()
torch.save(embeddings, 'conceptEmbeddings.pt')
#parts = torch.chunk(embeddings, 16, dim=0)

Using downloaded and verified file: /tmp/biovil_image_resnet50_proj_size_128.pt


In [29]:
embeddings = torch.load('conceptEmbeddings.pt')

In [30]:
embeddings.half()
embeddings = embeddings.to(device)
embeddings = nn.functional.normalize(embeddings, dim=-1)

In [32]:
embeddings.shape

torch.Size([147, 128])

Similairity Removal

In [None]:
removeDict = {}

threshold = 0.90

for i in range(0,embeddings.shape[0]):
    temp= torch.cosine_similarity(embeddings[i].unsqueeze(0), embeddings, dim=-1)
    #print(temp.shape)
    #break

    mask = (temp > threshold)
    length = embeddings.shape[0]


    midpoint = i
    

    tensor = torch.cat([torch.ones(midpoint), torch.zeros(length - midpoint)]).to(device)

    mask = mask * tensor
    indices = torch.nonzero(mask,as_tuple=True)
    if(indices[0].shape[0]>0):
        removeDict.setdefault(i, []).append(indices[0])
    # for j in range(0,i+1):
    #     if(i!=j):
    #         if temp[j].item() > 0.995:
    #             removeDict.setdefault(i, []).append(j)
    if(i%100==0 and i!=0):
        print(i)
        
                




In [85]:
len(removeDict)

91931

In [86]:
len(unique_concepts)

92287

In [None]:
tempList1 = []
tempList2 = []

for i in removeDict.keys():
    for j in removeDict[i][0]:
        #if(j.item() not in removeDict.keys()):
         #print(unique_concepts[j.item()],unique_concepts[i],i,j.item())
         tempList1.append(unique_concepts[i])
         tempList2.append(unique_concepts[j.item()])
    print(i)


In [88]:
len(tempList1)

71205941

In [None]:
count = 0
for i,j in zip(tempList1,tempList2):
    print(i,j)
    count = count + 1
    if(count>1000):
        break

In [90]:
for i in range(0,len(tempList1)):
    tempList1[i] = tempList1[i].strip()
    tempList2[i] = tempList2[i].strip()

In [None]:
import os


line_dict = {line: replacement for line, replacement in zip(tempList2, tempList1)}

txt_files = [f for f in os.listdir('/data/bharat/XAI/txt_files_concepts2PE') if f.endswith('.txt')]

count0 = 0
count1 = 0

# Process each file
for filename in txt_files:
    print(count0)
    count0 += 1
    
    full_path = os.path.join('/data/bharat/XAI/txt_files_concepts2PE', filename)
    #print(full_path)
    
    with open(full_path, 'r') as file:
        lines = file.readlines()


    for i, line in enumerate(lines):
        stripped_line =  line.strip()
        
        if stripped_line in line_dict:
      
            count1 += 1
            lines[i] = line_dict[stripped_line] + '\n'
        else:
            pass

    with open(full_path, 'w') as file:
        file.writelines(lines)
    

In [92]:
count1

160725

Fine-Tuning

In [33]:
conceptBank = unique_concepts
embeddingsCB = embeddings

In [34]:
len(conceptBank)

147

In [35]:
conceptBank = [str(element) for element in conceptBank]
#embeddingsCB =embeddings

In [None]:
filtered_df_copy

In [45]:
# from imblearn.over_sampling import RandomOverSampler
# from sklearn.model_selection import train_test_split
# import pandas as pd


# X = filtered_df_copy.drop(['Pleural Effusion'], axis=1)  # Features
# y = filtered_df_copy['Pleural Effusion']  # Labels

# # Apply RandomOverSampler
# ros = RandomOverSampler(random_state=0)
# X_resampled, y_resampled = ros.fit_resample(X, y)

# # Combine resampled features and labels back into a DataFrame
# resampled_df = pd.concat([X_resampled, y_resampled], axis=1)

# # Now, use resampled_df to create your CustomDataset
# totalDataset = CustomDataset(resampled_df, '/home/easgrad/ajanrao/t2i/datasets/mimic/physionet.org/files/mimic-cxr-jpg/2.0.0/files', conceptBank)



Training

In [37]:
class CustomDataset(Dataset):
    def __init__(self, df, root_dir,conceptBank, transform=None):
        self.df = df
        self.root_dir = root_dir
        self.conceptBank = conceptBank
        self.transform =  create_chest_xray_transform_for_inference(resize=512, center_crop_size=512)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()


        jpg_files = self.df.iloc[idx]['path']
        #print(jpg_files)
       
        #print(jpg_files)
        if jpg_files:
            image = load_image(Path(jpg_files))
            image = self.transform(image)
            #image = jpg_files[0]
        else:
            image = None

        # Get the label
        #print(self.df.iloc[idx, 2:].values)
        
        labels = torch.tensor(self.df.iloc[idx, 11].astype(float))  # Replace 'label' with your actual label column name
        #print(labels)
        conceptLabels = self.df.loc[idx, 'findings']
        #print(self.df.loc[idx, 'subject_id'],self.df.loc[idx, 'study_id'],conceptLabels)
        #print(type(image))
        return image, labels,conceptLabels

def collate_fn(batch):
    images, labels = zip(*batch)
    return list(images), torch.stack(labels)

# Create DataLoader
totalDataset = CustomDataset(filtered_df_copy, '/home/easgrad/ajanrao/t2i/datasets/mimic/physionet.org/files/mimic-cxr-jpg/2.0.0/files',conceptBank)

train_size = int(0.85 * len(totalDataset))  # 80% for training
test_size = len(totalDataset) - train_size  # 20% for testing

torch.manual_seed(0)
# Split the dataset
trainDataset, testDataset = random_split(totalDataset, [train_size, test_size])

# Create DataLoaders for train and test sets
trainDataLoader = DataLoader(trainDataset, batch_size=64, shuffle=True, pin_memory=True, num_workers=4)
testDataLoader = DataLoader(testDataset, batch_size=1, shuffle=False, pin_memory=False, num_workers=1)


In [40]:
#Sanity Check 1
testDataLoader = DataLoader(testDataset, batch_size=64, shuffle=False, pin_memory=False, num_workers=1)
a,b,c = next(iter(testDataLoader))

#Sanity Check 2
row_with_x = filtered_df_copy[filtered_df_copy['study_id'] == 56401460]
print(row_with_x['findings'].iloc[0])

In [None]:
text_inference = get_bert_inference(BertEncoderType.CXR_BERT)
image_inference = get_image_inference(ImageModelType.BIOVIL)

image_text_inference = ImageTextInferenceEngine(
    image_inference_engine=image_inference,
    text_inference_engine=text_inference,
)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

image_text_inference.to(device)


#Freezing Weights of certain layer in the image encoder and BERT

def freezeWeghts(imageEncoder,layerNumber):
    count = 0
    for i,j in imageEncoder.named_parameters():
        if(count>=layerNumber):
            j.requires_grad = True
        else:
            j.requires_grad = False
        #print(i,j.requires_grad,count)
        count = count + 1
    return imageEncoder

imageEncoder = image_text_inference.image_inference_engine.model
bert = image_text_inference.text_inference_engine.model

imageEncoder = freezeWeghts(imageEncoder,170)
bert = freezeWeghts(bert,190)



class resConnect(nn.Module):

    def __init__(self):
        super(resConnect, self).__init__()
        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 128)
    
    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        return x


class fc(nn.Module):

    def __init__(self):
        super(fc, self).__init__()
        self.fc = nn.Linear(embeddingsCB.shape[0], 2)
        #self.fc = nn.Linear(128, 2)
    
    def forward(self, x):
        x = self.fc(x)
        return x


resNet = resConnect()
resNet.to(device)

fcNet = fc()
fcNet.to(device)

embeddingsCB = nn.functional.normalize(embeddingsCB, dim=-1)

#fcNet = fcNet.half()

# #print(fcNet)
# count =0
# for name,param in fcNet.named_parameters():
#    if(name == 'fc.weight'):
 
#     #print(param.shape,param.data) 
#     param.data = fcWeights
#     #print("----------------------------")
#     #print(param.shape,param.data)
#     count = count + 1


In [56]:
len(trainDataLoader)

266

In [None]:

trainable_params = [p.numel() for p in imageEncoder.parameters() if p.requires_grad]


print(sum(trainable_params))

In [None]:
count = 0
for i,j in imageEncoder.named_parameters():
 
 print(i,j.requires_grad,count)
 count = count + 1

count = 0
for i,j in bert.named_parameters():
 
 print(i,j.requires_grad,count)
 count = count + 1

In [52]:
imageTensors = None
labelTensors = None
conceptLabelTensors = None

In [None]:
#Input Tensor Extraction


with torch.no_grad():

 acc = 0  
 running_loss = 0.0
 running_corrects = 0
 clLoss = []
 conLoss = []
 for tdi,data in tqdm(enumerate(trainDataLoader)):
    print(tdi)
    #print("Check1")
    images,labels,conceptLabels = data
    images = images.half()
    
    #print(images.shape,labels.shape)
    #print(conceptLabels,conceptLabels.shape)
   # print("Check1")
    #print(conceptLabels.shape)
    #print(conceptLabels)
    conceptLabels = [[s.lstrip() for s in cl.split('\n')] for cl in conceptLabels]

    
    #break  
    #print("Check2")
    indices = []
    #print(conceptLabels[0],conceptLabels[1])
    for j in range(0,len(conceptLabels)):
        indices.append([conceptBank.index(str(s)) for s in conceptLabels[j] if str(s) in conceptBank])
   # print("Check3")
    
    #break
    #labelConcepts = torch.tensor(indices)
    labelConcepts = [torch.tensor(lst, dtype=torch.long) for lst in indices]
  
    #print(labelConcepts)
 
    #print(conceptLabelTensors)
    if(imageTensors is None):
        imageTensors = images
        labelTensors = labels
        conceptLabelTensors = labelConcepts
    else:
        imageTensors = torch.cat((imageTensors,images),0)
        labelTensors = torch.cat((labelTensors,labels),0)
        conceptLabelTensors.extend(labelConcepts)
    

if(True):
        torch.save(imageTensors, 'imageTensors.pt')
        torch.save(labelTensors, 'labelTensors.pt')
        torch.save(conceptLabelTensors, 'conceptLabelTensors.pt')
        imageTensors = None
        labelTensors = None
        conceptLabelTensors = None
        gc.collect()

       
    
    
    

    

In [39]:
imageTensors = torch.load('imageTensors.pt').to(device)
labelTensors = torch.load('labelTensors.pt').to(device)
conceptLabelTensors = torch.load('conceptLabelTensors.pt')


In [40]:
imageTensors.shape

torch.Size([16979, 3, 512, 512])

In [41]:
labelTensors = labelTensors.to(torch.long)

In [None]:
#Training



epochs = 15

alpha = 2

optimizer = torch.optim.AdamW([{"params":fcNet.parameters(), "lr":0.005,"weight_decay:":0.0001},
                              {"params":resNet.parameters(), "lr":0.005,"weight_decay":0.0001},
                              {"params":imageEncoder.parameters(), "lr":0.00001,"weight_decay":0.0001},
                              {"params":bert, "lr":0.000001,"weight_decay":0.00001}]) 
lossFunction = nn.CrossEntropyLoss()


#lossFunctionConcepts = nn.MSELoss(reduction='sum')
lossFunctionConcepts = nn.L1Loss(reduction='sum')

batchSize = 128


conceptLearning = False
classLearning = False

for e in range(0,epochs):

 acc = 0  
 running_loss = 0.0
 running_corrects = 0
 clLoss = []
 conLoss = []

 count = 0
 count1 = 0 

 for tdi in range(0,imageTensors.shape[0],batchSize):


    if(tdi<20000):
      
      images = imageTensors[count:count+batchSize].to(device)
      labels = labelTensors[count:count+batchSize].to(device)
      labelConcepts =  conceptLabelTensors[count:count+batchSize]
      count = count + images.shape[0]
    


    
    images = images.to(torch.float32)

    imageEmbeddings = imageEncoder(images).projected_global_embedding

    

    resOut = resNet(imageEmbeddings)

    imageEmbeddings = 0.85*imageEmbeddings + 0.15*resOut

    imageEmbeddings = nn.functional.normalize(imageEmbeddings, dim=-1)
    
  
    embeddingsCB = bert.get_embeddings_from_prompt(conceptBank)
 

    
    conceptScores = torch.mm(imageEmbeddings,embeddingsCB.t())

    output = fcNet(conceptScores)
    #output = fcNet(imageEmbeddings)

    

    output = output.to(torch.float32)
    labels = labels.to(torch.long)

    conceptScoresCopy = conceptScores.clone().detach().to(device)

    maxValue =  torch.max(conceptScoresCopy,dim=1)[0]

    
    for i, idx in enumerate(labelConcepts):

      conceptScoresCopy[i, idx] = torch.tensor(0.90)
     
    


    lossClassification = lossFunction(output, labels)

    lossConcepts = lossFunctionConcepts(conceptScores,conceptScoresCopy)

    if(conceptLearning):
     loss = alpha*lossConcepts
    elif(classLearning):
      loss = lossClassification
    else:
      loss = lossClassification + alpha*lossConcepts

    clLoss.append(lossClassification.item())
    conLoss.append(lossConcepts.item())

    optimizer.zero_grad()

    loss.backward()
   
    optimizer.step()
  
    

    running_loss += lossConcepts.item()


    #running_corrects += torch.sum(torch.argmax(output, dim=1) == labels).item()


    
    acc = acc + torch.sum(torch.argmax(output,dim=1) == labels).item()
   
 print("Accuracy: ", acc/imageTensors.shape[0],"Loss:", running_loss/imageTensors.shape[0])
    

    

In [None]:
#Tensor Extraction Test


imageTensorsTest = None
labelTensorsTest = None
conceptLabelTensorsTest = None


epochs = 1

alpha = 2

optimizer = torch.optim.Adam([{"params":fcNet.parameters(), "lr":0.001},
                              {"params":imageEncoder.parameters(), "lr":0.0005,"weight_decay":0.00001}])
lossFunction = nn.CrossEntropyLoss()


lossFunctionConcepts = nn.L1Loss()




with torch.no_grad():

 acc = 0  
 running_loss = 0.0
 running_corrects = 0
 clLoss = []
 conLoss = []
 for tdi,data in tqdm(enumerate(testDataLoader)):
    print(tdi)
    #print("Check1")
    images,labels,conceptLabels = data
    images = images.half()
    
    #print(images.shape,labels.shape)
    #print(conceptLabels,conceptLabels.shape)
   # print("Check1")
    #print(conceptLabels.shape)
    #print(conceptLabels)
    conceptLabels = [[s.lstrip() for s in cl.split('\n')] for cl in conceptLabels]

    
    #break  
    #print("Check2")
    indices = []
    #print(conceptLabels[0],conceptLabels[1])
    for j in range(0,len(conceptLabels)):
        indices.append([conceptBank.index(str(s)) for s in conceptLabels[j] if str(s) in conceptBank])
   # print("Check3")
    
    #break
    #labelConcepts = torch.tensor(indices)
    labelConcepts = [torch.tensor(lst, dtype=torch.long) for lst in indices]
  
    #print(labelConcepts)
 
    #print(conceptLabelTensors)
    if(imageTensorsTest is None):
        imageTensorsTest = images
        labelTensorsTest = labels
        conceptLabelTensorsTest = labelConcepts
    else:
        imageTensorsTest = torch.cat((imageTensorsTest,images),0)
        labelTensorsTest = torch.cat((labelTensorsTest,labels),0)
        conceptLabelTensorsTest.extend(labelConcepts)
    

if(True):
        torch.save(imageTensorsTest, 'imageTensorsTest.pt')
        torch.save(labelTensorsTest, 'labelTensorsTest.pt')
        torch.save(conceptLabelTensorsTest, 'conceptLabelTensorsTest.pt')
        imageTensorsTest = None
        labelTensorsTest = None
        conceptLabelTensorsTest = None
        gc.collect()

       
    
    
    

    

In [47]:
imageTensorsTest = torch.load('imageTensorsTest.pt').to(device)
labelTensorsTest = torch.load('labelTensorsTest.pt').to(device)
conceptLabelTensorsTest = torch.load('conceptLabelTensorsTest.pt')


In [None]:
#Test



epochs = 1

alpha = 1


batchSize = 1

predicted = []
labelsList = []

selectedConcepts = []
cLabels = []

conceptLearning = False
classLearning = False

pruneConcepts = False

for e in range(0,epochs):

 acc = 0  
 running_loss = 0.0
 running_corrects = 0
 clLoss = []
 conLoss = []

 count = 0
 count1 = 0 
 

 for tdi in range(0,imageTensorsTest.shape[0],batchSize):

    print(tdi)


    if(tdi<20000):
      
      images = imageTensorsTest[count:count+batchSize].to(device)
      labels = labelTensorsTest[count:count+batchSize].to(device)
      labelConcepts =  conceptLabelTensorsTest[count:count+batchSize]
      count = count + images.shape[0]
      print(labelConcepts)


    
    images = images.to(torch.float32)

    imageEmbeddings = imageEncoder(images).projected_global_embedding

    

    resOut = resNet(imageEmbeddings)

    imageEmbeddings = 0.9*imageEmbeddings + 0.1*resOut

    imageEmbeddings = nn.functional.normalize(imageEmbeddings, dim=-1)
    

    
    conceptScores = torch.mm(imageEmbeddings,embeddingsCB.t())

    output = fcNet(conceptScores)

    #output = fcNet(imageEmbeddings)

    

    output = output.to(torch.float32)
    labels = labels.to(torch.long)


    predicted.extend(torch.argmax(output,dim=1).tolist())
    labelsList.extend(labels.tolist())
    
    conceptScoresCopy = conceptScores.clone().detach().to(device)

    maxValue =  torch.max(conceptScoresCopy,dim=1)[0]

    
    for i, idx in enumerate(labelConcepts):

      conceptScoresCopy[i, idx] = torch.tensor(0.90)
     
    
    #print(conceptScores,conceptScoresCopy)
    #print(conceptScores.dtype,conceptScoresCopy.dtype)

    lossClassification = lossFunction(output, labels)

    lossConcepts = lossFunctionConcepts(conceptScores,conceptScoresCopy)
    #print(alpha * lossConcepts)
    #loss = lossClassification + alpha * lossConcepts
    if(conceptLearning):
     loss = alpha*lossConcepts
    elif(classLearning):
      loss = lossClassification
    else:
      loss = lossClassification + alpha*lossConcepts

    clLoss.append(lossClassification.item())
    conLoss.append(lossConcepts.item())

    # optimizer.zero_grad()

    # loss.backward()
   
    # optimizer.step()

    k = 30
    values, indices = torch.topk(conceptScores, k)



    if(tdi<20000):
    

     k = 30
     if(pruneConcepts):
        pruneK = 100
        
        values, indices = torch.topk(conceptScores, pruneK)
        #print(indices)
        
        prunedConcepts = [] 
        for i in range(pruneK):
          for j in range(i+1,pruneK):
            #print(torch.cosine_similarity(embeddingsCB[indices[0][i]],embeddingsCB[indices[0][j]],dim=-1),conceptBank[indices[0][i]],conceptBank[indices[0][j]])
            if(torch.cosine_similarity(embeddingsCB[indices[0][i]],embeddingsCB[indices[0][j]],dim=-1)>=0.94):
                prunedConcepts.append(indices[0][j].item())
        #print(prunedConcepts)
        countK = 0
        tempSConcepts = ''         
        for i in range(pruneK):
         if(indices[0][i].item() not in prunedConcepts):
            
            if(values[0][i].item()>0.5):
              countK = countK + 1
              print(conceptBank[indices[0][i].item()],values[0][i].item())
            
         if(countK>=k):
            break

                                  

      
    #print(indices)
     tempList3 = ''
     tempConcepts3 = []
     for i in range(k):
        if(values[0][i].item()>0.5):
         tempList3 = tempList3 + conceptBank[indices[0][i].item()] + '\n'
         
         #print(conceptBank[indices[0][i].item()],values[0][i].item())
     selectedConcepts.append(tempList3)
     #print("----------------")

     tempList4=''
     tempConcepts4 = []
     for i in labelConcepts[0]:
        tempList4 = tempList4 + str(conceptBank[i.item()]) + '\n'
        #print(conceptBank[i.item()])
     cLabels.append(tempList4)   
    
     #print("-------------------------------------------------------------------")
     #break
   
     
  
    

    running_loss += lossConcepts.item()


    #running_corrects += torch.sum(torch.argmax(output, dim=1) == labels).item()


    
    acc = acc + torch.sum(torch.argmax(output,dim=1) == labels).item()



   
 print("Accuracy: ", acc/imageTensorsTest.shape[0],"Loss:", running_loss/imageTensorsTest.shape[0])
    

    

In [None]:
selectedConcepts

In [None]:
cLabels

In [70]:
torch.save(fcNet, 'fcNet1.pt')
torch.save(imageEncoder, 'imageEncoder1.pt')

In [19]:
imageTensorsTest = torch.load('/data/bharat/XAI/preComp/testImageTensors.pt')
labelTensorsTest = torch.load('/data/bharat/XAI/preComp/testLabelTensors.pt')
conceptLabelTensorsTest = torch.load('/data/bharat/XAI/preComp/testConceptLabelTensors.pt')


In [20]:
imageTensorsTest.shape

torch.Size([2999, 3, 512, 512])

In [93]:
from rouge import Rouge 

# Assuming hypotheses and references are your sets of sentences


rouge = Rouge()

# Compute ROUGE scores for each pair of sentences
scores = [rouge.get_scores(hyp, ref,avg=False) for hyp, ref in zip(selectedConcepts, cLabels)]

print(scores)

[[{'rouge-1': {'r': 0.8421052631578947, 'p': 0.34782608695652173, 'f': 0.49230768817041426}, 'rouge-2': {'r': 0.65, 'p': 0.14444444444444443, 'f': 0.23636363338842975}, 'rouge-l': {'r': 0.8421052631578947, 'p': 0.34782608695652173, 'f': 0.49230768817041426}}], [{'rouge-1': {'r': 1.0, 'p': 0.23255813953488372, 'f': 0.37735848750445006}, 'rouge-2': {'r': 0.9090909090909091, 'p': 0.11363636363636363, 'f': 0.2020202000448934}, 'rouge-l': {'r': 1.0, 'p': 0.23255813953488372, 'f': 0.37735848750445006}}], [{'rouge-1': {'r': 0.3333333333333333, 'p': 0.09803921568627451, 'f': 0.1515151480027549}, 'rouge-2': {'r': 0.2, 'p': 0.03260869565217391, 'f': 0.0560747639444494}, 'rouge-l': {'r': 0.3333333333333333, 'p': 0.09803921568627451, 'f': 0.1515151480027549}}], [{'rouge-1': {'r': 0.6428571428571429, 'p': 0.2, 'f': 0.30508474214306236}, 'rouge-2': {'r': 0.46153846153846156, 'p': 0.06896551724137931, 'f': 0.11999999773800002}, 'rouge-l': {'r': 0.6428571428571429, 'p': 0.2, 'f': 0.30508474214306236}}

In [94]:
totals = {
    'rouge-1': {'r': 0, 'p': 0, 'f': 0},
    'rouge-2': {'r': 0, 'p': 0, 'f': 0},
    'rouge-l': {'r': 0, 'p': 0, 'f': 0}
}

# Count the number of items
num_items = len(rouge_scores)

# Sum up all scores
for item in rouge_scores:
    for rouge_key, scores in item[0].items():
        for score_key, score_value in scores.items():
            totals[rouge_key][score_key] += score_value

# Calculate averages
averages = {rouge_key: {score_key: score_value / num_items for score_key, score_value in scores.items()} for rouge_key, scores in totals.items()}

print(averages)

{'rouge-1': {'r': 0.7386320399292163, 'p': 0.20185426670577797, 'f': 0.30973748088374436}, 'rouge-2': {'r': 0.49789618921502454, 'p': 0.07605868649141383, 'f': 0.12928144271809633}, 'rouge-l': {'r': 0.7384946178963695, 'p': 0.20181303249592353, 'f': 0.3096744791572917}}


In [101]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score

# Ensure the necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the METEOR score accumulator
score = 0

# Assuming 'selectedConcepts' and 'cLabels' are defined and have the same length
for i in range(len(selectedConcepts)):
	# Tokenize both the reference and the hypothesis
	reference_tokens = word_tokenize(cLabels[i])
	hypothesis_tokens = word_tokenize(selectedConcepts[i])
	
	# Calculate and accumulate the METEOR score
	score += meteor_score([reference_tokens], hypothesis_tokens)

# Calculate the average METEOR score
average_score = score / len(selectedConcepts)
print(f"METEOR Score: {average_score}")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/csgrad/byalavar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/csgrad/byalavar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


METEOR Score: 0.2756530100674194
