In [55]:
# %pip install spacy
# %pip install textstat
# %pip install -U sentence-transformers
# %pip install spacy nltk

# Import Libraries

In [56]:
import os
import glob
import csv
import numpy as np
import string
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.it import Italian
from spacy.lang.it.stop_words import STOP_WORDS
from nltk.stem import SnowballStemmer
from textstat import gulpease_index
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [57]:
nlp = Italian()
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [58]:
PROJECT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
RESULT_PATH = PROJECT_PATH + '/results/LLM Reports/SegResNet'

# Functions

## TTR

In [59]:
def compute_TypeTokenRatio(content):
    tokenized_content = tokenizer(content)
    tokenized_text = [i.text for i in tokenized_content]
    types = len(set(tokenized_text))
    tokens = len(tokenized_text)
    type_token_ratio = types / tokens
    return type_token_ratio

## MAAS Index

In [60]:
def compute_Maas(content):
    tokenized_content = tokenizer(content)
    tokenized_text = [i.text for i in tokenized_content]
    types = len(set(tokenized_text))
    tokens = len(tokenized_text)
    maas = (np.log(tokens) - np.log(types)) / np.log(tokens)**2
    return maas

## Gulpease (italian)

In [61]:
def compute_Gulpease(text):
    return gulpease_index(text)

## Coherence Score

In [62]:
def remove_special_characters(text):
    cleaned_text = text.replace('-', '').replace('\n', '').replace('*', '').replace('#','')
    return cleaned_text

In [63]:
def compute_CoherenceScore(text):
    text = remove_special_characters(text)
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    sentences = text.split('.')
    embeddings = model.encode(sentences)
    similarities = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0] for i in range(len(embeddings)-1)]
    avg_similarity = sum(similarities) / len(similarities)
    return avg_similarity

## Coverage Score Embedding

In [64]:
def compute_CoverageScoreEmbedding(text, reference_text):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    reference_text = reference_text.replace('.\n\n', '. ').replace('\n\t', '. ').replace('\n\n', '.').replace('\n', '.').split('. ')
    
    reference_embeddings = model.encode(reference_text)
    text_embeddings = model.encode([text])

    similarities = cosine_similarity(text_embeddings, reference_embeddings)
    coverage = sum(similarities[0]) / len(reference_text)
    return coverage

## Coverage Token Score


In [65]:
def remove_stopwords_punctuation(text):
    # Load the English model
    nlp  = Italian()
    # Process the text with spaCy
    doc = nlp(text)
    # Filter out stopwords and punctuation
    cleaned_tokens = [token.text for token in doc if not token.is_stop and token.text not in string.punctuation]

    # Join the tokens back into a single string
    cleaned_text = ' '.join(cleaned_tokens)
    cleaned_text = cleaned_text.replace('.', '')
    return cleaned_text

In [66]:
def stem_tokens(tokens):
    stemmer = SnowballStemmer('italian')
    return [stemmer.stem(token) for token in tokens]

In [67]:
def remove_numbers(text):
    return ''.join([i for i in text if not i.isdigit()])

In [68]:
def compute_CoverageScoreToken(text, reference_text):
    text = remove_special_characters(text)
    text = remove_stopwords_punctuation(text)
    text = remove_numbers(text)
    text = text.lower()
    reference_text = remove_special_characters(reference_text)
    reference_text = remove_stopwords_punctuation(reference_text)
    reference_text = remove_numbers(reference_text)
    reference_text = reference_text.lower()


    tokenized_text = tokenizer(text)
    tokenized_text = [i.text for i in tokenized_text]

    tokenized_reference_text = tokenizer(reference_text)
    tokenized_reference_text = [i.text for i in tokenized_reference_text]
    
    stemmed_text = stem_tokens(tokenized_text)
    stemmed_reference_text = stem_tokens(tokenized_reference_text)

    set_tokenized_text = set(stemmed_text)
    set_tokenized_reference_text = set(stemmed_reference_text)

    intersection = set_tokenized_text.intersection(set_tokenized_reference_text)
    union = set_tokenized_text.union(set_tokenized_reference_text)
    coverage = len(intersection) / len(union)
    
    
    return coverage

# Prompt Path

In [69]:
PATH_PROMPT1_IT = PROJECT_PATH + '/notebooks/Explainability/Prompt/PROMPT1_IT.txt'
PATH_PROMPT2_IT = PROJECT_PATH + '/notebooks/Explainability/Prompt/PROMPT2_IT.txt'

# Gemma

In [70]:
md_files = glob.glob(RESULT_PATH+'/gemma/'+'*.md')
md_files

['d:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/gemma\\gemma-7b-it_response_PROMPT1.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/gemma\\gemma-7b-it_response_PROMPT1_IT.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/gemma\\gemma-7b-it_response_PROMPT2.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/gemma\\gemma-7b-it_response_PROMPT2_IT.md']

### PROMPT 1 - Italian


In [71]:
path_prompt_1_it_response = md_files[1]
path_prompt_1_it_response

'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/gemma\\gemma-7b-it_response_PROMPT1_IT.md'

In [72]:
with open(path_prompt_1_it_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT1_IT, 'r') as file:
    prompt = file.read()

In [73]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'gemma-7b-it_response_PROMPT1_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.5034965034965035
Maas Index: 0.021449618355036476
Gulpease: 41.2
Coherence Score: 0.49918018182118734
Coverage Score: 0.31838043521230036
Coverage Score Token: 0.34615384615384615
Metrics saved to gemma-7b-it_response_PROMPT1_IT_metrics.csv


### PROMPT 2 - Italian

In [74]:
path_prompt_2_it_response = md_files[3]
path_prompt_2_it_response

'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/gemma\\gemma-7b-it_response_PROMPT2_IT.md'

In [75]:
with open(path_prompt_2_it_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT2_IT, 'r') as file:
    prompt = file.read()

In [76]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'gemma-7b-it_response_PROMPT2_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.5330578512396694
Maas Index: 0.02088144546773994
Gulpease: 34.6
Coherence Score: 0.419590728978316
Coverage Score: 0.4452822215648161
Coverage Score Token: 0.23577235772357724
Metrics saved to gemma-7b-it_response_PROMPT2_IT_metrics.csv


# LLAMA3 

In [77]:
md_files = glob.glob(RESULT_PATH+'/llama3/'+'*.md')
md_files

['d:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/llama3\\llama3-70b_response_PROMPT1.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/llama3\\llama3-70b_response_PROMPT1_IT.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/llama3\\llama3-70b_response_PROMPT2.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/llama3\\llama3-70b_response_PROMPT2_IT.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/

## LLAMA 3 8B

### PROMPT 1 - Italian

In [78]:
path_prompt_1_response = md_files[1]

with open(path_prompt_1_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT1_IT, 'r') as file:
    prompt = file.read()

In [79]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'llama3_8b_response_PROMPT1_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.3752577319587629
Maas Index: 0.02562885567550792
Gulpease: 43.8
Coherence Score: 0.37882575962473364
Coverage Score: 0.3445963068650319
Coverage Score Token: 0.3761467889908257
Metrics saved to llama3_8b_response_PROMPT1_IT_metrics.csv


### PROMPT 2 - ENGLISH

In [80]:
path_prompt_2_response = md_files[3]

with open(path_prompt_2_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT2_IT, 'r') as file:
    prompt = file.read()

In [81]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'llama3_8b_response_PROMPT2_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.3407534246575342
Maas Index: 0.026533067551857605
Gulpease: 33.7
Coherence Score: 0.4723692656579343
Coverage Score: 0.41511964197787976
Coverage Score Token: 0.3125
Metrics saved to llama3_8b_response_PROMPT2_IT_metrics.csv


## LLAMA 3 70B

### PROMPT 1 - ENGLISH

In [82]:
path_prompt_1_response = md_files[5]

with open(path_prompt_1_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT1_IT, 'r') as file:
    prompt = file.read()

In [83]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'llama3_70b_response_PROMPT1_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.41452991452991456
Maas Index: 0.0232943075161592
Gulpease: 37.4
Coherence Score: 0.6020091601780483
Coverage Score: 0.37371335508158576
Coverage Score Token: 0.32142857142857145
Metrics saved to llama3_70b_response_PROMPT1_IT_metrics.csv


### PROMPT 2 - ENGLISH

In [84]:
path_prompt_2_response = md_files[7]

with open(path_prompt_2_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT2_IT, 'r') as file:
    prompt = file.read()

In [85]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'llama3_70b_response_PROMPT2_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.35654008438818563
Maas Index: 0.02716795795670246
Gulpease: 38.1
Coherence Score: 0.6155580254761797
Coverage Score: 0.4413571297708485
Coverage Score Token: 0.4322033898305085
Metrics saved to llama3_70b_response_PROMPT2_IT_metrics.csv


# MIXTRAL

In [86]:
md_files = glob.glob(RESULT_PATH+'/mixtral/'+'*.md')
md_files

['d:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/mixtral\\mixtral-8x7b_response_PROMPT1.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/mixtral\\mixtral-8x7b_response_PROMPT1_IT.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/mixtral\\mixtral-8x7b_response_PROMPT2.md',
 'd:\\User\\Salvarki\\Desktop\\GitHub Tesi Project\\Visual-and-Textual-Explainability-in-Brain-Multiple-Sclerosis-Detection-with-3D-MRI/results/LLM Reports/SegResNet/mixtral\\mixtral-8x7b_response_PROMPT2_IT.md']

## PROMPT 1 - Italian

In [87]:
path_prompt_1_response = md_files[1]

with open(path_prompt_1_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT1_IT, 'r') as file:
    prompt = file.read()

In [88]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'mixtral_8x7b_response_PROMPT1_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.4609164420485175
Maas Index: 0.022128756834580137
Gulpease: 53.0
Coherence Score: 0.3187603836831374
Coverage Score: 0.367372308499538
Coverage Score Token: 0.5222222222222223
Metrics saved to mixtral_8x7b_response_PROMPT1_IT_metrics.csv


## PROMPT 2 - ENGLISH

In [89]:
path_prompt_2_response = md_files[3]

with open(path_prompt_2_response, 'r') as file:
    content = file.read()

with open(PATH_PROMPT2_IT, 'r') as file:
    prompt = file.read()

In [90]:
type_token_ratio = compute_TypeTokenRatio(content)
maas_index = compute_Maas(content)
gulpease = compute_Gulpease(content)
coherence_score = compute_CoherenceScore(content)
coverage_score = compute_CoverageScoreEmbedding(content, prompt)
coverage_score_token = compute_CoverageScoreToken(content, prompt)

metric_names = ['Type Token Ratio', 'Maas Index', 'Gulpease', 'Coherence Score', 'Coverage Score', 'Coverage Score Token']
metric_values = [type_token_ratio, maas_index, gulpease, coherence_score, coverage_score, coverage_score_token]

# Create a list of dictionaries for each metric
metrics_data = []
for name, value in zip(metric_names, metric_values):
    metrics_data.append({'Metric Name': name, 'Metric Value': round(value, 3)})
    
# Print metric name and value
for name, value in zip(metric_names, metric_values):
    print(f"{name}: {value}")

# Write metrics to a CSV file
csv_file = 'mixtral_8x7b_response_PROMPT2_IT_metrics.csv'
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Metric Name', 'Metric Value'])
    writer.writeheader()
    writer.writerows(metrics_data)

print(f"Metrics saved to {csv_file}")



Type Token Ratio: 0.49480249480249483
Maas Index: 0.01844708812796763
Gulpease: 51.1
Coherence Score: 0.34592713597748015
Coverage Score: 0.44034856744110584
Coverage Score Token: 0.30303030303030304
Metrics saved to mixtral_8x7b_response_PROMPT2_IT_metrics.csv
