In [1]:
import datetime as dt
import pandas as pd
import numpy as np
import pickle
from langchain_ollama.chat_models import ChatOllama

grand_t0 = dt.datetime.now()

# Create Dataset

In [2]:
# Dataset: Unclassified distortions (halilbabacan)
# Paper: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4582307
# Data: https://huggingface.co/datasets/halilbabacan/autotrain-data-cognitive_distortions
# https://huggingface.co/datasets/halilbabacan/autotrain-data-cognitive_distortions/tree/main/raw
# https://huggingface.co/datasets/halilbabacan/autotrain-data-cognitive_distortions/blob/main/raw/Cognitive_distortions.csv
    
binary_dataset_file_path = "../../data/corpora/English/distortions/halilbabacan/raw_Cognitive_distortions.csv" 

In [3]:
df1 = pd.read_csv(binary_dataset_file_path)
df1 = df1.rename(columns={'Text': 'Patient Question', 'Label': 'Dominant Distortion'})
df1.insert(1, "Distorted part", value = np.nan)
df1.insert(3, "Secondary Distortion (Optional)", value = np.nan)
df1

Unnamed: 0,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional)
0,I'm such a failure I never do anything right.,,Distortion,
1,Nobody likes me because I'm not interesting.,,Distortion,
2,I can't try new things because I'll just mess...,,Distortion,
3,My boss didn't say 'good morning' she must be...,,Distortion,
4,My friend didn't invite me to the party I mus...,,Distortion,
...,...,...,...,...
3522,Since then whenever my mother is out alone I b...,,Distortion,
3523,My family hate him but they didn’t met him at ...,,Distortion,
3524,However I am not happy at the least only half ...,,Distortion,
3525,Now I am at university my peers around me all ...,,Distortion,


In [4]:
df3 = df1
df3

Unnamed: 0,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional)
0,I'm such a failure I never do anything right.,,Distortion,
1,Nobody likes me because I'm not interesting.,,Distortion,
2,I can't try new things because I'll just mess...,,Distortion,
3,My boss didn't say 'good morning' she must be...,,Distortion,
4,My friend didn't invite me to the party I mus...,,Distortion,
...,...,...,...,...
3522,Since then whenever my mother is out alone I b...,,Distortion,
3523,My family hate him but they didn’t met him at ...,,Distortion,
3524,However I am not happy at the least only half ...,,Distortion,
3525,Now I am at university my peers around me all ...,,Distortion,


## Create 3 split datasets for evaluation

### Evaluate new split models individually for each of 3 parts against the models learned on the other 2 parts 
- parts
  - part1 = df[df.index % 3 == 1] # dfs[1] - test on model from third_split
  - part2 = df[df.index % 3 == 2] # dfs[2] - test on model from second_split
  - part3 = df[df.index % 3 == 0] # dfs[0] - test on model from first_split
- train-test splits
  - (pd.concat([part1, part2]), part3),  # first_split:  (1 + 2) -> train, (3) -> test
  - (pd.concat([part1, part3]), part2),  # second_split: (1 + 3) -> train, (2) -> test
  - (pd.concat([part2, part3]), part1)   # third_split:  (2 + 3) -> train, (1) -> test

In [5]:
df3s = [ df3.iloc[:0,:].copy(), df3.iloc[:0,:].copy(), df3.iloc[:0,:].copy()]

row_n = 0 
for _, row in df3.iterrows():
    r3 = row_n % 3
    row_n += 1
    df = df3s[r3]
    df.loc[len(df)] = row

print(len(df3))
for df in df3s:
    print(len(df))

3527
1176
1176
1175


# Create Evaluation Functions

In [6]:
def f1_from_counts(true_positive, true_negative, false_positive, false_negative):
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    return 2 * precision * recall / (precision + recall) if precision > 0 or recall > 0 else 0 

def evaluate_df_counts(df,evaluator,threshold,debug=False):
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    for _, row in df.iterrows():
        # Text definition: first, check the 2nd column; if NaN, take the text from the 1st column.
        text = row.iloc[1] if pd.notna(row.iloc[1]) else row.iloc[0]
        primary_distortion = row.iloc[2]  # The main cognitive distortion from the 3rd column
        secondary_distortion = row.iloc[3] if pd.notna(row.iloc[3]) else None  # The secondary distortion from the 4th column, if it exists
        ground_distortion = False if primary_distortion == 'No Distortion' else True
                       
        our_distortion = evaluator(text,threshold)
        
        # https://en.wikipedia.org/wiki/F-score
        if ground_distortion == True and our_distortion == True:
            true_positive += 1
        if ground_distortion == False and our_distortion == True:
            false_positive += 1
        if ground_distortion == False and our_distortion == False:
            true_negative += 1
        if ground_distortion == True and our_distortion == False:
            false_negative += 1

        if debug:
            print(ground_distortion,our_distortion,text)

    return true_positive, true_negative, false_positive, false_negative


def evaluate_df(df,evaluator,threshold,debug=False):
    true_positive, true_negative, false_positive, false_negative = evaluate_df_counts(df,evaluator,threshold,debug)
    return f1_from_counts(true_positive, true_negative, false_positive, false_negative) 


def evaluate_df_acc_f1(df,evaluator,threshold,debug=False):
    true_positive, true_negative, false_positive, false_negative = evaluate_df_counts(df,evaluator,threshold,debug)
    return (true_positive + true_negative) / len(df), f1_from_counts(true_positive, true_negative, false_positive, false_negative) 


# Evaluate Different Models

In [7]:
results = {}


## llama3.2:3b

In [8]:
llm_llama32 = ChatOllama(model="llama3.2", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_llama32(text,threshold=0,debug=False):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_llama32.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

result = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_llama32,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_llama32,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_llama32,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["llama3.2:3b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["llama3.2:3b"])


([(0.7372448979591837, 0.847256549678695), (0.7440476190476191, 0.850620347394541), (0.737872340425532, 0.8475247524752475)], 329.626776, 0.09345811624610151)


In [9]:
results

{'llama3.2:3b': ([(0.7372448979591837, 0.847256549678695),
   (0.7440476190476191, 0.850620347394541),
   (0.737872340425532, 0.8475247524752475)],
  329.626776,
  0.09345811624610151)}

## qwen2:7b

In [10]:
llm_qwen2 = ChatOllama(model="qwen2", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_qwen2(text,threshold=0):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_qwen2.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

result = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_qwen2,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_qwen2,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_qwen2,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["qwen2:7b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["qwen2:7b"])

([(0.7474489795918368, 0.8427739544732662), (0.7389455782312925, 0.8383359662980516), (0.7659574468085106, 0.8551869404949973)], 398.222089, 0.11290674482563084)


In [11]:
results

{'llama3.2:3b': ([(0.7372448979591837, 0.847256549678695),
   (0.7440476190476191, 0.850620347394541),
   (0.737872340425532, 0.8475247524752475)],
  329.626776,
  0.09345811624610151),
 'qwen2:7b': ([(0.7474489795918368, 0.8427739544732662),
   (0.7389455782312925, 0.8383359662980516),
   (0.7659574468085106, 0.8551869404949973)],
  398.222089,
  0.11290674482563084)}

## qwen2.5:7b

In [12]:
llm_qwen25_7 = ChatOllama(model="qwen2.5:7b", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_qwen25_7(text,threshold=0):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_qwen25_7.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

result = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_qwen25_7,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_qwen25_7,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_qwen25_7,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["qwen2.5:7b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["qwen2.5:7b"])

([(0.6989795918367347, 0.7870036101083032), (0.701530612244898, 0.7931644077784326), (0.7302127659574468, 0.8123149792776792)], 413.610572, 0.11726979642755883)


In [13]:
results

{'llama3.2:3b': ([(0.7372448979591837, 0.847256549678695),
   (0.7440476190476191, 0.850620347394541),
   (0.737872340425532, 0.8475247524752475)],
  329.626776,
  0.09345811624610151),
 'qwen2:7b': ([(0.7474489795918368, 0.8427739544732662),
   (0.7389455782312925, 0.8383359662980516),
   (0.7659574468085106, 0.8551869404949973)],
  398.222089,
  0.11290674482563084),
 'qwen2.5:7b': ([(0.6989795918367347, 0.7870036101083032),
   (0.701530612244898, 0.7931644077784326),
   (0.7302127659574468, 0.8123149792776792)],
  413.610572,
  0.11726979642755883)}

## qwen2.5:14b

In [14]:
llm_qwen25_14 = ChatOllama(model="qwen2.5:14b", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_qwen25_14(text,threshold=0):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_qwen25_14.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

f1 = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_qwen25_14,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_qwen25_14,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_qwen25_14,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["qwen2.5:14b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["qwen2.5:14b"])

([(0.6989795918367347, 0.7870036101083032), (0.701530612244898, 0.7931644077784326), (0.7302127659574468, 0.8123149792776792)], 2217.164514, 0.6286261735185711)


In [15]:
results

{'llama3.2:3b': ([(0.7372448979591837, 0.847256549678695),
   (0.7440476190476191, 0.850620347394541),
   (0.737872340425532, 0.8475247524752475)],
  329.626776,
  0.09345811624610151),
 'qwen2:7b': ([(0.7474489795918368, 0.8427739544732662),
   (0.7389455782312925, 0.8383359662980516),
   (0.7659574468085106, 0.8551869404949973)],
  398.222089,
  0.11290674482563084),
 'qwen2.5:7b': ([(0.6989795918367347, 0.7870036101083032),
   (0.701530612244898, 0.7931644077784326),
   (0.7302127659574468, 0.8123149792776792)],
  413.610572,
  0.11726979642755883),
 'qwen2.5:14b': ([(0.6989795918367347, 0.7870036101083032),
   (0.701530612244898, 0.7931644077784326),
   (0.7302127659574468, 0.8123149792776792)],
  2217.164514,
  0.6286261735185711)}

In [16]:
# save results to pickle file
file = open('llm_evaluation_results_babacan', 'wb')
pickle.dump(results, file)
file.close()

In [17]:
# read results from pickle file for sanity check
file = open('llm_evaluation_results_babacan', 'rb')
results = pickle.load(file)
file.close()
results

{'llama3.2:3b': ([(0.7372448979591837, 0.847256549678695),
   (0.7440476190476191, 0.850620347394541),
   (0.737872340425532, 0.8475247524752475)],
  329.626776,
  0.09345811624610151),
 'qwen2:7b': ([(0.7474489795918368, 0.8427739544732662),
   (0.7389455782312925, 0.8383359662980516),
   (0.7659574468085106, 0.8551869404949973)],
  398.222089,
  0.11290674482563084),
 'qwen2.5:7b': ([(0.6989795918367347, 0.7870036101083032),
   (0.701530612244898, 0.7931644077784326),
   (0.7302127659574468, 0.8123149792776792)],
  413.610572,
  0.11726979642755883),
 'qwen2.5:14b': ([(0.6989795918367347, 0.7870036101083032),
   (0.701530612244898, 0.7931644077784326),
   (0.7302127659574468, 0.8123149792776792)],
  2217.164514,
  0.6286261735185711)}

In [18]:
grand_t1 = dt.datetime.now()
grand_delta = grand_t1 - grand_t0
str(grand_delta)

'0:56:02.085951'