In [1]:
import datetime as dt
import pandas as pd
import numpy as np
import pickle
from langchain_ollama.chat_models import ChatOllama

grand_t0 = dt.datetime.now()

# Create Datasets
## Merge original binary and multiclass datasets into one

In [2]:
# Dataset: Multiple Distorions (sagarikashreevastava)
# Paper: https://aclanthology.org/2021.clpsych-1.17/
# Data: https://www.kaggle.com/datasets/sagarikashreevastava/cognitive-distortion-detetction-dataset

# !pip install kagglehub
import kagglehub
multiclass_dataset_path = kagglehub.dataset_download("sagarikashreevastava/cognitive-distortion-detetction-dataset")
print("Path to dataset files:", multiclass_dataset_path)
multiclass_dataset_file_path = multiclass_dataset_path + "/Annotated_data.csv"


Path to dataset files: C:\Users\anton\.cache\kagglehub\datasets\sagarikashreevastava\cognitive-distortion-detetction-dataset\versions\1


In [3]:
df2 = pd.read_csv(multiclass_dataset_file_path) 
df2 = df2.drop('Id_Number', axis=1) # delete columnb with id 
df2

Unnamed: 0,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional)
0,"Hello, I have a beautiful,smart,outgoing and a...",The voice are always fimilar (someone she know...,Personalization,
1,Since I was about 16 years old I’ve had these ...,I feel trapped inside my disgusting self and l...,Labeling,Emotional Reasoning
2,So I’ve been dating on and off this guy for a...,,No Distortion,
3,My parents got divorced in 2004. My mother has...,,No Distortion,
4,I don’t really know how to explain the situati...,I refused to go because I didn’t know if it wa...,Fortune-telling,Emotional Reasoning
...,...,...,...,...
2525,I’m a 21 year old female. I spent most of my l...,,No Distortion,
2526,I am 21 female and have not had any friends fo...,Now I am at university my peers around me all ...,Overgeneralization,
2527,From the U.S.: My brother is 19 years old and ...,He claims he’s severely depressed and has outb...,Mental filter,Mind Reading
2528,From the U.S.: I am a 21 year old woman who ha...,,No Distortion,


In [4]:
df3 = df2
df3

Unnamed: 0,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional)
0,"Hello, I have a beautiful,smart,outgoing and a...",The voice are always fimilar (someone she know...,Personalization,
1,Since I was about 16 years old I’ve had these ...,I feel trapped inside my disgusting self and l...,Labeling,Emotional Reasoning
2,So I’ve been dating on and off this guy for a...,,No Distortion,
3,My parents got divorced in 2004. My mother has...,,No Distortion,
4,I don’t really know how to explain the situati...,I refused to go because I didn’t know if it wa...,Fortune-telling,Emotional Reasoning
...,...,...,...,...
2525,I’m a 21 year old female. I spent most of my l...,,No Distortion,
2526,I am 21 female and have not had any friends fo...,Now I am at university my peers around me all ...,Overgeneralization,
2527,From the U.S.: My brother is 19 years old and ...,He claims he’s severely depressed and has outb...,Mental filter,Mind Reading
2528,From the U.S.: I am a 21 year old woman who ha...,,No Distortion,


## Create 3 split datasets for evaluation

### Evaluate new split models individually for each of 3 parts against the models learned on the other 2 parts 
- parts
  - part1 = df[df.index % 3 == 1] # dfs[1] - test on model from third_split
  - part2 = df[df.index % 3 == 2] # dfs[2] - test on model from second_split
  - part3 = df[df.index % 3 == 0] # dfs[0] - test on model from first_split
- train-test splits
  - (pd.concat([part1, part2]), part3),  # first_split:  (1 + 2) -> train, (3) -> test
  - (pd.concat([part1, part3]), part2),  # second_split: (1 + 3) -> train, (2) -> test
  - (pd.concat([part2, part3]), part1)   # third_split:  (2 + 3) -> train, (1) -> test

In [5]:
df3s = [ df3.iloc[:0,:].copy(), df3.iloc[:0,:].copy(), df3.iloc[:0,:].copy()]

row_n = 0 
for _, row in df3.iterrows():
    r3 = row_n % 3
    row_n += 1
    df = df3s[r3]
    df.loc[len(df)] = row

print(len(df3))
for df in df3s:
    print(len(df))

2530
844
843
843


# Create Evaluation Functions

In [6]:
def f1_from_counts(true_positive, true_negative, false_positive, false_negative):
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    return 2 * precision * recall / (precision + recall) if precision > 0 or recall > 0 else 0 

def evaluate_df_counts(df,evaluator,threshold,debug=False):
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    for _, row in df.iterrows():
        # Text definition: first, check the 2nd column; if NaN, take the text from the 1st column.
        text = row.iloc[1] if pd.notna(row.iloc[1]) else row.iloc[0]
        primary_distortion = row.iloc[2]  # The main cognitive distortion from the 3rd column
        secondary_distortion = row.iloc[3] if pd.notna(row.iloc[3]) else None  # The secondary distortion from the 4th column, if it exists
        ground_distortion = False if primary_distortion == 'No Distortion' else True
                       
        our_distortion = evaluator(text,threshold)
        
        # https://en.wikipedia.org/wiki/F-score
        if ground_distortion == True and our_distortion == True:
            true_positive += 1
        if ground_distortion == False and our_distortion == True:
            false_positive += 1
        if ground_distortion == False and our_distortion == False:
            true_negative += 1
        if ground_distortion == True and our_distortion == False:
            false_negative += 1

        if debug:
            print(ground_distortion,our_distortion,text)

    return true_positive, true_negative, false_positive, false_negative


def evaluate_df(df,evaluator,threshold,debug=False):
    true_positive, true_negative, false_positive, false_negative = evaluate_df_counts(df,evaluator,threshold,debug)
    return f1_from_counts(true_positive, true_negative, false_positive, false_negative) 


def evaluate_df_acc_f1(df,evaluator,threshold,debug=False):
    true_positive, true_negative, false_positive, false_negative = evaluate_df_counts(df,evaluator,threshold,debug)
    return (true_positive + true_negative) / len(df), f1_from_counts(true_positive, true_negative, false_positive, false_negative) 


# Evaluate Different Models

In [7]:
results = {}


## llama3.2:3b

In [8]:
llm_llama32 = ChatOllama(model="llama3.2", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_llama32(text,threshold=0,debug=False):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_llama32.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

result = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_llama32,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_llama32,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_llama32,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["llama3.2:3b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["llama3.2:3b"])


([(0.6208530805687204, 0.7629629629629631), (0.6465005931198102, 0.7840579710144928), (0.638196915776987, 0.7732342007434944)], 1500.192061, 0.5929612889328063)


In [9]:
results

{'llama3.2:3b': ([(0.6208530805687204, 0.7629629629629631),
   (0.6465005931198102, 0.7840579710144928),
   (0.638196915776987, 0.7732342007434944)],
  1500.192061,
  0.5929612889328063)}

## qwen2:7b

In [10]:
llm_qwen2 = ChatOllama(model="qwen2", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_qwen2(text,threshold=0):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_qwen2.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

result = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_qwen2,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_qwen2,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_qwen2,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["qwen2:7b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["qwen2:7b"])

([(0.6753554502369669, 0.7705192629815746), (0.693950177935943, 0.787828947368421), (0.6654804270462633, 0.7653910149750416)], 377.589437, 0.14924483675889327)


In [11]:
results

{'llama3.2:3b': ([(0.6208530805687204, 0.7629629629629631),
   (0.6465005931198102, 0.7840579710144928),
   (0.638196915776987, 0.7732342007434944)],
  1500.192061,
  0.5929612889328063),
 'qwen2:7b': ([(0.6753554502369669, 0.7705192629815746),
   (0.693950177935943, 0.787828947368421),
   (0.6654804270462633, 0.7653910149750416)],
  377.589437,
  0.14924483675889327)}

## qwen2.5:7b

In [12]:
llm_qwen25_7 = ChatOllama(model="qwen2.5:7b", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_qwen25_7(text,threshold=0):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_qwen25_7.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

result = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_qwen25_7,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_qwen25_7,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_qwen25_7,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["qwen2.5:7b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["qwen2.5:7b"])

([(0.6457345971563981, 0.6895119418483905), (0.6298932384341637, 0.6835699797160244), (0.6215895610913404, 0.6734902763561924)], 5460.135811, 2.158156447035573)


In [13]:
results

{'llama3.2:3b': ([(0.6208530805687204, 0.7629629629629631),
   (0.6465005931198102, 0.7840579710144928),
   (0.638196915776987, 0.7732342007434944)],
  1500.192061,
  0.5929612889328063),
 'qwen2:7b': ([(0.6753554502369669, 0.7705192629815746),
   (0.693950177935943, 0.787828947368421),
   (0.6654804270462633, 0.7653910149750416)],
  377.589437,
  0.14924483675889327),
 'qwen2.5:7b': ([(0.6457345971563981, 0.6895119418483905),
   (0.6298932384341637, 0.6835699797160244),
   (0.6215895610913404, 0.6734902763561924)],
  5460.135811,
  2.158156447035573)}

## qwen2.5:14b

In [14]:
llm_qwen25_14 = ChatOllama(model="qwen2.5:14b", base_url="http://localhost:11434")  # Explicitly set base_url
def evaluator_llm_qwen25_14(text,threshold=0):
    query = f"Be concise. Answer simply yes or no. Does the following text contain cognitive distortions known in cognitive behavioral therapy?\n{text}?"
    r = llm_qwen25_14.invoke(query).content
    return r.lower().startswith("yes")

t0 = dt.datetime.now()

f1 = [ 
    evaluate_df_acc_f1(df3s[0],evaluator_llm_qwen25_14,0,debug=False),
    evaluate_df_acc_f1(df3s[1],evaluator_llm_qwen25_14,0,debug=False),
    evaluate_df_acc_f1(df3s[2],evaluator_llm_qwen25_14,0,debug=False),
]

t1 = dt.datetime.now()
delta = t1 - t0
results["qwen2.5:14b"] = (result,delta.total_seconds(),delta.total_seconds()/len(df3))
print(results["qwen2.5:14b"])

([(0.6457345971563981, 0.6895119418483905), (0.6298932384341637, 0.6835699797160244), (0.6215895610913404, 0.6734902763561924)], 34236.625295, 13.532262962450591)


In [15]:
results

{'llama3.2:3b': ([(0.6208530805687204, 0.7629629629629631),
   (0.6465005931198102, 0.7840579710144928),
   (0.638196915776987, 0.7732342007434944)],
  1500.192061,
  0.5929612889328063),
 'qwen2:7b': ([(0.6753554502369669, 0.7705192629815746),
   (0.693950177935943, 0.787828947368421),
   (0.6654804270462633, 0.7653910149750416)],
  377.589437,
  0.14924483675889327),
 'qwen2.5:7b': ([(0.6457345971563981, 0.6895119418483905),
   (0.6298932384341637, 0.6835699797160244),
   (0.6215895610913404, 0.6734902763561924)],
  5460.135811,
  2.158156447035573),
 'qwen2.5:14b': ([(0.6457345971563981, 0.6895119418483905),
   (0.6298932384341637, 0.6835699797160244),
   (0.6215895610913404, 0.6734902763561924)],
  34236.625295,
  13.532262962450591)}

In [16]:
# save results to pickle file
file = open('llm_evaluation_results_shreevastava', 'wb')
pickle.dump(results, file)
file.close()

In [17]:
# read results from pickle file for sanity check
file = open('llm_evaluation_results_shreevastava', 'rb')
results = pickle.load(file)
file.close()
results

{'llama3.2:3b': ([(0.6208530805687204, 0.7629629629629631),
   (0.6465005931198102, 0.7840579710144928),
   (0.638196915776987, 0.7732342007434944)],
  1500.192061,
  0.5929612889328063),
 'qwen2:7b': ([(0.6753554502369669, 0.7705192629815746),
   (0.693950177935943, 0.787828947368421),
   (0.6654804270462633, 0.7653910149750416)],
  377.589437,
  0.14924483675889327),
 'qwen2.5:7b': ([(0.6457345971563981, 0.6895119418483905),
   (0.6298932384341637, 0.6835699797160244),
   (0.6215895610913404, 0.6734902763561924)],
  5460.135811,
  2.158156447035573),
 'qwen2.5:14b': ([(0.6457345971563981, 0.6895119418483905),
   (0.6298932384341637, 0.6835699797160244),
   (0.6215895610913404, 0.6734902763561924)],
  34236.625295,
  13.532262962450591)}

In [18]:
grand_t1 = dt.datetime.now()
grand_delta = grand_t1 - grand_t0
str(grand_delta)

'11:32:57.018331'