In [48]:
import os, sys, math
cwd = os.getcwd()
project_path = cwd[:cwd.find('pygents')+7]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

from pygents.aigents_api import tokenize_re
from pygents.util import dictcount
from pygents.aigents_api import build_ngrams
from pygents.plot import plot_dict

from pygents.aigents_api import TextMetrics

def language_metrics(metrics_list):
    metrics = {}
    for m in metrics_list:
        metrics[m] = './data/models/distortions/overfitting_combined/multiclass_view/' + m + '.txt'
        #metrics[m] = './data/dict/' + 'en' + '/' + m + '.txt'
    return metrics

In [49]:
binary_dataset_file_path = "./data/corpora/English/distortions/halilbabacan/raw_Cognitive_distortions.csv" 

import kagglehub
multiclass_dataset_path = kagglehub.dataset_download("sagarikashreevastava/cognitive-distortion-detetction-dataset")
multiclass_dataset_file_path = multiclass_dataset_path + "/Annotated_data.csv"

df1 = pd.read_csv(binary_dataset_file_path)
df1 = df1.rename(columns={'Text': 'Patient Question', 'Label': 'Dominant Distortion'})
df1.insert(1, "Distorted part", value = np.nan)
df1.insert(3, "Secondary Distortion (Optional)l", value = np.nan)

df2 = pd.read_csv(multiclass_dataset_file_path) 
df2 = df2.drop('Id_Number', axis=1) # delete columnb with id 

df3 = pd.concat([df1, df2], ignore_index=True)
df3



Unnamed: 0,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional)l,Secondary Distortion (Optional)
0,I'm such a failure I never do anything right.,,Distortion,,
1,Nobody likes me because I'm not interesting.,,Distortion,,
2,I can't try new things because I'll just mess...,,Distortion,,
3,My boss didn't say 'good morning' she must be...,,Distortion,,
4,My friend didn't invite me to the party I mus...,,Distortion,,
...,...,...,...,...,...
6052,I’m a 21 year old female. I spent most of my l...,,No Distortion,,
6053,I am 21 female and have not had any friends fo...,Now I am at university my peers around me all ...,Overgeneralization,,
6054,From the U.S.: My brother is 19 years old and ...,He claims he’s severely depressed and has outb...,Mental filter,,Mind Reading
6055,From the U.S.: I am a 21 year old woman who ha...,,No Distortion,,


In [50]:
def analyse_normalized_ngrams(ngram_max, df, analytics_method):
    distortions = defaultdict(int)

    # Creating dictionaries for counting n-grams
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    
    uniq_n_gram_dicts = defaultdict(lambda: defaultdict(int)) # Counts of uniq N-grams by Distortion
    uniq_all_n_grams = defaultdict(int)  # A general dictionary for all n-grams uniq by text
    n_gram_distortions = defaultdict(lambda: defaultdict(int)) # Counts of distortiions by N-gram

    # Loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row.iloc[1] if pd.notna(row.iloc[1]) else row.iloc[0]
        primary_distortion = row.iloc[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row.iloc[3] if pd.notna(row.iloc[3]) else None  # The secondary distortion from the 4th column, if present

        dictcount(distortions,primary_distortion)
        if secondary_distortion:
            dictcount(distortions,secondary_distortion)
        
        # Text tokenization
        tokens = tokenize_re(text)

        # Generation and counting of n-grams (from 1 to 4)
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)  # Increment the counter for the corresponding primary distortion
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams) # Increment the counter for the corresponding secondary distortion (if present)

            uniq_n_grams = set(n_grams)
            for uniq_n_gram in uniq_n_grams:
                dictcount(uniq_n_gram_dicts[primary_distortion], uniq_n_gram)
                dictcount(uniq_all_n_grams, uniq_n_gram)
                dictcount(n_gram_distortions[uniq_n_gram],primary_distortion)
                if secondary_distortion:
                    dictcount(uniq_n_gram_dicts[secondary_distortion], uniq_n_gram)
                    dictcount(n_gram_distortions[uniq_n_gram],secondary_distortion)
                
    # Normalizing distortion-specific counts by total counts
    norm_n_gram_dicts = {}
    for n_gram_dict in n_gram_dicts:
        norm_n_gram_dict = {}
        norm_n_gram_dicts[n_gram_dict] = norm_n_gram_dict
        dic = n_gram_dicts[n_gram_dict]
        for n_gram in dic:
            #print(dic[n_gram])
            #print(all_n_grams[n_gram])
            #break
            if len(n_gram) <= ngram_max:
                norm_n_gram_dict[n_gram] = float( dic[n_gram] ) / all_n_grams[n_gram]

    # Normalize uniq counts 
    norm_uniq_n_gram_dicts = {}
    for uniq_n_gram_dict in uniq_n_gram_dicts:
        norm_uniq_n_gram_dict = {}
        norm_uniq_n_gram_dicts[uniq_n_gram_dict] = norm_uniq_n_gram_dict
        dic = uniq_n_gram_dicts[uniq_n_gram_dict]
        nonuniq_dic = n_gram_dicts[n_gram_dict]
        # Normalize uniq Document counts of N-grams by distortion by Documents count by Distortion
        for n_gram in dic:
            if len(n_gram) <= ngram_max:
                #norm_uniq_n_gram_dict[n_gram] = float( dic[n_gram] ) * nonuniq_dic[n_gram] / distortions[uniq_n_gram_dict] / len(n_gram_distortions[n_gram]) / all_n_grams[n_gram]
                norm_uniq_n_gram_dict[n_gram] = float( dic[n_gram] ) / distortions[uniq_n_gram_dict] / len(n_gram_distortions[n_gram])

    if analytics_method == 'normalize':
        return norm_n_gram_dicts
    else:
        return norm_uniq_n_gram_dicts



def analyse_frequency(ngram_max, df):
    print('ANALYSE FREQUENCY')
    # Analyze the frequency of n-grams for each cognitive distortion

    # Creating dictionaries for counting n-grams
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)

    # Loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0]
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Generation and counting of n-grams (from 1 to 4)
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            dictcount(n_gram_dicts[primary_distortion], n_grams)  # Increment the counter for the corresponding primary distortion
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams) # Increment the counter for the corresponding secondary distortion (if present)

    return n_gram_dicts



def analyse_TF_IDF(ngram_max, df):
    print('ANALYSE TF-IDF')
    # Analyze TF-IDF values for n-grams for each cognitive distortion

    # Creating dictionaries for counting n-grams
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    doc_counts = defaultdict(int)  # The number of documents in which each n-gram appears


    # Loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0]
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Generate n-grams and update the document counters where they appear
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)  # Increment the counter for the corresponding primary distortion
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams) # Increment the counter for the corresponding secondary distortion (if present)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1

    # The total number of texts
    total_docs = len(df)

    # TF-IDF Calculation
    tfidf_dicts = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), analyze the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            tf = count / sum(ngram_dict.values())  # Frequency of the n-gram in the text (TF): TF = (Number of occurrences of the given n-gram for the specific cognitive distortion) / (Total number of occurrences of all other n-grams for the same cognitive distortion)
            idf = math.log(total_docs / (1 + doc_counts[n_gram]))  # Inverse Document Frequency (IDF): IDF = Total number of documents / Number of documents containing the given n-gram
            tfidf_dicts[distortion][n_gram] = tf * idf  # TF-IDF

    return tfidf_dicts



def analyse_TFTF_tf(ngram_max, df):
    print('ANALYSE TFTF')
    # Analyze TFTF_tf values for n-grams for each cognitive distortion (see http://webstructor.net/papers/Kolonin-HP-ACA-IC-text.pdf)

    # Dictionaries for metrics
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    unique_n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-unique_n-gram-n_gram_frequency)

    # Counting documents for n-grams
    doc_counts = defaultdict(int)  # The number of texts in which each n-gram appears

    # The main loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0] 
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Look at unique n-grams and increment the document counters
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1
            unique_n_gram_dicts[primary_distortion][n_gram] += 1
            if secondary_distortion:
                unique_n_gram_dicts[secondary_distortion][n_gram] += 1

    # The total number of texts
    total_docs = len(df)

    # Calculation of the TFTF_tf
    tftf_results = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), examine the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            # TFTF: Mutual relevance of features and text (how important a given n-gram is for describing the text, looking for specific n-grams within the text)
            tf = count / sum(ngram_dict.values()) # Frequency of the n-gram in the text (TF): TF = (Number of times the given n-gram appears for the specific cognitive distortion) / (Number of times all other n-grams appear for the same cognitive distortion)
            ft = doc_counts[n_gram] / total_docs # The number of texts in which the current n-gram appears / the total number of texts in the dataset
            Ft = sum(ngram_dict.values())
            tftf = (tf ** 2) / (ft*Ft if ft*Ft > 0 else 1)

            tftf_results[distortion][n_gram] = tftf

    return tftf_results



def analyse_TCTC_tc(ngram_max, df):
    print('ANALYSE TCTC')
    # Analyze TCTC_tc values for n-grams for each cognitive distortion (see http://webstructor.net/papers/Kolonin-HP-ACA-IC-text.pdf)

    # Dictionaries for metrics
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    unique_n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-unique_n-gram-n_gram_frequency)

    # Counting documents for n-grams
    doc_counts = defaultdict(int)  # The number of texts in which each n-gram appears

    # The main loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0] 
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Look at unique n-grams and increment the document counters
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1
            unique_n_gram_dicts[primary_distortion][n_gram] += 1
            if secondary_distortion:
                unique_n_gram_dicts[secondary_distortion][n_gram] += 1

    # The total number of texts
    total_docs = len(df)

    # Calculation of the TCTC_tc
    tctc_results = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), examine the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            # TCTC: Mutual relevance of categories and text (how the given n-gram is distributed across all texts of a given category)
            ct = unique_n_gram_dicts[distortion][n_gram] # The number of texts in which the current n-gram (n_gram) is associated with the current distortion (distortion)
            tctc = (ct ** 2) / (doc_counts[n_gram] * total_docs) # In the denominator: the number of texts containing the given n-gram, regardless of distortion * the total number of texts

            tctc_results[distortion][n_gram] = tctc

    return tctc_results



def analyse_CFCF_cf(ngram_max, df):
    print('ANALYSE CFCF')
    # Analyze CFCF_cf values for n-grams for each cognitive distortion (see http://webstructor.net/papers/Kolonin-HP-ACA-IC-text.pdf)

    # Dictionaries for metrics
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    unique_n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-unique_n-gram-n_gram_frequency)

    # Counting documents for n-grams
    doc_counts = defaultdict(int)  # The number of texts in which each n-gram appears

    # The main loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0] 
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Look at unique n-grams and increment the document counters
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1
            unique_n_gram_dicts[primary_distortion][n_gram] += 1
            if secondary_distortion:
                unique_n_gram_dicts[secondary_distortion][n_gram] += 1

    # Calculation of the CFCF_cf
    cfcf_results = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), examine the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            # CFCF: Mutual relevance of features and categories (how characteristic the given n-gram is for a particular category)
            cf = unique_n_gram_dicts[distortion][n_gram]
            fc = sum(unique_n_gram_dicts[distortion].values()) # The total number of texts for all n-grams associated with the given distortion
            cfcf = (cf ** 2) / (cf * fc if cf * fc > 0 else 1)

            cfcf_results[distortion][n_gram] = cfcf

    return cfcf_results

In [63]:
def cut_off_upper_ngrams(model_ngram, upper_cut_off_percentage):
    filtered_model_ngram = {}
    for distortion, ngram_dict in model_ngram.items():
        # Find the maximum metric value for the current distortion
        max_value = max(ngram_dict.values()) if ngram_dict else 0
        threshold_value = max_value * (upper_cut_off_percentage / 100)

        # Filter n-grams that meet or exceed the threshold value
        filtered_model_ngram[distortion] = {
            ngram: metric for ngram, metric in ngram_dict.items() if metric >= threshold_value
        }
    return filtered_model_ngram


def create_model_files(filtered_model_ngram):
    # Create .txt files for each distortion
    output_dir = "./data/models/distortions/overfitting_combined/multiclass_view"
    distortion_file_path = f"{output_dir}/All_distortions.txt"
    distortions_labels = []

    with open(distortion_file_path, "w", encoding="utf-8") as distortion_file:
        for distortion, ngrams in filtered_model_ngram.items():
            file_path = f"{output_dir}/{distortion}.txt"
            sorted_ngrams = sorted(ngrams.items(), key=lambda x: x[1], reverse=True)
            distortions_labels.append(distortion)
            with open(file_path, "w", encoding="utf-8") as f:
                for ngram, metric_value in sorted_ngrams:
                    ngram_str = ' '.join(ngram)
                    f.write(f"{ngram_str}\t{metric_value}\n")
                    if distortion != "No Distortion":
                            distortion_file.write(f"{ngram_str}\t{metric_value}\n")
    
    old_distortion_file = f"{output_dir}/Distortion.txt"
    os.remove(old_distortion_file)
    os.rename(distortion_file_path, old_distortion_file)

    #return (list(set(distortions_labels) - {'No Distortion'} - {'Distortion'}))
    #return ({'Distortion'})
    #return (list(set(distortions_labels)))
    return (list(set(distortions_labels) - {'No Distortion'}))

In [52]:
def f1_from_counts(true_positive, true_negative, false_positive, false_negative):
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    return 2 * precision * recall / (precision + recall) if precision > 0 or recall > 0 else 0 

def evaluate_df_counts(df,evaluator,threshold, tm, debug=False):
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    for _, row in df.iterrows():
        # Text definition: first, check the 2nd column; if NaN, take the text from the 1st column.
        text = row.iloc[1] if pd.notna(row.iloc[1]) else row.iloc[0]
        primary_distortion = row.iloc[2]  # The main cognitive distortion from the 3rd column
        secondary_distortion = row.iloc[3] if pd.notna(row.iloc[3]) else None  # The secondary distortion from the 4th column, if it exists
        ground_distortion = False if primary_distortion == 'No Distortion' else True
                       
        our_distortion = evaluator(text,threshold, tm)
        
        # https://en.wikipedia.org/wiki/F-score
        if ground_distortion == True and our_distortion == True:
            true_positive += 1
        if ground_distortion == False and our_distortion == True:
            false_positive += 1
        if ground_distortion == False and our_distortion == False:
            true_negative += 1
        if ground_distortion == True and our_distortion == False:
            false_negative += 1

        if debug:
            print(ground_distortion,our_distortion,text[:20],metrics)

    return true_positive, true_negative, false_positive, false_negative

def evaluate_df(df,evaluator,threshold,tm, debug=False):
    true_positive, true_negative, false_positive, false_negative = evaluate_df_counts(df,evaluator,threshold,tm,debug)
    return f1_from_counts(true_positive, true_negative, false_positive, false_negative)

def evaluate_df_acc_f1(df,evaluator,threshold,tm,debug=False):
    true_positive, true_negative, false_positive, false_negative = evaluate_df_counts(df,evaluator,threshold,tm,debug)
    return (true_positive + true_negative) / len(df), f1_from_counts(true_positive, true_negative, false_positive, false_negative) 

def our_evaluator_any(text,threshold, tm):
    metrics = tm.get_sentiment_words(text)
    for m in metrics:
        if metrics[m] > threshold:
            return True
    return False

def our_evaluator_avg(text,threshold, tm):
    metrics = tm.get_sentiment_words(text)
    l = list(metrics.values())
    avg = sum(l) / len(l) if  len(l) > 0 else 0
    if avg > threshold:
        return True
    return False

In [53]:
def analyse_dataset(analytics_method, ngram_max, upper_cut_off_percentage, df):

    print('Analytics method:', analytics_method)
    print('N-gram max length:', ngram_max)
    print('Upper cut-off percentage:', upper_cut_off_percentage)

    if analytics_method == 'frequency':
        model_ngram = analyse_frequency(ngram_max, df)
    elif analytics_method == 'TF-IDF':
        model_ngram = analyse_TF_IDF(ngram_max, df)
    elif analytics_method == 'TFTF_tf':
        model_ngram = analyse_TFTF_tf(ngram_max, df)
    elif analytics_method == 'TCTC_tc':
        model_ngram = analyse_TCTC_tc(ngram_max, df)
    elif analytics_method == 'CFCF_cf':
        model_ngram = analyse_CFCF_cf(ngram_max, df)
    elif analytics_method == 'normalize' or analytics_method == 'normalize_uniq':
        model_ngram = analyse_normalized_ngrams(ngram_max, df, analytics_method)

    # Filter out values below the threshold
    filtered_model_ngram = cut_off_upper_ngrams(model_ngram, upper_cut_off_percentage)

    # Create .txt files for each distortion
    distortions_labels = create_model_files(filtered_model_ngram)
    tm = TextMetrics(language_metrics(distortions_labels), encoding = "utf-8", debug=False)

    print('\tAny distortion (threshold, accuracy, F1 score):')
    any_res_acc = {}
    any_res = {}
    for threshold in [0.0,0.01,0.05,0.1,0.2,0.4,0.6,0.8]:
        acc, f1 = evaluate_df_acc_f1(df3,our_evaluator_any,threshold, tm)
        any_res_acc[threshold] = acc
        any_res[threshold] = f1
        print('\t', threshold, acc, f1)
    avg_res_acc = {}
    avg_res = {}
    print('\tAverage distortion (threshold, accuracy, F1 score):')
    for threshold in [0.0,0.01,0.05,0.1,0.2,0.4,0.6,0.8]:
        acc, f1 = evaluate_df_acc_f1(df3,our_evaluator_avg,threshold, tm)
        avg_res_acc[threshold] = acc
        avg_res[threshold] = f1
        print('\t', threshold, acc, f1)

In [55]:

for analytics_method in ['normalize', 'normalize_uniq', 'TF-IDF', 'frequency', 'TFTF_tf', 'TCTC_tc', 'CFCF_cf']:
    for ngram_max in range (1, 5):
        for upper_cut_off_percentage in [10, 20, 30]:
            analyse_dataset(analytics_method, ngram_max, upper_cut_off_percentage, df3)


Analytics method: normalize
N-gram max length: 1
Upper cut-off percentage: 10
	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6919266963843487 0.8179156908665105
	 0.4 0.6919266963843487 0.8179156908665105
	 0.6 0.6919266963843487 0.8179156908665105
	 0.8 0.6919266963843487 0.8179156908665105
	Average distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6922568928512465 0.8180753464766739
	 0.4 0.8872379065543998 0.9202568593111501
	 0.6 0.6291893676737659 0.6356262167423751
	 0.8 0.3954102691101205 0.22546531302876482
Analytics method: normalize
N-gram max length: 1
Upper cut-off percentage: 20
	Any distortion (threshold, accur

  text = row[1] if pd.notna(row[1]) else row[0]
  primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
  secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present


	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6919266963843487 0.8179156908665105
	 0.4 0.6919266963843487 0.8179156908665105
	 0.6 0.6917615981508998 0.8178003318044307
	 0.8 0.689780419349513 0.816162802074161
	Average distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6919266963843487 0.8179156908665105
	 0.4 0.6919266963843487 0.8179156908665105
	 0.6 0.6917615981508998 0.8178003318044307
	 0.8 0.6504870397886743 0.7836042113871
Analytics method: TF-IDF
N-gram max length: 1
Upper cut-off percentage: 20
ANALYSE TF-IDF
	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.691

  text = row[1] if pd.notna(row[1]) else row[0]
  primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
  secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present


	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6909361069836553 0.8172231985940246
	 0.01 0.6909361069836553 0.8172231985940246
	 0.05 0.6909361069836553 0.8172231985940246
	 0.1 0.6909361069836553 0.8172231985940246
	 0.2 0.6909361069836553 0.8172231985940246
	 0.4 0.6907710087502064 0.8171077043257495
	 0.6 0.689615321116064 0.8162267839687195
	 0.8 0.6569258708931814 0.7383530596826995
	Average distortion (threshold, accuracy, F1 score):
	 0.0 0.6909361069836553 0.8172231985940246
	 0.01 0.6909361069836553 0.8172231985940246
	 0.05 0.6909361069836553 0.8172231985940246
	 0.1 0.6909361069836553 0.8172231985940246
	 0.2 0.6909361069836553 0.8172231985940246
	 0.4 0.6907710087502064 0.8171077043257495
	 0.6 0.6838368829453525 0.811904528042432
	 0.8 0.448571900280667 0.361376673040153
Analytics method: frequency
N-gram max length: 1
Upper cut-off percentage: 20
ANALYSE FREQUENCY
	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6902757140498597 0.8167610861496386
	 0.0

  text = row[1] if pd.notna(row[1]) else row[0]
  primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
  secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present


	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6906059105167575 0.8169921875
	 0.01 0.6906059105167575 0.8169921875
	 0.05 0.6906059105167575 0.8169921875
	 0.1 0.6906059105167575 0.8169921875
	 0.2 0.6906059105167575 0.8169921875
	 0.4 0.6904408122833086 0.816876648110167
	 0.6 0.6788839359418855 0.807939172509134
	 0.8 0.4480766055803203 0.3399802566633761
	Average distortion (threshold, accuracy, F1 score):
	 0.0 0.6906059105167575 0.8169921875
	 0.01 0.6906059105167575 0.8169921875
	 0.05 0.6906059105167575 0.8169921875
	 0.1 0.6906059105167575 0.8169921875
	 0.2 0.6906059105167575 0.8169921875
	 0.4 0.6901106158164108 0.8164303178484107
	 0.6 0.6518078256562655 0.750561797752809
	 0.8 0.30906389301634474 0.002859185132237312
Analytics method: TFTF_tf
N-gram max length: 1
Upper cut-off percentage: 20
ANALYSE TFTF
	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6868086511474327 0.8143290594107859
	 0.01 0.6868086511474327 0.8143290594107859
	 0.05 0.68680865114743

  text = row[1] if pd.notna(row[1]) else row[0]
  primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
  secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present


	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6919266963843487 0.8179156908665105
	 0.4 0.6919266963843487 0.8179156908665105
	 0.6 0.6915964999174509 0.8176849502244778
	 0.8 0.7128941720323593 0.8186085323876081
	Average distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6919266963843487 0.8179156908665105
	 0.4 0.6919266963843487 0.8179156908665105
	 0.6 0.6861482582136371 0.8135360470819029
	 0.8 0.49810137031533763 0.4634662901517825
Analytics method: TCTC_tc
N-gram max length: 1
Upper cut-off percentage: 20
ANALYSE TCTC
	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6909361069836553 0.8172231985940246
	 0.01 

  text = row[1] if pd.notna(row[1]) else row[0]
  primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
  secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present


	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6919266963843487 0.8179156908665105
	 0.4 0.6919266963843487 0.8179156908665105
	 0.6 0.6917615981508998 0.8178003318044307
	 0.8 0.6835066864784547 0.8091207806432341
	Average distortion (threshold, accuracy, F1 score):
	 0.0 0.6919266963843487 0.8179156908665105
	 0.01 0.6919266963843487 0.8179156908665105
	 0.05 0.6919266963843487 0.8179156908665105
	 0.1 0.6919266963843487 0.8179156908665105
	 0.2 0.6919266963843487 0.8179156908665105
	 0.4 0.6919266963843487 0.8179156908665105
	 0.6 0.691431401684002 0.8175695461200585
	 0.8 0.6382697705134555 0.7593629873695771
Analytics method: CFCF_cf
N-gram max length: 1
Upper cut-off percentage: 20
ANALYSE CFCF
	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6912663034505531 0.8174541194845764
	 0.01 0.

In [64]:
analyse_dataset('normalize', 3, 30, df3)

Analytics method: normalize
N-gram max length: 3
Upper cut-off percentage: 30
	Any distortion (threshold, accuracy, F1 score):
	 0.0 0.6953937592867756 0.8195951892050455
	 0.01 0.6953937592867756 0.8195951892050455
	 0.05 0.6953937592867756 0.8195951892050455
	 0.1 0.6953937592867756 0.8195951892050455
	 0.2 0.6965494469209179 0.8201565557729942
	 0.4 0.7132243684992571 0.8283427216128076
	 0.6 0.861812778603269 0.9092092417832736
	 0.8 0.9963678388641242 0.9973821989528796
	Average distortion (threshold, accuracy, F1 score):
	 0.0 0.6953937592867756 0.8195951892050455
	 0.01 0.6953937592867756 0.8195951892050455
	 0.05 0.6953937592867756 0.8195951892050455
	 0.1 0.6953937592867756 0.8195951892050455
	 0.2 0.6990259204226514 0.8213620774130328
	 0.4 0.8205382202410434 0.8852043510402365
	 0.6 0.9648340762753839 0.9751255401144459
	 0.8 0.9739144791150735 0.9807972775887214
