In [30]:
import os, sys, math
cwd = os.getcwd()
project_path = cwd[:cwd.find('pygents')+7]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path)

import pandas as pd
import numpy as np
#from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns

from pygents.aigents_api import tokenize_re
from pygents.util import dictcount
from pygents.aigents_api import build_ngrams
from collections import defaultdict

from pygents.aigents_api import TextMetrics


def language_metrics(metrics_list):
    metrics = {}
    for m in metrics_list:
        metrics[m] = './data/corpora/English/distortions/our model/keywods for our model/' + m + '.txt'
    return metrics

In [31]:
#df = pd.read_csv(r"./data/corpora/English/distortions/sagarikashreevastava/test.csv")
#df = pd.read_csv(r"./notebooks/nlp/sentiment/first dataset dominant and secondary distortions.csv")
#df = pd.read_csv(r"./data/corpora/English/distortions/our model/first dataset dominant and secondary distortions.csv")
#df = df.drop('Id_Number', axis=1) # delete columnb with id 
binary_dataset_file_path = "./data/corpora/English/distortions/halilbabacan/raw_Cognitive_distortions.csv"
df = pd.read_csv(binary_dataset_file_path)
df.insert(1, "Distorted part", value = np.nan)
df.insert(3, "Secondary Distortion (Optional)", value = np.nan)
df = df.rename(columns={
    "Text": "Patient Question",
    "Label": "Dominant Distortion"
})
df

Unnamed: 0,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional)
0,I'm such a failure I never do anything right.,,Distortion,
1,Nobody likes me because I'm not interesting.,,Distortion,
2,I can't try new things because I'll just mess...,,Distortion,
3,My boss didn't say 'good morning' she must be...,,Distortion,
4,My friend didn't invite me to the party I mus...,,Distortion,
...,...,...,...,...
3522,Since then whenever my mother is out alone I b...,,Distortion,
3523,My family hate him but they didn’t met him at ...,,Distortion,
3524,However I am not happy at the least only half ...,,Distortion,
3525,Now I am at university my peers around me all ...,,Distortion,


In [32]:
def model_comparison(test_df, distortions_labels):
    print("MODEL COMPARISON")
    # Get the results using our model
    
    l = TextMetrics(language_metrics(distortions_labels),debug=False)
    metric_to_index_and_column = {
        label: (i, label) for i, label in enumerate(distortions_labels)
    }
    columns_values = np.zeros((len(test_df), len(metric_to_index_and_column)))
    texts = test_df['Patient Question'] 

    for i, t in enumerate(texts):
        metrics = l.get_sentiment_words(t)
        for metric_name, metric in metrics.items():
            index, _ = metric_to_index_and_column[metric_name]
            columns_values[i][index] = metric

    for _, (index, column_name) in metric_to_index_and_column.items():
        our_column_name = f"Our {column_name}"
        their_column_name = f"Their {column_name}"
        df[our_column_name] = pd.Series(columns_values[:, index])
        df[their_column_name] = 0  # Fill "Their" columns with zeros

    # Update "Their" columns based on Dominant and Secondary Distortions
    for i, row in df.iterrows():
        dominant_distortion = row.get('Dominant Distortion', None)
        secondary_distortion = row.get('Secondary Distortion (Optional)', None)
        
        for _, (_, column_name) in metric_to_index_and_column.items():
            their_column_name = f"Their {column_name}"
            if dominant_distortion == column_name or secondary_distortion == column_name:
                df.at[i, their_column_name] = 1

        # Update "Their Distortion" column
        their_distortion_column = "Their Distortion"
        if dominant_distortion != "No Distortion":
            df.at[i, their_distortion_column] = 1

    # Calculate F1 scores for thresholds
    thresholds = np.arange(0, 1.1, 0.1)  # Thresholds from 0 to 1 with step 0.1
    accuracy_scores = pd.DataFrame(index=distortions_labels, columns=thresholds)

    for column_name in distortions_labels:
        our_column_name = f"Our {column_name}"
        their_column_name = f"Their {column_name}"
        for threshold in thresholds:
            # Binarize "Our" column based on the threshold
            our_binary = (df[our_column_name] >= threshold).astype(int)
            their_binary = df[their_column_name]
            
            # Calculate F1 score
            #f1 = f1_score(their_binary, our_binary, zero_division=0)
            #f1_scores.at[column_name, threshold] = f1

            # Calculate accuracy score
            accuracy = (our_binary == their_binary).mean()
            accuracy_scores.at[column_name, threshold] = accuracy
            

    # Display the accuracy scores as a heatmap
    plt.figure(figsize=(10, 6))

    # Format thresholds for better readability
    formatted_thresholds = [f"{th:.1f}" for th in thresholds]
    accuracy_scores.columns = formatted_thresholds

    sns.heatmap(accuracy_scores.astype(float), annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 13}, vmin=0, vmax=0.8)
    plt.title("Accuracy Heatmap")
    plt.xlabel("Threshold")
    plt.ylabel("Cognitive Distortion")

    # Adjust the x-axis labels to be horizontal
    plt.xticks(rotation=0)

    # Save the plot
    output_dir = "C:/Users/kzvau/Desktop/работа нгу"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "f1_scores_heatmap.png")
    plt.savefig(output_path, dpi=300, bbox_inches='tight')

    plt.show()

    threshold_05 = accuracy_scores['0.5'].astype(float)
    average_accuracy_05 = threshold_05.mean()
    return average_accuracy_05

In [33]:
def analyse_normalized_ngrams(ngram_max, df):
    print('ANALYSE NORMALIZE')
    # Analyze CFCF_cf values for n-grams for each cognitive distortion (see https://github.com/aigents/pygents/blob/main/notebooks/nlp/distortions/distortions_preliminary_data_checks.ipynb)

    # Dictionaries for metrics
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    unique_n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-unique_n-gram-n_gram_frequency)
    distortions = defaultdict(int) # Counting the number of documents for each distortion
    n_gram_distortions = defaultdict(lambda: defaultdict(int))  # Distortions per n-gram

    # The main loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0] 
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        dictcount(distortions, primary_distortion)
        if secondary_distortion:
            dictcount(distortions, secondary_distortion)

        # Text tokenization
        tokens = tokenize_re(text)

        # Generate and count n-grams
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams)

            # Count unique n-grams
            uniq_n_grams = set(n_grams)
            for uniq_n_gram in uniq_n_grams:
                dictcount(unique_n_gram_dicts[primary_distortion], uniq_n_gram)
                dictcount(n_gram_distortions[uniq_n_gram], primary_distortion)
                if secondary_distortion:
                    dictcount(unique_n_gram_dicts[secondary_distortion], uniq_n_gram)
                    dictcount(n_gram_distortions[uniq_n_gram], secondary_distortion)

    
    # Normalizing distortion-specific counts by total counts
    norm_n_gram_dicts = {}
    for n_gram_dict in n_gram_dicts:
        norm_n_gram_dict = {}
        norm_n_gram_dicts[n_gram_dict] = norm_n_gram_dict
        dic = n_gram_dicts[n_gram_dict]
        for n_gram in dic:
            if len(n_gram) <= ngram_max:
                norm_n_gram_dict[n_gram] = float( dic[n_gram] ) / all_n_grams[n_gram]

    # Normalize uniq counts 
    norm_uniq_n_gram_dicts = {}
    for uniq_n_gram_dict in unique_n_gram_dicts:
        norm_uniq_n_gram_dict = {}
        norm_uniq_n_gram_dicts[uniq_n_gram_dict] = norm_uniq_n_gram_dict
        dic = unique_n_gram_dicts[uniq_n_gram_dict]
        # Normalize uniq Document counts of N-grams by distortion by Documents count by Distortion
        for n_gram in dic:
            if len(n_gram) <= ngram_max:
                norm_uniq_n_gram_dict[n_gram] = float( dic[n_gram] ) / distortions[uniq_n_gram_dict] / len(n_gram_distortions[n_gram])

    return norm_uniq_n_gram_dicts



def analyse_frequency(ngram_max, df):
    print('ANALYSE FREQUENCY')
    # Analyze the frequency of n-grams for each cognitive distortion

    # Creating dictionaries for counting n-grams
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)

    # Loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0]
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Generation and counting of n-grams (from 1 to 4)
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            dictcount(n_gram_dicts[primary_distortion], n_grams)  # Increment the counter for the corresponding primary distortion
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams) # Increment the counter for the corresponding secondary distortion (if present)

    return n_gram_dicts



def analyse_TF_IDF(ngram_max, df):
    print('ANALYSE TF-IDF')
    # Analyze TF-IDF values for n-grams for each cognitive distortion

    # Creating dictionaries for counting n-grams
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    doc_counts = defaultdict(int)  # The number of documents in which each n-gram appears


    # Loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0]
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Generate n-grams and update the document counters where they appear
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)  # Increment the counter for the corresponding primary distortion
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams) # Increment the counter for the corresponding secondary distortion (if present)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1

    # The total number of texts
    total_docs = len(df)

    # TF-IDF Calculation
    tfidf_dicts = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), analyze the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            tf = count / sum(ngram_dict.values())  # Frequency of the n-gram in the text (TF): TF = (Number of occurrences of the given n-gram for the specific cognitive distortion) / (Total number of occurrences of all other n-grams for the same cognitive distortion)
            idf = math.log(total_docs / (1 + doc_counts[n_gram]))  # Inverse Document Frequency (IDF): IDF = Total number of documents / Number of documents containing the given n-gram
            tfidf_dicts[distortion][n_gram] = tf * idf  # TF-IDF

    return tfidf_dicts



def analyse_TFTF_tf(ngram_max, df):
    print('ANALYSE TFTF')
    # Analyze TFTF_tf values for n-grams for each cognitive distortion (see http://webstructor.net/papers/Kolonin-HP-ACA-IC-text.pdf)

    # Dictionaries for metrics
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    unique_n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-unique_n-gram-n_gram_frequency)

    # Counting documents for n-grams
    doc_counts = defaultdict(int)  # The number of texts in which each n-gram appears

    # The main loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0] 
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Look at unique n-grams and increment the document counters
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1
            unique_n_gram_dicts[primary_distortion][n_gram] += 1
            if secondary_distortion:
                unique_n_gram_dicts[secondary_distortion][n_gram] += 1

    # The total number of texts
    total_docs = len(df)

    # Calculation of the TFTF_tf
    tftf_results = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), examine the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            # TFTF: Mutual relevance of features and text (how important a given n-gram is for describing the text, looking for specific n-grams within the text)
            tf = count / sum(ngram_dict.values()) # Frequency of the n-gram in the text (TF): TF = (Number of times the given n-gram appears for the specific cognitive distortion) / (Number of times all other n-grams appear for the same cognitive distortion)
            ft = doc_counts[n_gram] / total_docs # The number of texts in which the current n-gram appears / the total number of texts in the dataset
            Ft = sum(ngram_dict.values())
            tftf = (tf ** 2) / (ft*Ft if ft*Ft > 0 else 1)

            tftf_results[distortion][n_gram] = tftf

    return tftf_results



def analyse_TCTC_tc(ngram_max, df):
    print('ANALYSE TCTC')
    # Analyze TCTC_tc values for n-grams for each cognitive distortion (see http://webstructor.net/papers/Kolonin-HP-ACA-IC-text.pdf)

    # Dictionaries for metrics
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    unique_n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-unique_n-gram-n_gram_frequency)

    # Counting documents for n-grams
    doc_counts = defaultdict(int)  # The number of texts in which each n-gram appears

    # The main loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0] 
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Look at unique n-grams and increment the document counters
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1
            unique_n_gram_dicts[primary_distortion][n_gram] += 1
            if secondary_distortion:
                unique_n_gram_dicts[secondary_distortion][n_gram] += 1

    # The total number of texts
    total_docs = len(df)

    # Calculation of the TCTC_tc
    tctc_results = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), examine the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            # TCTC: Mutual relevance of categories and text (how the given n-gram is distributed across all texts of a given category)
            ct = unique_n_gram_dicts[distortion][n_gram] # The number of texts in which the current n-gram (n_gram) is associated with the current distortion (distortion)
            tctc = (ct ** 2) / (doc_counts[n_gram] * total_docs) # In the denominator: the number of texts containing the given n-gram, regardless of distortion * the total number of texts

            tctc_results[distortion][n_gram] = tctc

    return tctc_results



def analyse_CFCF_cf(ngram_max, df):
    print('ANALYSE CFCF')
    # Analyze CFCF_cf values for n-grams for each cognitive distortion (see http://webstructor.net/papers/Kolonin-HP-ACA-IC-text.pdf)

    # Dictionaries for metrics
    n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-n-gram-n_gram_frequency)
    all_n_grams = defaultdict(int)  # A general dictionary for all n-grams
    unique_n_gram_dicts = defaultdict(lambda: defaultdict(int))  # A dictionary for each distortion (distortion-unique_n-gram-n_gram_frequency)

    # Counting documents for n-grams
    doc_counts = defaultdict(int)  # The number of texts in which each n-gram appears

    # The main loop through the rows of the DataFrame
    for _, row in df.iterrows():
        # Text identification: first, check the 2nd column; if NaN, take the text from the 1st column
        text = row[1] if pd.notna(row[1]) else row[0] 
        primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
        secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present

        # Text tokenization
        tokens = tokenize_re(text)

        # Look at unique n-grams and increment the document counters
        unique_ngrams = set()
        for n in range(1, ngram_max + 1):
            n_grams = build_ngrams(tokens, n)
            unique_ngrams.update(n_grams)
            dictcount(all_n_grams, n_grams)
            dictcount(n_gram_dicts[primary_distortion], n_grams)
            if secondary_distortion:
                dictcount(n_gram_dicts[secondary_distortion], n_grams)

        for n_gram in unique_ngrams:
            doc_counts[n_gram] += 1
            unique_n_gram_dicts[primary_distortion][n_gram] += 1
            if secondary_distortion:
                unique_n_gram_dicts[secondary_distortion][n_gram] += 1

    # Calculation of the CFCF_cf
    cfcf_results = defaultdict(dict)
    for distortion, ngram_dict in n_gram_dicts.items(): # For each distortion (distortion), examine the n-grams (ngram_dict)
        for n_gram, count in ngram_dict.items(): # For each n-gram (n_gram), check its frequency of occurrence (count) for the given distortion
            # CFCF: Mutual relevance of features and categories (how characteristic the given n-gram is for a particular category)
            cf = unique_n_gram_dicts[distortion][n_gram]
            fc = sum(unique_n_gram_dicts[distortion].values()) # The total number of texts for all n-grams associated with the given distortion
            cfcf = (cf ** 2) / (cf * fc if cf * fc > 0 else 1)

            cfcf_results[distortion][n_gram] = cfcf

    return cfcf_results

In [34]:
def create_model(train_df, test_df, analytics_method, ngram_max, treshold):
    print('CREATE MODEL')
    # Choose a method for analysis
    if analytics_method == 'frequency':
        model_ngram = analyse_frequency(ngram_max, train_df)
    elif analytics_method == 'TF-IDF':
        model_ngram = analyse_TF_IDF(ngram_max, train_df)
    elif analytics_method == 'TFTF_tf':
        model_ngram = analyse_TFTF_tf(ngram_max, train_df)
    elif analytics_method == 'TCTC_tc':
        model_ngram = analyse_TCTC_tc(ngram_max, train_df)
    elif analytics_method == 'CFCF_cf':
        model_ngram = analyse_CFCF_cf(ngram_max, train_df)
    elif analytics_method == 'normalize':
        model_ngram = analyse_normalized_ngrams(ngram_max, train_df)

    # Filter out values below the threshold
    filtered_model_ngram = {}
    for distortion, ngram_dict in model_ngram.items():
        # Find the maximum metric value for the current distortion
        max_value = max(ngram_dict.values()) if ngram_dict else 0
        threshold_value = max_value * (treshold / 100)

        # Filter n-grams that meet or exceed the threshold value
        filtered_model_ngram[distortion] = {
            ngram: metric for ngram, metric in ngram_dict.items() if metric >= threshold_value
        }

    # Create .txt files for each distortion
    output_dir = "./data/corpora/English/distortions/our model/keywods for our model"
    distortion_file_path = f"{output_dir}/Distortion.txt"
    distortions_labels = []

    with open(distortion_file_path, "w", encoding="utf-8") as distortion_file:
        distortions_labels.append("Distortion")
        for distortion, ngrams in filtered_model_ngram.items():
            file_path = f"{output_dir}/{distortion}.txt"
            distortions_labels.append(distortion)
            with open(file_path, "w", encoding="utf-8") as f:
                for ngram in ngrams.keys():
                    f.write(" ".join(ngram) + "\n")
                    if distortion != "No Distortion":
                            distortion_file.write(" ".join(ngram) + "\n")

    accuracy = model_comparison(test_df, list(set(distortions_labels)))
    return accuracy


In [35]:
# A function for creating a model based on hyperparameters. 
# The analytics_method can take values such as frequency, TF-IDF, TFTF_tf, TCTC_tc, CFCF_cf. 
# ngram_max is the number corresponding to the maximum length of n-grams we will identify. 
# threshold contains the desired cutoff values (in %) for filtering out irrelevant keywords for each cognitive distortion. 
# split_number reflects the number of parts the dataset will be split into for training and recognition.

def analyse_dataset(analytics_method, ngram_max, treshold, num_parts, df):
    print('ANALYSE DATASET')

    # Split the dataset into num_parts parts
    parts = []
    part_size = len(df) // num_parts
    
    for i in range(num_parts):
        start_idx = i * part_size
        end_idx = start_idx + part_size if i < num_parts - 1 else len(df)
        parts.append(df.iloc[start_idx:end_idx])

    # Generate combinations for training and testing
    combinations = [] # A dictionary of (train, test) pairs
    for i in range(num_parts):
        test_set = parts[i]
        train_set = pd.concat([parts[j] for j in range(num_parts) if j != i])
        combinations.append((train_set, test_set))
        # Create a model and comparing our model with markup
        accuracy = create_model(combinations[i][0], combinations[i][1], analytics_method, ngram_max, treshold)
        print(f"The average accuracy value for a threshold of 0.5 across all distortions for combination {i+1}: {accuracy}")
        

In [36]:
analyse_dataset('normalize', 4, 20, 3, df)

ANALYSE DATASET
CREATE MODEL
ANALYSE NORMALIZE


  text = row[1] if pd.notna(row[1]) else row[0]
  primary_distortion = row[2]  # The primary cognitive distortion from the 3rd column
  secondary_distortion = row[3] if pd.notna(row[3]) else None  # The secondary distortion from the 4th column, if present


MODEL COMPARISON


UnicodeDecodeError: 'charmap' codec can't decode byte 0x98 in position 1158: character maps to <undefined>