In [None]:
import pandas as pd
import numpy as np
import os
import glob
import re
from sklearn.model_selection import train_test_split

# Tokenizer

In [None]:
import spacy

# Classification packages

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

# Load Data

In [None]:
import sqlite3
from contextlib import closing

# Change the line below to point to the correct location
# of the MultiCAT SQLite3 database
# closing() releases the database handle as soon as the table has been read,
# instead of leaving the connection open for the rest of the session.
with closing(sqlite3.connect("multicat.db")) as con:
    # Read every row of the `utterance` table into a DataFrame
    df = pd.read_sql_query("SELECT * from utterance", con)

In [None]:
# Trial ids that are kept from the utterance table (the trials with CLC annotation)
CLC_trials = [
    "T000603", "T000604", "T000605", "T000606",
    "T000607", "T000608", "T000613", "T000627",
    "T000628", "T000631", "T000632", "T000633",
    "T000634", "T000635", "T000636", "T000637",
    "T000638", "T000671", "T000713", "T000714",
    "T000715", "T000716", "T000719", "T000720",
    "T000723", "T000724", "T000727", "T000728",
    "T000729", "T000730", "T000737", "T000738",
]
len(CLC_trials)

In [None]:
# Group the utterances by their 'trial' column and keep only the trials listed in CLC_trials
grouped = df.groupby('trial')

# Maps trial id -> DataFrame holding that trial's rows
clc_dfs = {
    trial_id: group_df
    for trial_id, group_df in grouped
    if trial_id in CLC_trials
}

In [None]:
# Trial ids held out for testing (not every id listed here is in CLC_trials)
Ayesha_test_split = [
    "T000605", "T000606",
    "T000671", "T000672",
    "T000625", "T000626",
    "T000727", "T000728",
    "T000737", "T000738",
    "T000609", "T000610",
]

In [None]:
# Assign every annotated trial to exactly one split by exact trial-id match.
# Both lists use the same membership test, so a trial can never land in both
# (or neither) of them.
test_trial_ids = set(Ayesha_test_split)
test_files = [clc_dfs[trial] for trial in CLC_trials if trial in test_trial_ids]
train_files = [clc_dfs[trial] for trial in CLC_trials if trial not in test_trial_ids]

In [None]:
# Exclude pre-game and post-game lines, keep the columns that will be used for baseline model

def readAnn(df):
    """Trim one trial to its annotated span and append an end-of-trial marker row.

    Parameters:
    - df (pd.DataFrame): utterances of a single trial; must contain the columns
      "participant", "start_timestamp", "asr_text", "corrected_text" and "clc_label".

    Returns:
    - pd.DataFrame with those five columns plus "notes". It holds every row from the
      first to the last row (both inclusive) whose "clc_label" is not null, followed by
      one marker row whose "notes" is 'end-of-the-trail'. All missing values are
      replaced by the string 'NA'. If no row carries a label, only the marker row is
      returned.
    """
    data = df[["participant",
               "start_timestamp",
               "asr_text",
               "corrected_text",
               "clc_label"]].reset_index(drop=True)

    # Positions of the annotated rows; the first/last one delimit the game
    labelled_positions = data.index[data["clc_label"].notnull()]
    if len(labelled_positions) > 0:
        start_index = labelled_positions.min()
        end_index = labelled_positions.max()
        # iloc's stop is exclusive, hence +1 to keep the last annotated utterance
        data = data.iloc[start_index:end_index + 1]
    else:
        data = data.iloc[0:0]

    # "notes" stores the boundary of each trial; assign() returns a new frame, so the
    # slice above is never written to in place
    data = data.assign(notes=None)
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    end_marker = pd.DataFrame([{"notes": "end-of-the-trail"}])
    data = pd.concat([data, end_marker], ignore_index=True)
    return data.fillna("NA")


In [None]:
# One trimmed DataFrame per trial, in the same order as the *_files lists
train_df = list(map(readAnn, train_files))
test_df = list(map(readAnn, test_files))

In [None]:
# Report how many trial files ended up in each split
n_train_files, n_test_files = len(train_df), len(test_df)
print("The training set contains", n_train_files, "files.")
print("The testing set contains", n_test_files, "files.")

In [None]:
# Combine corr_utt with utt column. If corr_utt, replace utt with corr_utt.

def replaceUtt(row):
    """Pick the text to use for one utterance row.

    Returns row["corrected_text"] unless it is the placeholder string "NA",
    in which case row["asr_text"] is returned.
    """
    has_correction = row["corrected_text"] != "NA"
    return row["corrected_text"] if has_correction else row["asr_text"]

# Overwrite "asr_text" with the corrected text where one exists, then drop the
# now-redundant "corrected_text" column.
# The loop variable is deliberately not called `df`: that name holds the raw utterance
# table loaded from the database and must stay intact for re-running earlier cells.
for trial_frame in train_df + test_df:
    if 'corrected_text' not in trial_frame.columns:
        continue  # already merged, e.g. when this cell is run a second time
    trial_frame["asr_text"] = trial_frame.apply(replaceUtt, axis=1)
    trial_frame.drop('corrected_text', axis=1, inplace=True)

In [None]:
# Stack the per-trial frames into two frames: one for training, one for testing
train_data, test_data = (
    pd.concat(trial_frames, ignore_index=True)
    for trial_frames in (train_df, test_df)
)

In [None]:
# Add a "utt_id" column ("utt_1", "utt_2", ...) to each frame.
# The ids are unique within a frame, not across frames.
for frame in (train_data, test_data):
    frame['utt_id'] = [f'utt_{i+1}' for i in range(len(frame))]

In [None]:
# Row counts of the combined frames (these include one end-marker row per trial)
n_train_rows, n_test_rows = len(train_data), len(test_data)
print("The training set contains", n_train_rows, "utterances.")
print("The testing set contains", n_test_rows, "utterances.")

## Preprocess Data

In [None]:
# Load the spaCy pipeline 'en_core_web_sm'; it is used below to lemmatize the utterance tokens
nlp = spacy.load('en_core_web_sm')

In [None]:
# For step 1
def lemmatize_text(text):
    """Run `text` through the module-level spaCy pipeline `nlp` and return the
    lemmas of all its tokens joined by single spaces."""
    return " ".join(token.lemma_ for token in nlp(text))

# Store the lemmatized version of every utterance next to the original text
for frame in (train_data, test_data):
    frame["lemmatized_text"] = frame["asr_text"].apply(lemmatize_text)

In [None]:
# Give "a" label and "not-a" label to the utterance

def find_a_label(label):
    """Binary step-1 target: "a" when the label string contains a lowercase "a",
    otherwise "not-a" (the placeholder "NA" therefore maps to "not-a")."""
    return "a" if "a" in label else "not-a"

In [None]:
# Derive the binary step-1 target from the raw CLC label of every utterance
for frame in (train_data, test_data):
    frame["step1_label"] = frame["clc_label"].apply(find_a_label)

# Step1: Detect the "a" Stage

Using logistic regression with TF-IDF vector features.

In [None]:
# TF-IDF feature extractor with default settings; it is fitted on the training text below
vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary and idf values

def run_vectorizer(vectorizer, train, test):
    """Fit `vectorizer` on `train` and transform both inputs.

    Returns a pair (train_features, test_features); `test` is only transformed,
    never used for fitting.
    """
    train_features = vectorizer.fit_transform(train)
    test_features = vectorizer.transform(test)
    return train_features, test_features

In [None]:
# Vectorize the lemmatized text; the vocabulary comes from the training split only
train_x, test_x = run_vectorizer(
    vectorizer,
    train=train_data["lemmatized_text"],
    test=test_data["lemmatized_text"],
)
print(f"Shape of train input data: {train_x.get_shape()}")
print(f"Shape of test input data: {test_x.get_shape()}")

In [None]:
# Model creation: step-1 classifier, allowed up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)

In [None]:
# Model training

def run_model(model, train_x, train_y, test_x):
    """Fit `model` on (train_x, train_y) and return its predictions for `test_x`.

    `model` is fitted in place; anything with `fit(X, y)` and `predict(X)` works.
    """
    model.fit(train_x, train_y)
    return model.predict(test_x)

In [None]:
# Train the step-1 classifier and keep its test predictions next to the gold labels
prediction = run_model(
    model,
    train_x=train_x,
    train_y=train_data["step1_label"],
    test_x=test_x,
)
test_data['step1_prediction'] = prediction
test_data[['asr_text', 'lemmatized_text', 'step1_label', 'step1_prediction']]

In [None]:
# Step-1 quality on the test split; the F1 score treats "a" as the positive class
print("Detect a stage")
step1_gold, step1_pred = test_data['step1_label'], test_data['step1_prediction']
step1_accuracy = accuracy_score(step1_gold, step1_pred)
print(f"accuracy: {step1_accuracy:0.4f}")
step1_binary_f1 = f1_score(step1_gold, step1_pred, average='binary', pos_label='a')
print(f"binary f1: {step1_binary_f1:0.4f}")

In [None]:
# Check the balance of the dataset: number of training utterances per step-1 label

step1_label_counts = train_data.groupby(by="step1_label")[["asr_text"]].count()
step1_label_counts.reset_index()

In [None]:
# Display the test frame, which now includes the step-1 label and prediction columns
test_data

# Step 2: Detect "b" Stages within a Window for Each "a"

In [None]:
# Step 2 needs a dev set for the RoBERTa model, so the training trials are split further.
# (Not every id listed here is in CLC_trials.)
Ayesha_dev_split = [
    "T000613",
    "T000607", "T000608",
    "T000633", "T000634",
    "T000614",
]

In [None]:
# Pick the dev trials and the remaining step-2 training trials by exact trial-id match.
# Both lists use the same kind of membership test, so the splits cannot overlap.
dev_trial_ids = set(Ayesha_dev_split)
held_out_trial_ids = dev_trial_ids | set(Ayesha_test_split)
dev_files = [clc_dfs[trial] for trial in CLC_trials if trial in dev_trial_ids]
step2_train_files = [clc_dfs[trial] for trial in CLC_trials if trial not in held_out_trial_ids]

In [None]:
# One trimmed DataFrame per trial, in the same order as the *_files lists
dev_df = list(map(readAnn, dev_files))
step2_train_df = list(map(readAnn, step2_train_files))

In [None]:
# Report how many trial files ended up in the dev and step-2 training splits
n_dev_files, n_step2_train_files = len(dev_df), len(step2_train_df)
print("The dev set contains", n_dev_files, "files.")
print("The training set contains", n_step2_train_files, "files.")

In [None]:
# Overwrite "asr_text" with the corrected text where one exists, then drop the
# now-redundant "corrected_text" column.
# The loop variable is deliberately not called `df`: that name holds the raw utterance
# table loaded from the database and must stay intact for re-running earlier cells.
for trial_frame in dev_df + step2_train_df:
    if 'corrected_text' not in trial_frame.columns:
        continue  # already merged, e.g. when this cell is run a second time
    trial_frame["asr_text"] = trial_frame.apply(replaceUtt, axis=1)
    trial_frame.drop('corrected_text', axis=1, inplace=True)

In [None]:
# Stack the per-trial frames into two frames: dev and step2_train
dev_data, step2_train_data = (
    pd.concat(trial_frames, ignore_index=True)
    for trial_frames in (dev_df, step2_train_df)
)

In [None]:
# Add a "utt_id" column ("utt_1", "utt_2", ...) to the dev and step-2 training frames.
# The ids are unique within a frame, not across frames.
for frame in (dev_data, step2_train_data):
    frame['utt_id'] = [f'utt_{i+1}' for i in range(len(frame))]

In [None]:
# Row counts of the combined frames (these include one end-marker row per trial)
n_dev_rows, n_step2_train_rows = len(dev_data), len(step2_train_data)
print("The dev set contains", n_dev_rows, "utterances.")
print("The training set contains", n_step2_train_rows, "utterances.")

# Matching utterances
For each stage "a" utterance, find the 3 following utterances that were produced by a speaker other than the speaker of the "a" utterance.

In [None]:
# Number of following utterances (by other speakers) examined as candidate "b" for each "a"
window_size = 3

In [None]:
# Split combined files into trials

def splittingTrials(combined_df):
    """Cut a combined frame back into one DataFrame per trial.

    A trial ends at each row whose "notes" equals "end-of-the-trail"; that marker row
    is kept as the last row of its trial. Rows after the final marker are not returned.
    Index labels are used as positions, so `combined_df` is expected to carry a
    0..n-1 index. Every returned frame is re-indexed from 0. The number of trials
    is printed.
    """
    # Positions of the marker rows, i.e. the last row of every trial
    end_positions = combined_df.index[combined_df['notes'] == "end-of-the-trail"]
    # A trial starts right after the previous marker; the first one starts at row 0
    start_positions = [0] + [end + 1 for end in end_positions[:-1]]

    trial_dataframes = [
        combined_df.iloc[start:end + 1].reset_index(drop=True)
        for start, end in zip(start_positions, end_positions)
    ]

    # print the number of trials
    print(f"Number of trials: {len(trial_dataframes)}")

    return trial_dataframes


In [None]:
# Per-trial views of the three splits used in step 2 (train, dev, test - in that order)
splitted_train, splitted_dev, splitted_test = (
    splittingTrials(combined) for combined in (step2_train_data, dev_data, test_data)
)

# Prepare Training, Dev, and Testing Data for step 2

In [None]:
def splitLabel(label):
    """Split a CLC label string into [turn_id, stage] pairs.

    The string is split on "."; for each part, trailing "+" signs and everything after
    the first ASCII letter are removed, and the rest is separated into its digit run and
    its non-digit run, e.g. "1b+.2a" -> [["1", "b"], ["2", "a"]].
    A label that is "NA" (ignoring surrounding whitespace) yields [["NA", "NA"]].
    A part that does not consist of exactly two such runs raises ValueError.
    """
    # For NA labels
    if label.strip() == "NA":
        return [["NA", "NA"]]

    pairs = []
    for part in label.split("."):
        # Keep the text up to and including the first letter
        core = re.sub(r"(?<=[a-zA-Z]).*$", "", part.rstrip("+"))
        turn_id, stage = re.findall(r'\d+|\D+', core)
        pairs.append([turn_id, stage])

    return pairs


In [None]:
# Quick check of the parser; expected result: [['1', 'b'], ['2', 'a']]
splitLabel("1b+.2a")

In [None]:
# For each stage "a" utterance, select {window size} utterance from speakers other than the speaker of "a"

def findCandidate(data, utt_id, turn_id, window_size):
    """
    Find the candidate "b" utterances that follow an "a" stage utterance.

    Starting right after the "a" utterance, the first `window_size` rows spoken by a
    participant other than the "a" speaker become candidates. The trial-boundary marker
    row (notes == "end-of-the-trail") is not an utterance and is never a candidate.
    If fewer than `window_size` candidates exist, the lists are padded with "[PAD]" ids
    and 0 labels, so both returned lists always have length `window_size + 1`.

    Parameters:
    - data (pd.DataFrame): One trial, with columns 'utt_id', 'participant' and
      'clc_label' (and optionally 'notes'). Its index labels are used as positions,
      so it must carry a 0..n-1 index.
    - utt_id (str): The `utt_id` of the "a" stage utterance.
    - turn_id (str): The turn number of the "a" stage utterance.
    - window_size (int): The number of candidate utterances to collect.

    Returns:
    - candidate_utt_ids (list): `utt_id` first, then the candidate `utt_id` values.
    - candidate_utt_labels (list): "a" first, then one label per candidate:
      1 denotes "b" for the same turn as the "a" utterance, 0 denotes "not-b".
    """
    maxlen_out_list = window_size + 1  # the "a" utterance itself plus its candidates
    candidate_utt_ids, candidate_utt_labels = [utt_id], ["a"]

    # Locate the "a" utterance, its speaker and its position in the trial
    a_stage_row = data[data['utt_id'] == utt_id]
    participant = a_stage_row.iloc[0]['participant']
    current_index = a_stage_row.index[0]

    for _, row in data.iloc[current_index + 1:].iterrows():
        if len(candidate_utt_ids) >= maxlen_out_list:
            break  # window is full
        if row.get('notes') == "end-of-the-trail":
            continue  # boundary marker, not a real utterance
        if row['participant'] == participant:
            continue  # same speaker as "a" cannot provide the "b"

        candidate_utt_ids.append(row['utt_id'])
        # The candidate is a positive example only if it is a "b" of the same turn
        is_b = any(turn == turn_id and label == "b"
                   for turn, label in splitLabel(row['clc_label']))
        candidate_utt_labels.append(1 if is_b else 0)

    # Pad up to the full output length (compare against window_size + 1, not
    # window_size, otherwise a group that is one candidate short stays unpadded)
    n_missing = maxlen_out_list - len(candidate_utt_ids)
    candidate_utt_ids += ["[PAD]"] * n_missing
    candidate_utt_labels += [0] * n_missing

    return candidate_utt_ids, candidate_utt_labels


In [None]:
# For training data: build one candidate group per "a" stage found in any label

train_candidate_utt_ids, train_candidate_utt_labels = [], []

for trial_df in splitted_train:
    for utt_id, clc_label in zip(trial_df['utt_id'], trial_df['clc_label']):
        # An utterance may open several turns; each "a" gets its own group
        a_turn_ids = [turn_id for turn_id, stage in splitLabel(clc_label) if stage == "a"]
        for turn_id in a_turn_ids:
            utt_ids, utt_labels = findCandidate(trial_df, utt_id, turn_id, window_size)
            train_candidate_utt_ids.append(utt_ids)
            train_candidate_utt_labels.append(utt_labels)


In [None]:
# For dev data: build one candidate group per "a" stage found in any label

dev_candidate_utt_ids, dev_candidate_utt_labels = [], []

for trial_df in splitted_dev:
    for utt_id, clc_label in zip(trial_df['utt_id'], trial_df['clc_label']):
        # An utterance may open several turns; each "a" gets its own group
        a_turn_ids = [turn_id for turn_id, stage in splitLabel(clc_label) if stage == "a"]
        for turn_id in a_turn_ids:
            utt_ids, utt_labels = findCandidate(trial_df, utt_id, turn_id, window_size)
            dev_candidate_utt_ids.append(utt_ids)
            dev_candidate_utt_labels.append(utt_labels)


In [None]:
# For test data: build one candidate group per "a" stage found in any label

test_candidate_utt_ids, test_candidate_utt_labels = [], []

for trial_df in splitted_test:
    for utt_id, clc_label in zip(trial_df['utt_id'], trial_df['clc_label']):
        # An utterance may open several turns; each "a" gets its own group
        a_turn_ids = [turn_id for turn_id, stage in splitLabel(clc_label) if stage == "a"]
        for turn_id in a_turn_ids:
            utt_ids, utt_labels = findCandidate(trial_df, utt_id, turn_id, window_size)
            test_candidate_utt_ids.append(utt_ids)
            test_candidate_utt_labels.append(utt_labels)


In [None]:
# Add one "[PAD]" row to every split so that padded candidate ids resolve to a text.

def _append_pad_row(frame):
    """Return a copy of `frame` with one extra last row whose 'utt_id' and 'asr_text'
    are "[PAD]" and whose other columns are missing (pd.NA)."""
    pad_row = {col: pd.NA for col in frame.columns}
    pad_row.update({'utt_id': '[PAD]', 'asr_text': '[PAD]'})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    return pd.concat([frame, pd.DataFrame([pad_row])], ignore_index=True)

# The training candidates carry utt_ids numbered within step2_train_data, so the lookup
# frame has to be built from step2_train_data. train_data also contains the dev trials
# and numbers its rows differently, which would pair the ids with the wrong texts.
train_data_add_pad = _append_pad_row(step2_train_data)
# for dev
dev_data_add_pad = _append_pad_row(dev_data)
# for testing
test_data_add_pad = _append_pad_row(test_data)


In [None]:
# Create a dataframe to store the data for step 2.
# Columns: a_utt_id, a_utt, candidate_b_id, candidate_b_utt, label
# Each "a" utterance contributes one row per candidate: the a_utt_id and a_utt repeat,
# while candidate_b_utt and its label differ.

# One id -> text mapping replaces a full-frame scan per lookup (ids are unique in a split)
train_text_by_utt_id = dict(zip(train_data_add_pad['utt_id'], train_data_add_pad['asr_text']))

# A new name is used for the row list so that the step2_train_data DataFrame is not
# replaced by a list (the pad-row cell above needs the DataFrame when it is re-run)
step2_train_rows = [
    [utt_ids[0], train_text_by_utt_id[utt_ids[0]],
     candidate_b_utt_id, train_text_by_utt_id[candidate_b_utt_id], label]
    for utt_ids, utt_labels in zip(train_candidate_utt_ids, train_candidate_utt_labels)
    for candidate_b_utt_id, label in zip(utt_ids[1:], utt_labels[1:])
]

# Columns: 'a_utt_id', 'a_utt', 'candidate_b_id', 'candidate_b_utt', 'label'
step2_train_df = pd.DataFrame(step2_train_rows, columns=['a_utt_id', 'a_utt', 'candidate_b_id', 'candidate_b_utt', 'label'])


In [None]:
# For dev: one row per ("a" utterance, candidate) pair

# One id -> text mapping replaces a full-frame scan per lookup (ids are unique in a split)
dev_text_by_utt_id = dict(zip(dev_data_add_pad['utt_id'], dev_data_add_pad['asr_text']))

step2_dev_data = [
    [utt_ids[0], dev_text_by_utt_id[utt_ids[0]],
     candidate_b_utt_id, dev_text_by_utt_id[candidate_b_utt_id], label]
    for utt_ids, utt_labels in zip(dev_candidate_utt_ids, dev_candidate_utt_labels)
    for candidate_b_utt_id, label in zip(utt_ids[1:], utt_labels[1:])
]

# Columns: 'a_utt_id', 'a_utt', 'candidate_b_id', 'candidate_b_utt', 'label'
step2_dev_df = pd.DataFrame(step2_dev_data, columns=['a_utt_id', 'a_utt', 'candidate_b_id', 'candidate_b_utt', 'label'])


In [None]:
# For testing data: one row per ("a" utterance, candidate) pair

# One id -> text mapping replaces a full-frame scan per lookup (ids are unique in a split)
test_text_by_utt_id = dict(zip(test_data_add_pad['utt_id'], test_data_add_pad['asr_text']))

step2_test_data = [
    [utt_ids[0], test_text_by_utt_id[utt_ids[0]],
     candidate_b_utt_id, test_text_by_utt_id[candidate_b_utt_id], label]
    for utt_ids, utt_labels in zip(test_candidate_utt_ids, test_candidate_utt_labels)
    for candidate_b_utt_id, label in zip(utt_ids[1:], utt_labels[1:])
]

# Columns: 'a_utt_id', 'a_utt', 'candidate_b_id', 'candidate_b_utt', 'label'
step2_test_df = pd.DataFrame(step2_test_data, columns=['a_utt_id', 'a_utt', 'candidate_b_id', 'candidate_b_utt', 'label'])

## Loading the Pre-trained Model


In [None]:
import pandas as pd
import evaluate
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer,
                          enable_full_determinism)

In [None]:
epochs = 3  # Number of epochs to train the model
batch_size = 16  # Per-device training batch size
learning_rate = 5e-5  # The learning rate for the optimizer
max_length = 50  # Token length every utterance pair is padded/truncated to
output_dir = "Step2_model"  # Output directory handed to the training arguments

In [None]:
def load_model(model_name):
    '''Load the pre-trained checkpoint `model_name` twice: once as a tokenizer and
    once as a sequence-classification model.

    Returns the pair (model, tokenizer) - note the order.'''
    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    return classifier, pretrained_tokenizer

In [None]:
# NOTE: this rebinds `model`, which held the step-1 LogisticRegression until here
model, tokenizer = load_model("roberta-base")

In [None]:
# Use the Dataset library for dataset management.
# Only the two texts and the label are passed on; the id columns are left out.
pair_columns = ['a_utt', 'candidate_b_utt', 'label']
step2_train_data = step2_train_df.loc[:, pair_columns]
step2_dev_data = step2_dev_df.loc[:, pair_columns]
step2_test_data = step2_test_df.loc[:, pair_columns]
step2_train_dataset, step2_dev_dataset, step2_test_dataset = (
    Dataset.from_pandas(pair_frame)
    for pair_frame in (step2_train_data, step2_dev_data, step2_test_data)
)
step2_train_dataset[0]

In [None]:
def preprocess_data(examples, tokenizer, max_length):
    '''Tokenize utterance pairs.

    `examples` provides the 'a_utt' and 'candidate_b_utt' columns, which are passed
    to `tokenizer` together as first and second sequence. The tokenizer is asked to
    pad to `max_length` and to truncate at `max_length`.
    Return: the output of the tokenizer, unchanged.'''
    first_texts = examples['a_utt']
    second_texts = examples['candidate_b_utt']
    return tokenizer(first_texts, second_texts,
                     padding='max_length', truncation=True, max_length=max_length)

In [None]:
def _tokenize_pairs(batch):
    """Tokenize one batch with the notebook-level `tokenizer` and `max_length`."""
    return preprocess_data(batch, tokenizer, max_length)

step2_train_dataset = step2_train_dataset.map(_tokenize_pairs, batched=True)
step2_dev_dataset = step2_dev_dataset.map(_tokenize_pairs, batched=True)
step2_test_dataset = step2_test_dataset.map(_tokenize_pairs, batched=True)
# Inspect the first tokenized training example, as raw fields and as tokens
print(step2_train_dataset[0])
print(tokenizer.convert_ids_to_tokens(step2_train_dataset[0]["input_ids"]))

## Fine-tuning


In [None]:
def create_training_arguments(epochs, batch_size, learning_rate, output_dir):
    """
    Build the TrainingArguments for one fine-tuning run.

    Besides the values passed in, the run is fixed to save_strategy='no',
    evaluation_strategy='epoch' and do_eval=True.
    Args:
        epochs (int): Passed as num_train_epochs.
        batch_size (int): Passed as per_device_train_batch_size.
        learning_rate (float): Passed as learning_rate.
        output_dir (str): Passed as output_dir.
    Returns:
        The TrainingArguments object created from these settings.
    """
    settings = {
        'output_dir': output_dir,
        'save_strategy': 'no',
        'evaluation_strategy': 'epoch',
        'num_train_epochs': epochs,
        'per_device_train_batch_size': batch_size,
        'learning_rate': learning_rate,
        'do_eval': True,
    }
    return TrainingArguments(**settings)

In [None]:
# Training configuration built from the hyperparameters defined above
train_args = create_training_arguments(epochs, batch_size, learning_rate, output_dir)

In [None]:
def create_trainer(model, train_args, train_dataset, dev_dataset):
    """
    Wrap a model, its training arguments and its datasets in a Trainer.
    Args:
        model: The model to train.
        train_args: The TrainingArguments object with the settings for the training.
        train_dataset: The dataset to use for training.
        dev_dataset: The dataset passed as eval_dataset.
    Returns:
        The Trainer object created from these four pieces.
    """
    trainer_settings = {
        'model': model,
        'args': train_args,
        'train_dataset': train_dataset,
        'eval_dataset': dev_dataset,
    }
    return Trainer(**trainer_settings)

In [None]:
# Trainer for step 2: trains on the step-2 training pairs, evaluates on the dev pairs
trainer = create_trainer(model, train_args, step2_train_dataset,step2_dev_dataset)

In [None]:
# Fine-tune the model; this is the long-running cell of the notebook
trainer.train()

# Predict

In [None]:
def make_predictions(trainer, test_dataset):
    """
    Predict a label index for every example of `test_dataset`.

    Args:
        trainer: Object whose `predict(dataset)` returns something with a
            `predictions` attribute holding the logits.
        test_dataset: The dataset to make predictions on.

    Returns:
        An array with, for each example, the index of its highest logit
        (argmax over the last axis).
    """
    logits = trainer.predict(test_dataset).predictions
    return np.argmax(logits, axis=-1)

In [None]:
# Predict a label index for every (a, candidate) test pair
predictions = make_predictions(trainer, step2_test_dataset)
# Rows of step2_test_df are in the same order as the dataset built from it
step2_test_df["step2_prediction"] = predictions
step2_test_df

In [None]:
def evaluate_prediction(test_data):
    """Score the step-2 predictions stored in `test_data`.

    Compares the "step2_prediction" column with the "label" column using the
    `evaluate` metrics "accuracy" and "f1" (macro-averaged).

    Returns the pair (accuracy_result, f1_result) as produced by the metrics.
    """
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    predicted = test_data["step2_prediction"].values
    gold = test_data["label"].values

    accuracy_result = accuracy_metric.compute(predictions=predicted, references=gold)
    # 'macro' can be changed based on your needs
    f1_result = f1_metric.compute(predictions=predicted, references=gold, average='macro')

    return accuracy_result, f1_result

# Pair-level accuracy and macro F1 of step 2 over all test pairs
evaluate_prediction(step2_test_df)

# Aggregating the results

In [None]:
# Define the structure of the DataFrame for aggregation:
# one row per test utterance with the gold label and the prediction of both steps
columns = ['utt_id', 'step1_label', 'step1_prediction', 'step2_label', 'step2_prediction']

# Create an empty DataFrame with these columns; they are filled in by the cells below
aggregated_test = pd.DataFrame(columns=columns)

In [None]:
# Copy the 'utt_id' column from "test_data" to the new DataFrame
aggregated_test['utt_id'] = test_data['utt_id']

# Copy the step-1 gold label and prediction, recoded from "a" / "not-a" to 1 / 0
step1_to_int = {'a': 1, 'not-a': 0}
for step1_column in ('step1_label', 'step1_prediction'):
    aggregated_test[step1_column] = test_data[step1_column].map(step1_to_int)

aggregated_test

In [None]:
# Replace the first element of each inner list in test_candidate_utt_labels (the "a"
# placeholder) with the utt_id heading the matching list in test_candidate_utt_ids
for label_group, id_group in zip(test_candidate_utt_labels, test_candidate_utt_ids):
    label_group[0] = id_group[0]

test_candidate_utt_labels


In [None]:
# Gold step-2 label per "a" utterance: 1 if at least one of its candidates is labelled
# 1 ("b"), else 0. An utterance can open several turns (e.g. "1a.2a") and then heads
# several candidate groups; those groups are OR-ed together so that a later group cannot
# overwrite an earlier hit. This matches the predictions, which are summed per a_utt_id
# further down.
closed_by_utt_id = {}
for utt_ids, utt_labels in zip(test_candidate_utt_ids, test_candidate_utt_labels):
    utt_id = utt_ids[0]  # the "a" utterance heads every candidate id group
    has_b = 1 in utt_labels[1:]
    closed_by_utt_id[utt_id] = closed_by_utt_id.get(utt_id, False) or has_b

# Write the value into 'step2_label' of the row with the corresponding 'utt_id';
# rows that are not an "a" utterance keep their missing value
for utt_id, has_b in closed_by_utt_id.items():
    aggregated_test.loc[aggregated_test['utt_id'] == utt_id, 'step2_label'] = int(has_b)

aggregated_test


In [None]:

# Per "a" utterance, add up the 0/1 predictions of all its candidate pairs;
# a sum above zero means at least one candidate was predicted as "b"
predictions_per_a_utt = step2_test_df.groupby('a_utt_id')['step2_prediction'].sum()
step2_prediction_sum = predictions_per_a_utt.reset_index()

step2_prediction_sum


In [None]:

# Add the 'step2_prediction' column to 'aggregated_test', defaulting to 0
aggregated_test['step2_prediction'] = 0

# Every 'utt_id' whose summed 'step2_prediction' in 'step2_prediction_sum' is non-zero
# gets the value 1; all other rows keep the default 0
has_predicted_b = step2_prediction_sum['step2_prediction'] != 0
predicted_closed_ids = step2_prediction_sum.loc[has_predicted_b, 'a_utt_id']
aggregated_test.loc[aggregated_test['utt_id'].isin(predicted_closed_ids), 'step2_prediction'] = 1

aggregated_test


In [None]:
# Any value still missing (e.g. the step-2 label of a non-"a" utterance) becomes 0
aggregated_test = aggregated_test.fillna(0)
aggregated_test

In [None]:
# Add the gold three-way label 'y_test' to 'aggregated_test'

def _gold_status(row):
    """"NA" for a non-"a" utterance, "closed" for an "a" with a "b", otherwise "open"."""
    if row['step1_label'] == 0:
        return "NA"
    if row['step1_label'] == 1 and row['step2_label'] == 1:
        return "closed"
    return "open"

aggregated_test['y_test'] = aggregated_test.apply(_gold_status, axis=1)


In [None]:
# Add the predicted three-way label 'y_pred' to 'aggregated_test'

def _predicted_status(row):
    """"NA" when step 1 predicts non-"a", "closed" when both steps predict 1, otherwise "open"."""
    if row['step1_prediction'] == 0:
        return "NA"
    if row['step1_prediction'] == 1 and row['step2_prediction'] == 1:
        return "closed"
    return "open"

aggregated_test['y_pred'] = aggregated_test.apply(_predicted_status, axis=1)
aggregated_test

In [None]:
# classification_report is already imported near the top of the notebook; repeated here
from sklearn.metrics import classification_report
# Text report comparing the aggregated gold labels (y_test) with the predictions (y_pred),
# with values shown to three digits
class_report = classification_report(aggregated_test['y_test'], aggregated_test['y_pred'], digits=3)
class_report


In [None]:
# print() renders the line breaks of the report, unlike the raw string shown above
print(class_report)

In [None]:
# Weighted-average F1 score of the aggregated labels, rounded to three decimals
f1_weighted = round(
    f1_score(aggregated_test['y_test'], aggregated_test['y_pred'], average='weighted'),
    3,
)

In [None]:
# f1_weighted was already rounded to three decimals when it was computed; display it
f1_weighted