# Scoring of the Autobiographical Interview with NLP

This code automatically scores text data using natural language processing. The trained language model of van Genugten and Schacter (2024) is used to score English text data in accordance with the Autobiographical Interview method (Levine et al., 2002). 

## License and Copyright Note

The code is based on the [Colab Notebook](https://colab.research.google.com/github/rubenvangenugten/autobiographical_interview_scoring/blob/main/automated_autobiographical_interview_scoring_share.ipynb) by Ruben van Genugten, published on [GitHub](https://github.com/rubenvangenugten/autobiographical_interview_scoring) under the GPL-3.0 license. It has been modified to adapt it to our study and data. 

In [8]:
# install packages if necessary
!pip install numpy
!pip install tensorflow
!pip install pysbd

In [9]:
# import packages
import pandas as pd
import os
import numpy as np
import tensorflow as tf
import pysbd
import re
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

In [None]:
# read the three translated data sets, one CSV file each
marian, mbart, google = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/translated/marian.csv",
        "../data/translated/mbart.csv",
        "../data/translated/google.csv",
    )
)

In [None]:
# access trained model and tokenizer
# this model was trained and used by van Genugten & Schacter (2024)
# the same identifier is passed to from_pretrained() for the model and the tokenizer
aiscoring = 'vangenugtenr/autobiographical_interview_scoring'
model = TFDistilBertForSequenceClassification.from_pretrained(aiscoring)
tokenizer = AutoTokenizer.from_pretrained(aiscoring)

In [12]:
# here we create a data frame for each data set with the text data in long format
# the long format contains one row per sentence, which can then be classified

# define the English sentence segmenter used by reshape_to_long_format below
seg = pysbd.Segmenter(language="en", clean=False)

# define function 
def reshape_to_long_format(data):
    """Expand a one-row-per-narrative data frame to one row per sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs an "ID" column and a "text_eng" column holding the narrative.

    Returns
    -------
    pandas.DataFrame
        All columns of ``data`` plus a "sentence" column. The rows of each
        narrative are repeated once per sentence found by ``seg``. The
        per-narrative frames are concatenated as they are, so index values
        repeat across narratives.
    """
    per_narrative = []
    for pos in range(len(data)):
        # ID and narrative text of the current row
        subject_id = data["ID"].iloc[pos]
        text = data["text_eng"].iloc[pos]
        # one row per segmented sentence, tagged with the narrative's ID
        sentences = pd.DataFrame(seg.segment(text), columns=['sentence'])
        sentences["ID"] = subject_id
        # attach the narrative's other columns to each of its sentences
        narrative_row = data.iloc[[pos], :]
        per_narrative.append(pd.merge(narrative_row, sentences, on=["ID"]))
    return pd.concat(per_narrative)

# build the sentence-level (long) version of every data set with the function above
marian_long, mbart_long, google_long = (
    reshape_to_long_format(wide_df) for wide_df in (marian, mbart, google)
)

In [13]:
# here we prepare the data sets for the model
# that means the data are shaped so that BERT is able to work with them

# make sure the sentence column holds strings
# (prepare() below applies str() to every sentence once more)
marian_long.loc[:,'sentence'] = marian_long.loc[:,'sentence'].astype('str')
mbart_long.loc[:,'sentence'] = mbart_long.loc[:,'sentence'].astype('str')
google_long.loc[:,'sentence'] = google_long.loc[:,'sentence'].astype('str')

# define a function 
def prepare(data):
    """Turn the "sentence" column of ``data`` into model inputs.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data frame with a "sentence" column.

    Returns
    -------
    tuple
        ``(dataset, test_texts)``: a TensorFlow dataset built from the
        tokenizer encodings, and the list of sentences as strings.
    """
    # every sentence as a string, in row order
    test_texts = [str(sentence) for sentence in data["sentence"]]
    # encode all sentences at once for the BERT model (truncated and padded)
    encodings = tokenizer(test_texts, truncation=True, padding=True)
    # wrap the encodings in a TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices(dict(encodings))
    return dataset, test_texts

In [None]:
# here we classify the sentences 

# set up text classification pipeline using the defined model and tokenizer
# return_all_scores=True is passed so that a score for every label is requested
# NOTE(review): newer transformers releases reportedly deprecate
# return_all_scores in favour of top_k=None — confirm against the installed version
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

def classification(data):
    """Classify every sentence in ``data`` with the classification pipeline.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data frame with a "sentence" column.

    Returns
    -------
    list
        The pipeline output for each sentence, in row order.
    """
    # Only the raw strings are handed to the pipeline, which was built with
    # its own tokenizer. Calling prepare() here would encode and pad the
    # whole data set up front for a dataset that is never used, which works
    # against the batching below.
    test_texts = [str(sentence) for sentence in data["sentence"]]
    # split classification up into batches of sentences to manage RAM
    stored_test = []
    batch_size = 200
    for start in range(0, len(test_texts), batch_size):
        stored_test.extend(pipe(test_texts[start:start + batch_size]))
    return stored_test

# here we process our data sets using the function
# each call is its own statement, so results that are already computed
# stay assigned if a later call fails
marian_classified = classification(marian_long)
mbart_classified = classification(mbart_long)
google_classified = classification(google_long)

In [16]:
# here we generate new dataframes with predictions 

def predictions(data, stored_test):
    """Combine the sentence-level data with the classifier scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data frame with a "sentence" column, one row per
        classified sentence.
    stored_test : list
        One entry per row of ``data``, in the same order. Each entry is a
        list of dicts with a "label" and a "score" key.

    Returns
    -------
    pandas.DataFrame
        ``data`` (with a fresh 0..n-1 index) plus one score column per label,
        "toplabel" (the label with the highest score) and
        "sentenceWordCount" (number of word tokens in the sentence).

    Raises
    ------
    ValueError
        If ``stored_test`` and ``data`` differ in length. The two are joined
        by position, so a mismatch would pair scores with the wrong sentences.
    """
    if len(stored_test) != len(data):
        raise ValueError(
            f"got {len(stored_test)} predictions for {len(data)} sentences"
        )
    # one row per sentence, one column per label, built in a single pass
    # instead of creating and concatenating a small data frame per sentence
    score_rows = [
        {entry["label"]: entry["score"] for entry in sentence_scores}
        for sentence_scores in stored_test
    ]
    predictionsDf = pd.DataFrame(score_rows)
    # identify the most likely label for each sentence
    predictionsDf['toplabel'] = predictionsDf.idxmax(axis=1)
    # join by position: the index of data repeats across narratives, so it is reset
    merged_data = pd.concat([data.reset_index(drop=True), predictionsDf], axis=1)
    # add a variable with a word count for each sentence
    merged_data['sentenceWordCount'] = merged_data['sentence'].apply(
        lambda x: len(re.findall(r'\w+', str(x)))
    )
    return merged_data

# pair every long-format data set with its classifier output
marian_predictions, mbart_predictions, google_predictions = (
    predictions(long_df, scores)
    for long_df, scores in (
        (marian_long, marian_classified),
        (mbart_long, mbart_classified),
        (google_long, google_classified),
    )
)


In [None]:
# preview the first rows of the sentence-level predictions
marian_predictions.head()

In [None]:
# here we create two variables with predicted classifications:
# number of words classified as internal, and number of words classified as external

def predicted_words(df):
    """Add predicted internal and external word counts for each sentence.

    The word count of a sentence is split between the two categories
    according to its top label:

    * ``LABEL_0``: all words external
    * ``LABEL_1``: half internal, half external
    * ``LABEL_2``: three quarters internal, one quarter external
    * ``LABEL_3``: all words internal

    Any other label leaves both counts at 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "toplabel" and "sentenceWordCount". It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the float columns "internal_pred" and
        "external_pred" added (or overwritten).
    """
    # share of a sentence's words counted as internal / external per label
    internal_share = {"LABEL_0": 0.0, "LABEL_1": 0.5, "LABEL_2": 0.75, "LABEL_3": 1.0}
    external_share = {"LABEL_0": 1.0, "LABEL_1": 0.5, "LABEL_2": 0.25, "LABEL_3": 0.0}

    labels = df["toplabel"]
    # Work on plain arrays so the result is assigned by position, regardless
    # of the index. Both columns are always float, because half and quarter
    # counts do not fit into an integer column.
    word_counts = df["sentenceWordCount"].to_numpy(dtype=float)
    # labels missing from the tables map to NaN and are counted as 0
    internal = labels.map(internal_share).fillna(0.0).to_numpy(dtype=float)
    external = labels.map(external_share).fillna(0.0).to_numpy(dtype=float)

    df["internal_pred"] = word_counts * internal
    df["external_pred"] = word_counts * external
    return df

# add the internal/external word counts to every prediction data frame
marian_new, mbart_new, google_new = (
    predicted_words(sentence_df)
    for sentence_df in (marian_predictions, mbart_predictions, google_predictions)
)


In [None]:
# preview the first rows, now including internal_pred and external_pred
marian_new.head()

In [10]:
# sum up internal and external words for each narrative
# because right now we are still on sentence level

def sum_narrative(df):
    """Aggregate the sentence-level counts to one row per narrative ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "ID", "internal_pred", "external_pred" and
        "sentenceWordCount".

    Returns
    -------
    pandas.DataFrame
        One row per "ID" with the summed "internal_pred" and "external_pred"
        and the summed word count, which is returned as "total_words".
    """
    # keep only the grouping key and the three columns that are summed
    wanted = ["ID", "internal_pred", "external_pred", "sentenceWordCount"]
    per_narrative = df[wanted].groupby("ID").sum()
    # turn the ID index back into a regular column
    per_narrative = per_narrative.reset_index()
    return per_narrative.rename(columns={"sentenceWordCount": "total_words"})

# collapse every sentence-level data frame to one row per narrative
marian_grouped, mbart_grouped, google_grouped = (
    sum_narrative(sentence_df)
    for sentence_df in (marian_new, mbart_new, google_new)
)


In [None]:
# write data sets into csv files
# create the output folder first if it is missing, so the writes below
# do not fail on a fresh checkout
os.makedirs('../data/scored', exist_ok=True)
marian_grouped.to_csv('../data/scored/marian_scored.csv', index=False)
mbart_grouped.to_csv('../data/scored/mbart_scored.csv', index=False)
google_grouped.to_csv('../data/scored/google_scored.csv', index=False)