This script allows quick and flexible translation and subsequent scoring of non-English texts with the Autobiographical Interview (Levine et al., 2002). It relies heavily on the automated scoring approach developed and provided by van Genugten & Schacter (2024).  
It is a generalization and extension of the code used by Annika Kuelpmann for this project: https://osf.io/vcns4.  
The license for this script is:   
The authors of this script are: Giuliano Groer (https://orcid.org/0009-0009-2656-3796), Annika Kuelpmann (https://orcid.org/0000-0002-0256-2037), Laurin Plank (https://orcid.org/0000-0002-8846-5405)

In [None]:
import os
import sys
import subprocess

In [None]:
# Install the project's dependencies from requirements.txt when that file is present.
# The path is resolved relative to the current working directory.
requirements_file = "requirements.txt"

if not os.path.exists(requirements_file):
    print("No requirements.txt found.")
else:
    print("Found requirements.txt, installing packages...")
    # Invoke pip through the running interpreter (sys.executable) so the packages
    # are installed for this interpreter; check_call raises CalledProcessError
    # when pip exits with a non-zero status.
    pip_command = [sys.executable, "-m", "pip", "install", "-r", requirements_file]
    subprocess.check_call(pip_command)

In [None]:
#loading packages
import numpy as np
from tqdm import tqdm
import pandas as pd
import os
import re
import tensorflow as tf
import pysbd

from textblob_de import TextBlobDE
from deep_translator import GoogleTranslator

from transformers import (
    AutoTokenizer,
    TFDistilBertForSequenceClassification,
    TextClassificationPipeline,
    MarianMTModel,
    MarianTokenizer,
    MBartForConditionalGeneration,
    MBart50Tokenizer
)


# Scoring model of van Genugten & Schacter (2024), referenced by its model identifier
aiscoring = 'vangenugtenr/autobiographical_interview_scoring'
# TensorFlow DistilBERT sequence classifier that assigns a label to each sentence
model_genugten = TFDistilBertForSequenceClassification.from_pretrained(aiscoring)
# tokenizer loaded from the same identifier so that it matches the model
tokenizer = AutoTokenizer.from_pretrained(aiscoring)


[nltk_data] Downloading package punkt to C:\Users\Giuliano
[nltk_data]     Groer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Load your data.
The data must contain three columns: "response" (the text to be translated), "ResponseId" (the ID of the subject who provided the response), and "response_number" (the number of the response; use 1 if there is only one response per subject).

In [None]:
# Read the raw data: comma-separated, column names in the first row, '.' as the
# decimal mark, UTF-8 encoded. Replace the placeholder with the path to your file.
language_data = pd.read_csv("LOCATION_OF_YOUR_DATAFILE.csv", sep= ",",header = 0, decimal='.', encoding= 'utf-8')

In [None]:
# --- Unified translator function ---
# Backends are built once and reused: constructing them inside translate_row would
# repeat the same from_pretrained() calls for every single row of the data.
_TRANSLATION_BACKENDS = {}


def _get_translation_backend(translator_name):
    """Build (on first use) and return the backend for *translator_name*.

    Returns a ``(model, tokenizer)`` tuple for 'mbart' and 'marian', and a
    GoogleTranslator instance for 'google'. All backends are configured for
    German -> English.

    Raises:
        ValueError: if *translator_name* is not one of the supported names.
    """
    if translator_name in _TRANSLATION_BACKENDS:
        return _TRANSLATION_BACKENDS[translator_name]

    if translator_name == "mbart":
        mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"
        model_mbart = MBartForConditionalGeneration.from_pretrained(mbart_model_name)
        tokenizer_mbart = MBart50Tokenizer.from_pretrained(mbart_model_name)
        tokenizer_mbart.src_lang = "de_DE"
        backend = (model_mbart, tokenizer_mbart)
    elif translator_name == "marian":
        marian_model_name = "Helsinki-NLP/opus-mt-de-en"
        model_marian = MarianMTModel.from_pretrained(marian_model_name)
        tokenizer_marian = MarianTokenizer.from_pretrained(marian_model_name)
        backend = (model_marian, tokenizer_marian)
    elif translator_name == "google":
        backend = GoogleTranslator(source='de', target='en')
    else:
        raise ValueError("Translator must be 'mbart', 'marian', or 'google'")

    _TRANSLATION_BACKENDS[translator_name] = backend
    return backend


def translate_row(row, translator_name):
    """Translate the 'response' text of one data row from German to English.

    Args:
        row: one row of the data, holding the text under 'response'. May be a
            pandas Series or dict (key access) or a namedtuple as produced by
            ``DataFrame.itertuples()`` (attribute access).
        translator_name: 'mbart', 'marian', or 'google'.

    Returns:
        The translated text. A missing response (None or NaN) yields an empty
        string so that a single empty cell does not abort a whole run.

    Raises:
        ValueError: if *translator_name* is not a supported translator.

    NOTE(review): the mbart and marian inputs are tokenized with
    ``truncation=True``, so text beyond the tokenizer's maximum length is cut
    off before translation -- confirm this is acceptable for long narratives.
    """
    # Validates the translator name and gives us the already-loaded model.
    backend = _get_translation_backend(translator_name)

    # namedtuples expose fields as attributes and reject string keys.
    text = row.response if hasattr(row, "_fields") else row['response']

    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if translator_name == "google":
        return backend.translate(text)

    model, text_tokenizer = backend
    if translator_name == "mbart":
        inputs = text_tokenizer(text, return_tensors="pt", truncation=True)
        translated_tokens = model.generate(
            **inputs,
            # force English as the language of the generated sequence
            forced_bos_token_id=text_tokenizer.lang_code_to_id["en_XX"]
        )
    else:  # "marian"
        inputs = text_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        translated_tokens = model.generate(**inputs)
    return text_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


def translate_dataframe(df, translator_name):
    """Translate every response in *df* and return an annotated copy.

    Args:
        df: DataFrame with the columns 'response', 'ResponseId' and
            'response_number'. The input is not modified.
        translator_name: 'mbart', 'marian', or 'google'; forwarded to
            translate_row.

    Returns:
        A copy of *df* with two added columns: 'translation' (the translated
        text of each row) and 'ID_new' ("<ResponseId>_<response_number>").
    """
    df = df.copy()

    # iterrows() yields each row as a Series, which supports the
    # row['response'] lookup done by translate_row; the namedtuples produced by
    # itertuples() cannot be indexed with a string key.
    translations = [
        translate_row(row, translator_name)
        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Translating with {translator_name}")
    ]
    df['translation'] = translations

    # Unique identifier of a single response: subject ID plus response number
    df["ID_new"] = df["ResponseId"].astype(str) + "_" + df["response_number"].astype(str)

    return df


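Pass your data to `translate_dataframe` and specify a translator. Note that all three translators are currently configured for German-to-English translation.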

In [None]:
# Select the translation backend here; the call below uses this variable
translator_choice = "mbart"  # or "marian", or "google"

# Translate entire DataFrame
translated_language_data = translate_dataframe(language_data, translator_choice)


For scoring, use the code cells below.
The code is a modified version of the code provided by van Genugten & Schacter (2024) using their model and general coding approach.

In [None]:
# --- helper functions to allow scoring --- 
# English sentence segmenter used by reshape_to_long_format to split narratives
seg = pysbd.Segmenter(language="en", clean=False)
# Classification pipeline around the scoring model and its tokenizer.
# return_all_scores=True is requested because predictions() reads the
# label/score entries returned for each sentence.
pipe = TextClassificationPipeline(model=model_genugten, tokenizer=tokenizer, return_all_scores=True)

def reshape_to_long_format(data):
    """Split every narrative into sentences, producing one row per sentence.

    Args:
        data: DataFrame with an "ID_new" column and the English narrative in
            either "response_eng" or "translation" (the column written by
            translate_dataframe). "response_eng" is used when both exist.

    Returns:
        DataFrame in which each input row is repeated once per sentence of its
        narrative, with the sentence stored in a new "sentence" column. A
        narrative for which the segmenter returns no sentences contributes no
        rows. The index restarts for every narrative and is therefore not
        unique.

    Raises:
        KeyError: if neither narrative column is present.
    """
    # Accept both the historical column name and the one translate_dataframe emits.
    if "response_eng" in data.columns:
        text_column = "response_eng"
    elif "translation" in data.columns:
        text_column = "translation"
    else:
        raise KeyError("Expected the English text in a 'response_eng' or 'translation' column")

    id_location = data.columns.get_loc("ID_new")
    text_location = data.columns.get_loc(text_column)

    list_of_dataframes = []
    for row in range(data.shape[0]):
        # keep the narrative's row as a one-row DataFrame for the merge below
        current_row = data.iloc[[row], :]
        # one row per sentence, tagged with the narrative's ID
        segmented_sentences = seg.segment(data.iloc[row, text_location])
        sentences_df = pd.DataFrame(segmented_sentences, columns=['sentence'])
        sentences_df["ID_new"] = data.iloc[row, id_location]
        # merging on the ID repeats the narrative's columns for each sentence
        list_of_dataframes.append(pd.merge(current_row, sentences_df, on=["ID_new"]))
    return pd.concat(list_of_dataframes)

def prepare(data):
    """Tokenize the sentences in *data* and wrap the encodings in a TensorFlow dataset.

    Args:
        data: DataFrame with a "sentence" column.

    Returns:
        Tuple ``(dataset, test_texts)``: a ``tf.data.Dataset`` built from the
        tokenizer output, and the list of sentences converted with ``str()``.
    """
    sentence_location = data.columns.get_loc("sentence")
    # str() guards against non-string cell values before tokenization
    test_texts = [str(data.iloc[position, sentence_location]) for position in range(data.shape[0])]
    # truncate / pad so that all encoded sentences share one length
    encodings = tokenizer(test_texts, truncation=True, padding=True)
    return tf.data.Dataset.from_tensor_slices((dict(encodings))), test_texts

def classification(data, pipeline=None, batch_size=200):
    """Classify every sentence of *data*, batch by batch.

    Args:
        data: DataFrame with a "sentence" column; every entry is converted
            with ``str()`` before classification.
        pipeline: callable that takes a list of sentences and returns one
            result per sentence. When omitted, the module-level ``pipe`` is
            used.
        batch_size: number of sentences handed to the pipeline per call;
            batching limits memory use.

    Returns:
        List with one pipeline result per sentence, in row order.

    Raises:
        ValueError: if *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    classifier = pipe if pipeline is None else pipeline

    # Only the raw sentences are needed here: the pipeline tokenizes each batch
    # itself, so the whole data set is not encoded up front.
    test_texts = [str(sentence) for sentence in data["sentence"]]

    stored_test = []
    for start in range(0, len(test_texts), batch_size):
        stored_test.extend(classifier(test_texts[start:start + batch_size]))
    return stored_test

def predictions(data, stored_test):
    """Attach the per-label scores, the top label and a word count to each sentence.

    Args:
        data: sentence-level DataFrame with a "sentence" column.
        stored_test: one entry per sentence; each entry is a list of dicts
            with the keys 'label' and 'score'.

    Returns:
        DataFrame consisting of *data* (index reset), one score column per
        label, 'toplabel' (the label with the highest score) and
        'sentenceWordCount'.
    """
    def count_words(sentence):
        # a "word" is a maximal run of \w characters
        return len(re.findall(r'\w+', str(sentence)))

    # One single-row frame per sentence: labels become the columns, scores the values
    per_sentence_scores = [
        pd.DataFrame(label_scores).set_index('label').transpose()
        for label_scores in stored_test
    ]
    predictionsDf = pd.concat(per_sentence_scores)
    # most likely label of every sentence
    predictionsDf['toplabel'] = predictionsDf.idxmax(axis=1)

    # Both frames are aligned by position, so reset their indices before joining
    left = data.reset_index(drop=True)
    right = predictionsDf.reset_index(drop=True)
    merged_data = pd.concat([left, right], axis=1)

    merged_data['sentenceWordCount'] = merged_data['sentence'].apply(count_words)
    return merged_data

def predicted_words(df):
    """Convert each sentence's top label into internal / external word counts.

    The sentence's word count is split according to its label:

    * LABEL_0: all words external
    * LABEL_1: half internal, half external
    * LABEL_2: three quarters internal, one quarter external
    * LABEL_3: all words internal

    Any other label yields 0 for both counts.

    Args:
        df: DataFrame with the columns "toplabel" and "sentenceWordCount".
            It is modified in place.

    Returns:
        The same DataFrame with the float columns 'internal_pred' and
        'external_pred' added (or overwritten).
    """
    # share of a sentence's words counted as internal, per label
    internal_share_by_label = {
        'LABEL_0': 0.0,
        'LABEL_1': 0.5,
        'LABEL_2': 0.75,
        'LABEL_3': 1.0,
    }
    # Plain float arrays: the counts are fractional, so the columns are created
    # as floats from the start rather than as integers that later receive floats.
    internal_share = df["toplabel"].map(internal_share_by_label).to_numpy(dtype=float)
    word_counts = df["sentenceWordCount"].to_numpy(dtype=float)
    # labels missing from the table map to NaN and must count as 0 / 0
    recognised = ~np.isnan(internal_share)

    df['internal_pred'] = np.where(recognised, word_counts * internal_share, 0.0)
    df['external_pred'] = np.where(recognised, word_counts * (1.0 - internal_share), 0.0)
    return df

def sum_narrative(df):
    """Aggregate the sentence-level counts to one row per narrative.

    Args:
        df: sentence-level DataFrame with the columns "ID_new",
            "internal_pred", "external_pred" and "sentenceWordCount".

    Returns:
        DataFrame with one row per "ID_new" holding the summed
        'internal_pred', 'external_pred' and 'total_words' (the summed
        sentence word counts).
    """
    summed_columns = ["internal_pred", "external_pred", "sentenceWordCount"]
    # restrict to the identifier and the columns that are summed
    relevant = df.loc[:, ["ID_new"] + summed_columns]
    per_narrative = relevant.groupby(by=["ID_new"])[summed_columns].sum()
    # turn the group key back into an ordinary column
    per_narrative = per_narrative.reset_index()
    return per_narrative.rename(columns={"sentenceWordCount": "total_words"})




In [None]:
# --- core function to score the language data ---
def process_translated_data(translated_language_data, pipe):
    """Score translated narratives, chaining the helper functions in order.

    Steps: reshape_to_long_format (one row per sentence), classification,
    predictions (scores, top label and word count per sentence),
    predicted_words (internal / external word counts) and sum_narrative
    (totals per narrative).

    Args:
        translated_language_data: DataFrame of translated narratives, passed
            to reshape_to_long_format.
        pipe: classification pipeline, passed as second argument to
            classification().

    Returns:
        The DataFrame returned by sum_narrative: one row per narrative.
    """
    # narrative level -> sentence level
    sentences_long = reshape_to_long_format(translated_language_data)

    # label scores for every sentence
    sentence_scores = classification(sentences_long, pipe)

    # sentences combined with their scores, then turned into word counts
    scored_sentences = predictions(sentences_long, sentence_scores)
    sentences_with_counts = predicted_words(scored_sentences)

    # sentence level -> narrative level
    return sum_narrative(sentences_with_counts)


Simply put your translated data into the following function to get the scored data

In [None]:
# Score the translated narratives; the result is the per-narrative summary
# returned by process_translated_data.
scored_data = process_translated_data(translated_language_data, pipe)