In [None]:
"""
lexical_features.ipynb

This script calculates lexical diversity metrics (TTR and Brunet's Index)
from a dataset of spoken language transcripts. Developed as part of a study
on multilingual lexical markers and depression severity.

Usage:
- Place your input CSV file in the /data folder.
- Ensure the file contains a column named 'Text' with participant responses.
- Run the script after setting the correct working directory and file name.

Author: Anastasiia Tokareva
"""

# Set your working directory.
# `os` is imported in this cell because it is used here, before the cell
# that holds the other imports runs; without it a fresh kernel raises NameError.
import os

data_dir = "you/directory/here"   # replace with the folder that contains your data
os.chdir(data_dir)


In [None]:
## 1. Load libraries, set wd, and load data

import os
import pandas as pd

# Install and import the NLTK library for natural language processing
! pip install nltk
import nltk

# Donwload Punkt tokenizer (tokenises text into sentebces and sentences into words)
nltk.download('punkt')

# Load the data
data = pd.read_csv('your_speech_transcript_dataset.csv')  # enter the name of your file
print(data.head)

In [None]:
## 2. Calculate TTR for each row and add it as a new column
# TTR = n_unique_words/n_total_words * 100%

def calculate_ttr(text):
    """Return the type-token ratio (TTR) of `text` as a percentage.

    TTR = number of unique tokens / total number of tokens * 100.

    The text is lowercased and tokenised with nltk.word_tokenize before
    counting, so tokens that differ only in case are counted as one type.
    NOTE(review): every token returned by the tokenizer is counted as a
    word - confirm that this is what the analysis intends for punctuation.

    Parameters
    ----------
    text : any
        One cell of the transcript column. Only str values are scored.

    Returns
    -------
    float or int
        The TTR in percent, or 0 when `text` is not a string or yields
        no tokens.
    """
    # Non-string entries are not scored
    if not isinstance(text, str):
        return 0

    words = nltk.word_tokenize(text.lower())

    # An empty response has no tokens; return 0 instead of dividing by zero
    if len(words) == 0:
        return 0

    vocabulary = set(words)   # duplicates collapse, leaving one entry per type
    return (len(vocabulary) / len(words)) * 100

# Score every entry of the 'Text' column with calculate_ttr and store the
# result in a new 'TTR' column - change the column name if needed
data['TTR'] = data['Text'].map(calculate_ttr)


In [3]:
## 3. Calculate Brunet's index for each row and add it as a new column
# Brunet's index of lexical diversity: W = N ** (V ** -alpha),
# where N = total number of tokens and V = number of unique tokens

# Standard value of the constant. It is stored as a positive number because
# the formula below already applies the minus sign (V ** -alpha); storing it
# as -0.165 would flip the exponent to V ** +0.165.
alpha = 0.165

def calculate_brunet(text, alpha=alpha):
    """Return Brunet's index W = N ** (V ** -alpha) for `text`.

    The text is lowercased and tokenised with nltk.word_tokenize; N is the
    number of tokens and V the number of distinct tokens.

    Parameters
    ----------
    text : any
        One cell of the transcript column. Only str values are scored.
    alpha : float, optional
        Positive constant of the formula; it is negated inside the
        exponent. Defaults to the module-level `alpha` (0.165).

    Returns
    -------
    float or int
        Brunet's index, or 0 when `text` is not a string or yields no
        tokens.
    """
    if not isinstance(text, str):
        return 0     # if not a string

    tokens = nltk.word_tokenize(text.lower())
    n_total_words = len(tokens)
    n_unique_words = len(set(tokens))

    # A non-empty token list always has at least one unique token, so
    # checking the total is enough to rule out both zero counts.
    if n_total_words == 0:
        return 0   # if no words

    return n_total_words ** (n_unique_words ** -alpha)


# Score every entry of the 'Text' column with calculate_brunet and store the
# result in a new 'Brunet' column - change column name if needed
data['Brunet'] = data['Text'].map(calculate_brunet)

# Write the table, including the new columns, to a new csv file in the
# current working directory (the row index is not written)
output_file = 'RADAR_LIWC_Brunet.csv'
data.to_csv(path_or_buf=output_file, index=False)

print(f"Updated csv file saved as {output_file}")

Updated csv file saved as RADAR_LIWC_Brunet.csv
