In [45]:
import warnings

warnings.filterwarnings("ignore")

import os
import sys
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

In [46]:
interview_data = pd.read_excel(
    "../data/DAIC_demographic_data.xlsx",
    sheet_name="Interview_Data",
    skiprows=lambda x: x == 1,
)
# interview_data.head()

phq_score = pd.read_excel(
    "../data/DAIC_demographic_data.xlsx", sheet_name="Metadata_mapping"
)

# phq_score.head()

The following step was a workaround to prevent the end of file error due to unclosed inverted commas. What the code is doing - opening and reading each file, reading each line and checking for lines that start with " but does not end with " (i.e., checking for unclosed quotes).

In [47]:
# preprocessing the csv files to remove the unclosed inverted commas
def clean_csv(file_path, output_path):
    with open(file_path, "r") as infile, open(output_path, "w") as outfile:
        for line in infile:
            if line.startswith('"') and not line.endswith('"\n'):
                line = line[1:]
            outfile.write(line)


def process_directory(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if filename.endswith(".csv"):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            clean_csv(input_path, output_path)


input_dir = "../data/E-DAIC_Transcripts"
output_dir = "../data/E-DAIC_Transcripts_cleaned"

process_directory(input_dir, output_dir)

In [48]:
cleaned_transcripts_path = "../data/E-DAIC_Transcripts_cleaned"

transcripts = {}

# loop through each file in the folder, load it, and store the content
for filename in os.listdir(cleaned_transcripts_path):
    if filename.endswith("_Transcript.csv"):
        participant_id = filename.split("_")[0]  # extract the participant id
        file_path = os.path.join(cleaned_transcripts_path, filename)

        df = pd.read_csv(file_path)
        transcripts[participant_id] = df

# transcripts
transcripts["386"]

Unnamed: 0,Text
0,might have pulled something that
1,I'm going to bring the great thanks so much
2,and please
3,are you okay with this yes
4,oh I'm fine I'm a little tired but I found ou...
...,...
76,yeah well after college people usually many p...
77,thank you goodbye
78,oh that was that was fast
79,but I didn't never said there wasn't any like...


In [49]:
# combine all the text data for each participant into a single string
combined_transcripts = {}

for participant_id, transcript in transcripts.items():
    combined_transcripts[participant_id] = " ".join(transcript["Text"].astype(str))

# removing extra spaces caused by newlines
for participant_id, transcript in combined_transcripts.items():
    combined_transcripts[participant_id] = " ".join(transcript.split())

# combined_transcripts["386"]

In [50]:
# randomly split the transcripts into 5 folds
import random
import math

all_keys = list(combined_transcripts.keys())
random.seed(42)
random.shuffle(all_keys)

num_folds = 5
fold_size = math.ceil(len(all_keys) / num_folds)
folds = []

for i in range(num_folds):
    start = i * fold_size
    end = start + fold_size
    fold_keys = all_keys[start:end]
    fold = {key: combined_transcripts[key] for key in fold_keys}
    folds.append(fold)


for i, fold in enumerate(folds):
    print(f"Fold {i+1} contains {len(fold)} transcripts")

Fold 1 contains 38 transcripts
Fold 2 contains 38 transcripts
Fold 3 contains 38 transcripts
Fold 4 contains 38 transcripts
Fold 5 contains 38 transcripts


## Extracting Language Features
### _CountVectorizer_

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

fold_features_count = {}

for i, fold in enumerate(folds):
    print(f"Processing fold {i+1}")

    fold_transcripts = list(fold.values())
    fold_ids = list(fold.keys())

    count_vectorizer = CountVectorizer()
    count_features = count_vectorizer.fit_transform(fold_transcripts)
    fold_features_count[f"Fold_{i+1}"] = {
        "features": count_features,
        "vocabulary": count_vectorizer.get_feature_names_out(),
    }
    print(f"Fold {i+1} Count Features Shape: {count_features.shape}")
    print(f"First 10 words: {', '.join(count_vectorizer.get_feature_names_out()[:10])}...\n")

    # TODO: might have to do some preprocessing to remove the stop words, numbers and more

Processing fold 1
Fold 1 Count Features Shape: (38, 3783)
First 10 words: 00, 10, 10th, 11, 12, 13, 14, 15, 15th, 16...

Processing fold 2
Fold 2 Count Features Shape: (38, 3517)
First 10 words: 00, 07, 08, 09, 10, 100, 101, 11, 12, 13...

Processing fold 3
Fold 3 Count Features Shape: (38, 3731)
First 10 words: 00, 04, 08, 10, 100, 101, 11, 11th, 12, 14...

Processing fold 4
Fold 4 Count Features Shape: (38, 3992)
First 10 words: 00, 02, 10, 100, 1000000, 101, 11, 12, 13, 14...

Processing fold 5
Fold 5 Count Features Shape: (38, 3712)
First 10 words: 00, 10, 11th, 12, 12th, 13, 14, 15, 16, 17...



### _TF-IDF Vectorizer_

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

fold_features_tfidf = {}

for i, fold in enumerate(folds):
    print(f"Processing fold {i+1}")

    fold_transcripts = list(fold.values())
    fold_ids = list(fold.keys())

    # TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_features = tfidf_vectorizer.fit_transform(fold_transcripts)
    fold_features_tfidf[f"Fold_{i+1}"] = {
        "features": tfidf_features,
        "vocabulary": tfidf_vectorizer.get_feature_names_out(),
    }
    print(f"Fold {i+1} TF-IDF Features Shape: {tfidf_features.shape}")
    print(
        f"Fold {i+1} TF-IDF Vocabulary: {', '.join(tfidf_vectorizer.get_feature_names_out()[:10])}...\n"
    )

    # TODO: maybe can also see the top N terms in each document

Processing fold 1
Fold 1 TF-IDF Features Shape: (38, 3783)
Fold 1 TF-IDF Vocabulary: 00, 10, 10th, 11, 12, 13, 14, 15, 15th, 16...

Processing fold 2
Fold 2 TF-IDF Features Shape: (38, 3517)
Fold 2 TF-IDF Vocabulary: 00, 07, 08, 09, 10, 100, 101, 11, 12, 13...

Processing fold 3
Fold 3 TF-IDF Features Shape: (38, 3731)
Fold 3 TF-IDF Vocabulary: 00, 04, 08, 10, 100, 101, 11, 11th, 12, 14...

Processing fold 4
Fold 4 TF-IDF Features Shape: (38, 3992)
Fold 4 TF-IDF Vocabulary: 00, 02, 10, 100, 1000000, 101, 11, 12, 13, 14...

Processing fold 5
Fold 5 TF-IDF Features Shape: (38, 3712)
Fold 5 TF-IDF Vocabulary: 00, 10, 11th, 12, 12th, 13, 14, 15, 16, 17...



### Semantic Features using _VADER_

The `compound` score is the overall sentiment score, which is normalized score ranging from $-1$ (most extreme negative) to $+1$ (most extreme positive). The valence in the name _VADER_ is essentially this compound score, which represents the overall sentiment of the text.

In [66]:
# !pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

fold_features_sentiment = {}
sentiment_analyzer = SentimentIntensityAnalyzer()

for i, fold in enumerate(folds):
    fold_sentiment_scores = {}
    for participant_id, transcript in fold.items():
        sentiment = sentiment_analyzer.polarity_scores(transcript)
        fold_sentiment_scores[participant_id] = sentiment

    # Print sentiment scores for each participant
    # for participant_id, scores in fold_sentiment_scores.items():
    #     print(f"Participant ID: {participant_id}")
    #     print(f"Sentiment Scores: {scores}")

    # Save sentiment scores to a dictionary or file
    fold_features_sentiment[f"Fold_{i+1}"] = fold_sentiment_scores