In [None]:
import warnings

warnings.filterwarnings("ignore")

import os
import sys
import time

import pandas as pd
import numpy as np

## Part a: Extracting Language Features

In [16]:
# Demographic / interview sheet (sheet 1): the row at 0-indexed position 1 is
# skipped while reading, and rows without a participant number are discarded.
interview_data = pd.read_excel(
    "../data/DAIC_demographic_data.xlsx",
    sheet_name="Interview_Data",
    skiprows=lambda row_idx: row_idx == 1,
).dropna(subset=["Partic#"])
print(interview_data.tail())

# PHQ score sheet (sheet 2) from the same workbook
phq_score = pd.read_excel(
    "../data/DAIC_demographic_data.xlsx",
    sheet_name="Metadata_mapping",
)

print(phq_score.head())

     Partic# Condition  gender  race
428    837.0        AI     2.0     3
429    838.0        AI     1.0     1
430    839.0        AI     1.0     1
431    840.0        AI     2.0     3
432    841.0        AI     1.0     4
   Participant_ID  PHQ_Score
0             300          2
1             301          3
2             302          4
3             303          0
4             304          6


The following step is a workaround for the end-of-file (EOF) parsing error caused by unclosed double quotes in the transcript CSVs. The code opens each file and reads it line by line; whenever a line starts with `"` but does not end with `"` (i.e., the quote is never closed), the leading `"` is removed before the line is written to a cleaned copy of the file.

In [3]:
# preprocessing the csv files to remove the unclosed inverted commas
def clean_csv(file_path, output_path):
    """Copy a CSV file, dropping the opening quote of lines whose quote is never closed.

    A line counts as having an unclosed quote when it starts with ``"`` but
    its content (ignoring the trailing line break) does not end with ``"``.
    For such lines only the first character is removed; every other line is
    written unchanged.

    Args:
        file_path: Path of the CSV file to read.
        output_path: Path the cleaned copy is written to (opened in ``"w"``
            mode, so an existing file is overwritten).
    """
    with open(file_path, "r") as infile, open(output_path, "w") as outfile:
        for line in infile:
            # Compare against the content without its line terminator so a
            # properly quoted final line that lacks a trailing newline is
            # not mistaken for an unclosed quote.
            content = line.rstrip("\r\n")
            if content.startswith('"') and not content.endswith('"'):
                line = line[1:]
            outfile.write(line)


def process_directory(input_dir, output_dir):
    """Run ``clean_csv`` on every ``.csv`` file found directly in a directory.

    Each cleaned file is written to ``output_dir`` under the same file name.
    Entries whose name does not end with ``.csv`` are ignored.

    Args:
        input_dir: Directory containing the original CSV files.
        output_dir: Directory that receives the cleaned copies; it is
            created (including missing parents) when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".csv"):
            continue
        clean_csv(
            os.path.join(input_dir, filename),
            os.path.join(output_dir, filename),
        )


# source folder with the raw transcripts and target folder for the cleaned copies
input_dir = "../data/E-DAIC_Transcripts"
output_dir = "../data/E-DAIC_Transcripts_cleaned"

# write a cleaned copy of every transcript csv into output_dir
process_directory(input_dir=input_dir, output_dir=output_dir)

In [4]:
cleaned_transcripts_path = "../data/E-DAIC_Transcripts_cleaned"

# participant id (str, taken from the file name) -> transcript DataFrame
transcripts = {}

# os.listdir returns entries in arbitrary order; sorting keeps the insertion
# order of `transcripts` (and of anything derived from it) reproducible
for filename in sorted(os.listdir(cleaned_transcripts_path)):
    if not filename.endswith("_Transcript.csv"):
        continue

    # the participant id is the part of the file name before the first "_"
    participant_id = filename.split("_")[0]
    file_path = os.path.join(cleaned_transcripts_path, filename)

    df = pd.read_csv(file_path)
    transcripts[participant_id] = df

# preview a single participant's transcript
transcripts["386"]

Unnamed: 0,Text
0,might have pulled something that
1,I'm going to bring the great thanks so much
2,and please
3,are you okay with this yes
4,oh I'm fine I'm a little tired but I found ou...
...,...
76,yeah well after college people usually many p...
77,thank you goodbye
78,oh that was that was fast
79,but I didn't never said there wasn't any like...


In [7]:
def combine_utterances(transcript):
    """Join all rows of a transcript's ``Text`` column into one string.

    Missing values are dropped first: ``astype(str)`` would otherwise turn
    them into the literal word ``"nan"``, which would end up in the corpus.
    Runs of whitespace (including embedded newlines) are collapsed into
    single spaces.

    Args:
        transcript: DataFrame with a ``Text`` column.

    Returns:
        The combined text as a single string.
    """
    utterances = transcript["Text"].dropna().astype(str)
    # split()/join normalises every whitespace run to exactly one space
    return " ".join(" ".join(utterances).split())


# participant id -> full transcript text as a single string
combined_transcripts = {
    participant_id: combine_utterances(transcript)
    for participant_id, transcript in transcripts.items()
}

In [21]:
# combine the demographic data with the extracted text data

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# create the feature extractor objects here; using the base settings for now
# TODO: make changes to the parameters to preprocess the text data

# both vectorizers are fitted on every available transcript
corpus = list(combined_transcripts.values())
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(corpus)

count_vectorizer = CountVectorizer()
count_vectorizer.fit(corpus)

sentiment_analyzer = SentimentIntensityAnalyzer()

# convert the participant id in demographic data to int for consistency
interview_data["Partic#"] = interview_data["Partic#"].astype(int)

combined_data = []

# to_dict("records") keeps each column's own value type, so the integer id is
# always formatted like "386". iterrows() builds one Series per row and does
# not preserve dtypes across the row, so it only works while the row happens
# to be object-typed (an all-numeric row would yield "386.0").
for row in interview_data.to_dict("records"):
    participant_id = str(row["Partic#"])  # transcript ids are strings

    # participants without a transcript are skipped
    if participant_id not in combined_transcripts:
        continue

    transcript = combined_transcripts[participant_id]

    # transform() is given a one-document list; flatten the result to 1D
    tfidf_features = tfidf_vectorizer.transform([transcript]).toarray().flatten()
    count_features = count_vectorizer.transform([transcript]).toarray().flatten()

    # only the compound score is kept
    # NOTE(review): the compound score is normalised into [-1, 1], so scoring
    # a whole transcript in one call presumably saturates near +/-1 and
    # carries little signal; consider averaging per-utterance scores. Confirm
    # before changing the feature definition.
    sentiment_compound_scores = sentiment_analyzer.polarity_scores(transcript)[
        "compound"
    ]

    # one record per participant: demographics plus language features
    data = {
        "participant_id": participant_id,
        "condition": row["Condition"],
        "race": row["race"],
        "gender": row["gender"],
        "tfidf_features": tfidf_features,
        "count_features": count_features,
        "overall_sentiment": sentiment_compound_scores,
    }
    combined_data.append(data)

# convert the combined data into a dataframe
combined_data_df = pd.DataFrame(combined_data)

In [22]:
# preview the first five rows of the combined feature table
combined_data_df.head(n=5)

Unnamed: 0,participant_id,condition,race,gender,tfidf_features,count_features,overall_sentiment
0,386,WoZ,3,2.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0085443313426...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9999
1,387,WoZ,1,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0253774451976...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9996
2,388,WoZ,4,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0292778521617...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9953
3,389,WoZ,1,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0487055979434...","[0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, ...",0.9822
4,390,WoZ,3,1.0,"[0.03182282795453763, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9996


Some useful information about the resulting dataframe - 

- Each term in the `TF-IDF` vector is considered a feature, and its value is the `TF-IDF` score for that term. A higher score means that the term occurs often in this transcript but is not common across the entire collection of transcripts. This is helpful in identifying the transcript's topic/sentiment.
- Count features are straightforward. The value of a feature is the raw count of how many times the term appears in the transcript.
- The compound score is the overall sentiment of the transcript. Its value ranges from $-1$ to $1$, where positive sentiments have a compound score of $\geq 0.05$, neutral sentiments have a score strictly between $-0.05$ and $0.05$, and negative sentiments have a compound score of $\leq -0.05$.