In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
import sys
import time

import pandas as pd
import numpy as np

## Part a: Extracting Language Features

In [2]:
# the interview data (sheet 1)
interview_data = pd.read_excel(
    "../data/DAIC_demographic_data.xlsx",
    sheet_name="Interview_Data",
    skiprows=lambda x: x == 1,
)
# drop the rows where data in column Partic# is NaN
interview_data = interview_data.dropna(subset=["Partic#"])
print(interview_data.tail())

# the phq score data (sheet 2)
phq_scores_data = pd.read_excel(
    "../data/DAIC_demographic_data.xlsx", sheet_name="Metadata_mapping"
)

#Convert Participant_ID column to string
phq_scores_data['Participant_ID'] = phq_scores_data['Participant_ID'].astype(str)

#Convert PHQ_Score to int
phq_scores_data['PHQ_Score'] = phq_scores_data['PHQ_Score'].astype(int)

print(phq_scores_data.head())

     Partic# Condition  gender  race
428    837.0        AI     2.0     3
429    838.0        AI     1.0     1
430    839.0        AI     1.0     1
431    840.0        AI     2.0     3
432    841.0        AI     1.0     4
  Participant_ID  PHQ_Score
0            300          2
1            301          3
2            302          4
3            303          0
4            304          6


The following step was a workaround to prevent the end of file error due to unclosed inverted commas. What the code is doing - opening and reading each file, reading each line and checking for lines that start with " but does not end with " (i.e., checking for unclosed quotes).

In [3]:
# preprocessing the csv files to remove the unclosed inverted commas
def clean_csv(file_path, output_path):
    with open(file_path, "r") as infile, open(output_path, "w") as outfile:
        for line in infile:
            if line.startswith('"') and not line.endswith('"\n'):
                line = line[1:]
            outfile.write(line)


def process_directory(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if filename.endswith(".csv"):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            clean_csv(input_path, output_path)


input_dir = "../data/E-DAIC_Transcripts"
output_dir = "../data/E-DAIC_Transcripts_cleaned"

process_directory(input_dir, output_dir)

In [4]:
cleaned_transcripts_path = "../data/E-DAIC_Transcripts_cleaned"

transcripts = {}

# loop through each file in the folder, load it, and store the content
for filename in os.listdir(cleaned_transcripts_path):
    if filename.endswith("_Transcript.csv"):
        participant_id = filename.split("_")[0]  # extract the participant id
        file_path = os.path.join(cleaned_transcripts_path, filename)

        df = pd.read_csv(file_path)
        transcripts[participant_id] = df

# transcripts
transcripts["386"]

Unnamed: 0,Text
0,might have pulled something that
1,I'm going to bring the great thanks so much
2,and please
3,are you okay with this yes
4,oh I'm fine I'm a little tired but I found ou...
...,...
76,yeah well after college people usually many p...
77,thank you goodbye
78,oh that was that was fast
79,but I didn't never said there wasn't any like...


In [5]:
# combine all the text data for each participant into a single string
combined_transcripts = {}

for participant_id, transcript in transcripts.items():
    combined_transcripts[participant_id] = " ".join(transcript["Text"].astype(str))

# removing extra spaces caused by newlines
for participant_id, transcript in combined_transcripts.items():
    combined_transcripts[participant_id] = " ".join(transcript.split())

# combined_transcripts["386"]

In [6]:
# combine the demographic data with the extracted text data

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# create the feature extractor objects here; using the base settings for now
# TODO: make changes to the parameters to preprocess the text data

corpus = list(combined_transcripts.values())
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(corpus)

count_vectorizer = CountVectorizer()
count_vectorizer.fit(corpus)

sentiment_analyzer = SentimentIntensityAnalyzer()

combined_data = []

# convert the participant id in demographic data to int for consistency
interview_data["Partic#"] = interview_data["Partic#"].astype(int)

for _, row in interview_data.iterrows():
    participant_id = str(row["Partic#"])  # convert id to match the transcript ids

    # find langauge features for this participant
    if participant_id in combined_transcripts:
        transcript = combined_transcripts[participant_id]

        # tfidf features
        tfidf_features = (
            tfidf_vectorizer.transform([transcript]).toarray().flatten()
        )  # has to be 1D array

        # count features
        count_features = count_vectorizer.transform([transcript]).toarray().flatten()

        # sentiment features
        sentiment_compound_scores = sentiment_analyzer.polarity_scores(transcript)[
            "compound"
        ]  # only extracting the compound score

        # Extracting the PHQ score
        phq_score_row = phq_scores_data[phq_scores_data["Participant_ID"] == participant_id]
        if not phq_score_row.empty:
            phq_score = phq_score_row["PHQ_Score"].values[0]
        else:
            phq_score = -1
            continue  # skip this participant if PHQ score is not found

        # combine all the features
        data = {
            "participant_id": participant_id,
            "condition": row["Condition"],
            "race": row["race"],
            "PHQ_score": phq_score,
            "gender": row["gender"],
            "tfidf_features": tfidf_features,
            "count_features": count_features,
            "overall_sentiment": sentiment_compound_scores
        }

        combined_data.append(data)

# convert the combined data into a dataframe
combined_data_df = pd.DataFrame(combined_data)

# print(combined_data_df.head())

In [7]:
combined_data_df.head()

Unnamed: 0,participant_id,condition,race,PHQ_score,gender,tfidf_features,count_features,overall_sentiment
0,386,WoZ,3,11,2.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0085443313426...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9999
1,387,WoZ,1,2,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0253774451976...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9996
2,388,WoZ,4,17,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0292778521617...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9953
3,389,WoZ,1,14,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0487055979434...","[0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, ...",0.9822
4,390,WoZ,3,9,1.0,"[0.03182282795453763, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9996


Some useful information about the resulting dataframe - 

- Each term in the `TF-IDF` vector is considered a feature. The values represent the `TF-IDF` score for that term. A higher score could mean that the term is important to the transcript but no common in the entire list of transcripts. This is helpful in identifying the transcript's topic/sentiment.
- Count features are straightforward. The value of a feature is the raw count of how many times the term appears in the transcript.
- The compound score is the overall sentiment of the transcript. Its value ranges from $-1$ to $1$ where positive sentiments have a compound score of $\geq 0.05$, neutral sentiment have a score between $-0.05$ and $0.05$, and negative sentiments have a compound score of $\leq -0.05$.

## Part b: Classifying for gender

In [8]:
# drop race, condition, phq_score, and participant_id
genderDF = combined_data_df.drop(["race", "condition", "participant_id", "PHQ_score"], axis=1)

# drop any row with NaN
genderDF = genderDF.dropna()

# map gender from [1,2] -> [0,1] (XGBoost needs the labels to be 0 or 1)
genderDF["gender"] = genderDF["gender"].map({1: 0, 2: 1})

# extract the target values (gender)
y = genderDF.pop("gender")

In [9]:
# XGBoost cannot accept multi-dimensional features so each list element must live in its own column

# Expand lists into separate columns
tfidf_features = pd.DataFrame(genderDF['tfidf_features'].tolist(), index=genderDF.index)
tfidf_features.columns = [f'tfidf_features{i}' for i in range(tfidf_features.shape[1])]

# Expand lists into separate columns
count_features = pd.DataFrame(genderDF['count_features'].tolist(), index=genderDF.index)
count_features.columns = [f'count_features{i}' for i in range(count_features.shape[1])]

# recreate gender dataframe
genderDF = pd.concat([tfidf_features, count_features], axis=1)

genderDF.shape

(132, 17404)

In [76]:
# due to the flattening of the arrays, we not how 17404 features which is too much
# therefore, we are going to use PCA to reduce our input dimensionality

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Scale data before applying PCA
scaling=StandardScaler()
 
# Use fit and transform method 
scaling.fit(genderDF)
Scaled_data=scaling.transform(genderDF)

In [77]:
# helper function to calculate accuracy and balanced accuracy
def getAccAndBAcc(yPred, yTrue):

    truePos = 0
    trueNeg = 0
    falsePos = 0
    falseNeg = 0

    for idx in range(len(yPred)):
        
        if yPred[idx] == 1:

            if yTrue[idx] == 1:

                truePos += 1

            else:

                falseNeg += 1

        else:

            if yTrue[idx] == 1:

                falsePos += 1

            else:

                trueNeg += 1

    if (trueNeg+falsePos) == 0 and (truePos+falseNeg) != 0:
        balancedAccuracy = 0.5*truePos/(truePos+falseNeg)
    elif (trueNeg+falsePos) != 0 and (truePos+falseNeg) == 0:
        balancedAccuracy = 0.5*trueNeg/(trueNeg+falsePos)
    else:
        balancedAccuracy = 0.5*trueNeg/(trueNeg+falsePos) + 0.5*truePos/(truePos+falseNeg)
    
    accuracy = (truePos + trueNeg) / (truePos + trueNeg + falsePos + falseNeg)

    return accuracy, balancedAccuracy

In [78]:
# create deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def getDLModel(inputShape):

    model = Sequential([
        Dense(128, activation='relu', input_shape=(inputShape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer='adam',
        loss = 'binary_crossentropy',
        metrics = ["accuracy"]
    )

    return model

In [79]:
from sklearn.model_selection import train_test_split
#!pip install xgboost
import xgboost as xgb

# minimum covariance for feature to be included
pcaCovarianceFloor = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

bAccTree = []
accTree  = []

bAccDL = []
accDL  = []

numberOfComponents = []

for pcaFloor in pcaCovarianceFloor:

    # run PCA with given number of components
    principal=PCA(n_components=pcaFloor)
    principal.fit(Scaled_data)
    x=principal.transform(Scaled_data)

    # get the reduced dataset
    reducedGenderData = principal.transform(Scaled_data)
    numberOfComponents.append(reducedGenderData.shape[1])
    reducedGenderDF = pd.DataFrame(reducedGenderData, columns=[f"PC{i}" for i in range(1, numberOfComponents[-1] + 1)])

    # split the dataset into testing and training splits
    # since the dataset is so small, we want to make sure 
    # there is at least 10 of each class in test split
    xTrain, xTest, yTrain, yTest = train_test_split(reducedGenderDF, y, test_size=0.2, random_state=76)

    # train XGBoost model
    xgb_classifier = xgb.XGBClassifier()
    xgb_classifier.fit(xTrain, yTrain)

    # get test accuracy and balanced accuracy
    acc, bAcc = getAccAndBAcc(xgb_classifier.predict(xTest), yTest.to_numpy())

    # store accuracies
    accTree.append(acc)
    bAccTree.append(bAcc)

    # train DL model
    FNN = getDLModel(numberOfComponents[-1])
    FNN.fit(xTrain, yTrain, epochs=100, verbose=0)

    # get test accuracy and balanced accuracy
    acc, bAcc = getAccAndBAcc(FNN.predict(xTest), yTest.to_numpy())

    # store accuracies
    accDL.append(acc)
    bAccDL.append(bAcc)

print("\n")
for idx in range(len(pcaCovarianceFloor)):

    print("#######################################")
    print(f"Values for Cov. Floor: {pcaCovarianceFloor[idx]}, Components: {numberOfComponents[idx]}:")
    print(f"Tree accuracy: {accTree[idx]}")
    print(f"Tree balanced accuracy: {bAccTree[idx]}")
    print(f"DL accuracy: {accDL[idx]}")
    print(f"DL balanced accuracy: {bAccDL[idx]}")


maxTreeBAccPCACovFloor = pcaCovarianceFloor[np.argmax(bAccTree)]
maxDLBAccPCACovFloor = pcaCovarianceFloor[np.argmax(bAccDL)]

print("\n")
print("#######################################")
print(f"Max Tree Covariance Floor: {maxTreeBAccPCACovFloor}, with balanced accuracy of {np.max(bAccTree) * 100}%")
print(f"Max DL Covariance Floor: {maxDLBAccPCACovFloor}, with balanced accuracy of {np.max(bAccDL) * 100}%")
print("#######################################")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step


#######################################
Values for Cov. Floor: 0.1, Components: 6:
Tree accuracy: 0.5555555555555556
Tree balanced accuracy: 0.5521978021978022
DL accuracy: 0.6666666666666666
DL balanced accuracy: 0.7010869565217391
#######################################
Values for Cov. Floor: 0.2, Components: 13:
Tree accuracy: 0.6296296296296297

# part c

In [80]:
# imports
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization, Attention, Flatten, Input
from keras.optimizers import SGD
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [81]:
def create_df(input_df, col_to_drop, y_col):
    new_df = input_df.drop(col_to_drop, axis=1)
    new_df = new_df.dropna()
    y_new = new_df.pop(y_col)
    tfidf_features = pd.DataFrame(new_df['tfidf_features'].tolist(), index=new_df.index)
    tfidf_features.columns = [f'tfidf_features{i}' for i in range(tfidf_features.shape[1])]

    count_features = pd.DataFrame(new_df['count_features'].tolist(), index=new_df.index)
    count_features.columns = [f'count_features{i}' for i in range(count_features.shape[1])]
    
    new_df = pd.concat([tfidf_features, count_features], axis=1)
    return new_df, y_new

In [None]:
race_df, y_race = create_df(combined_data_df, ["gender", "condition", "participant_id", "PHQ_score"], "race")
race_df.shape, y_race.shape

((134, 17404), (134,))

In [83]:
# xgboost needs the range to start at 0
y_race = y_race - 1
y_race

0      2
1      0
2      3
3      0
4      2
      ..
129    4
130    2
131    0
132    2
133    2
Name: race, Length: 134, dtype: int64

In [84]:
def scale_data(input_df):
    scaling=StandardScaler()
    scaling.fit(input_df)
    Scaled_data=scaling.transform(input_df)
    return Scaled_data

In [85]:
def get_dense_model(input_shape):
    
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        BatchNormalization(),
        Dropout(0.3),
    
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
    
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
    
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
    
        Dense(8, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
    
        Dense(1, activation="sigmoid")
    ])
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
scaled_data = scale_data(race_df)

pcaCovarianceFloor = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

bA_dense, ba_xgb = [], []
a_dense, a_xgb = [], []

num_comp = []

for pcaFloor in pcaCovarianceFloor:
    
    principal=PCA(n_components=pcaFloor)
    principal.fit(scaled_data)
    x=principal.transform(scaled_data)
    
    reduced_race_df = principal.transform(scaled_data)
    num_comp.append(reduced_race_df.shape[1])
    reduced_race_df = pd.DataFrame(reduced_race_df, columns=[f"PC{i}" for i in range(1, num_comp[-1] + 1)])
    
    x_train, x_test, y_train, y_test = train_test_split(reduced_race_df, y_race, test_size=0.2, random_state=76)

    attention_model = get_dense_model(num_comp[-1])
    attention_model.fit(x_train, y_train, epochs=100, verbose=0)

    acc, bAcc = getAccAndBAcc(attention_model.predict(x_test), y_test.to_numpy())

    a_dense.append(acc)
    bA_dense.append(bAcc)
    
    xgb_classifier = xgb.XGBClassifier()
    xgb_classifier.fit(x_train, y_train)

    acc, bAcc = getAccAndBAcc(xgb_classifier.predict(x_test), y_test.to_numpy())

    
    a_xgb.append(acc)
    ba_xgb.append(bAcc)
    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[2 2 3 2 2 0 2 6 0 6 0 2 0 2 0 2 2 2 2 2 3 2 0 2 2 3 2]
[0.58213997 0.9465875  0.9135452  0.9200443  0.97254556 0.88910455
 0.93198967 0.98857963 0.9179726  0.9168656  0.95342916 0.89318293
 0.02660007 0.91617525 0.95555437 0.97454137 0.5222763  0.9778992
 0.983741   0.9231853  0.96315026 0.9467469  0.8984548  0.88574713
 0.9868848  0.97462684 0.30373526]
[0 2 2 0 0 2 2 0 0 0 0 3 1 2 3 2 0 2 3 3 2 2 2 1 0 1 0]


In [90]:
print("\n")
for idx in range(len(pcaCovarianceFloor)):

    print("#######################################")
    print(f"Values for Cov. Floor: {pcaCovarianceFloor[idx]}, Components: {num_comp[idx]}:")
    print(f"Tree accuracy: {a_xgb[idx]}")
    print(f"Tree balanced accuracy: {ba_xgb[idx]}")
    print(f"DL accuracy: {a_dense[idx]}")
    print(f"DL balanced accuracy: {bA_dense[idx]}")


maxTreeBAccPCACovFloor = pcaCovarianceFloor[np.argmax(ba_xgb)]
maxDLBAccPCACovFloor = pcaCovarianceFloor[np.argmax(bA_dense)]

print("\n")
print("#######################################")
print(f"Max Tree Covariance Floor: {maxTreeBAccPCACovFloor}, with balanced accuracy of {np.max(ba_xgb) * 100}%")
print(f"Max DL Covariance Floor: {maxDLBAccPCACovFloor}, with balanced accuracy of {np.max(bA_dense) * 100}%")
print("#######################################")



#######################################
Values for Cov. Floor: 0.1, Components: 6:
Tree accuracy: 0.8888888888888888
Tree balanced accuracy: 0.4444444444444444
DL accuracy: 0.8888888888888888
DL balanced accuracy: 0.4444444444444444
#######################################
Values for Cov. Floor: 0.2, Components: 13:
Tree accuracy: 0.8888888888888888
Tree balanced accuracy: 0.4444444444444444
DL accuracy: 0.8888888888888888
DL balanced accuracy: 0.4444444444444444
#######################################
Values for Cov. Floor: 0.3, Components: 22:
Tree accuracy: 0.8888888888888888
Tree balanced accuracy: 0.4444444444444444
DL accuracy: 0.8888888888888888
DL balanced accuracy: 0.4444444444444444
#######################################
Values for Cov. Floor: 0.4, Components: 32:
Tree accuracy: 0.8888888888888888
Tree balanced accuracy: 0.4444444444444444
DL accuracy: 0.8888888888888888
DL balanced accuracy: 0.4444444444444444
#######################################
Values for Cov. Floor: 

# part d

In [111]:
def absRelErr(yPred, yTrue):
    return np.mean(np.abs(yPred - yTrue) / np.max(yTrue))



def pearsonCorr(yPred, yTrue):
    return np.corrcoef(yPred, yTrue)[0, 1]


In [114]:


def getDLModelPHQ(inputShape):

    model = Sequential([
        Dense(128, activation='relu', input_shape=(inputShape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation="linear")
    ])

    model.compile(
        optimizer='adam',
        loss = 'binary_crossentropy',
        metrics = ["accuracy"]
    )

    return model

In [None]:
phq_df, y_phq = create_df(combined_data_df, ["gender", "condition", "participant_id", "race"], "PHQ_score")

scaled_data = scale_data(phq_df)

pcaCovarianceFloor = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
pcaCovarianceFloor = [0.5]

re_dense, re_xgb = [], []
r_dense, r_xgb = [], []

num_comp = []

for pcaFloor in pcaCovarianceFloor:

    principal=PCA(n_components=pcaFloor)
    principal.fit(scaled_data)
    x=principal.transform(scaled_data)
    
    reduced_phq_df = principal.transform(scaled_data)
    num_comp.append(reduced_phq_df.shape[1])
    reduced_phq_df = pd.DataFrame(reduced_phq_df, columns=[f"PC{i}" for i in range(1, num_comp[-1] + 1)])
    
    x_train, x_test, y_train, y_test = train_test_split(reduced_phq_df, y_phq, test_size=0.2, random_state=76)

    attention_model = getDLModelPHQ(num_comp[-1])
    attention_model.fit(x_train, y_train, epochs=100, verbose=0)

    relErr = absRelErr(attention_model.predict(x_test), y_test.to_numpy())
    re_dense.append(relErr)
    
    xgb_regressor = xgb.XGBRegressor()
    xgb_regressor.fit(x_train, y_train)

    relErr = absRelErr(xgb_regressor.predict(x_test), y_test.to_numpy())
    re_xgb.append(relErr)

    pearson = pearsonCorr(xgb_regressor.predict(x_test), y_test.to_numpy())
    r_xgb.append(pearson)

    pearson = pearsonCorr(attention_model.predict(x_test).flatten(), y_test.to_numpy())
    r_dense.append(pearson)

    

    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
DL: [13.337463   2.258594   4.4206686  4.718963  12.492231   5.1419744
 12.062782  12.091892   3.3506124  5.1999702  3.191227   5.011941
 13.5868845  4.8835373  9.297262  12.555527  15.228771   4.727675
 13.370886   2.4170954  2.0760539  3.0328467  4.8470025  3.6417987
  4.6540422  6.291273  16.052652 ]
XGB: [ 5.090878   4.273607   4.4827123  4.713615  12.078203  12.830768
  5.066999  12.960701   6.7639174  4.727186   7.1131506  7.7622232
  7.646736   6.0238395  9.4570875  7.1776958  4.96517    6.283907
  5.917521   4.4232993  1.436622   6.660711   3.411708   6.949372
  9.9961195  4.734971   4.9377565]
True: [ 1 10  0  1  6  1  1  8  2  2  0  9  0  0  3 15  2  1  9 16  4  0  4  0
  3  1  2]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [118]:
print("\n")

for idx in range(len(pcaCovarianceFloor)):
    print("#######################################")
    print(f"Values for Cov. Floor: {pcaCovarianceFloor[idx]}, Components: {num_comp[idx]}:")
    print(f"Tree Abs. Rel. Err.: {re_xgb[idx]}")
    print(f"Tree Pearson Corr.: {r_xgb[idx]}")
    print(f"DL Abs. Rel. Err.: {re_dense[idx]}")
    print(f"DL Pearson Corr.: {r_dense[idx]}")

maxTreeREPCACovFloor = pcaCovarianceFloor[np.argmin(re_xgb)]
maxDLREPCACovFloor = pcaCovarianceFloor[np.argmin(re_dense)]

print("\n")
print("#######################################")
print(f"Min Tree Covariance Floor: {maxTreeREPCACovFloor}, with absolute relative error of {np.min(re_xgb)}")
print(f"Min DL Covariance Floor: {maxDLREPCACovFloor}, with absolute relative error of {np.min(re_dense)}")
print("#######################################")



#######################################
Values for Cov. Floor: 0.1, Components: 6:
Tree Abs. Rel. Err.: 0.33117100072127803
Tree Pearson Corr.: -0.10339518775899634
DL Abs. Rel. Err.: 0.25367001057774935
DL Pearson Corr.: 0.183301031282902
#######################################
Values for Cov. Floor: 0.2, Components: 13:
Tree Abs. Rel. Err.: 0.2908648470485652
Tree Pearson Corr.: 0.02956304302284035
DL Abs. Rel. Err.: 0.9675216216527549
DL Pearson Corr.: -0.005082451557327767
#######################################
Values for Cov. Floor: 0.3, Components: 22:
Tree Abs. Rel. Err.: 0.30529405176639557
Tree Pearson Corr.: -0.15847892585774917
DL Abs. Rel. Err.: 0.6489768052276924
DL Pearson Corr.: -0.0467748554974345
#######################################
Values for Cov. Floor: 0.4, Components: 32:
Tree Abs. Rel. Err.: 0.29765791901283795
Tree Pearson Corr.: 0.13526043382162525
DL Abs. Rel. Err.: 0.5747569360769007
DL Pearson Corr.: -0.023756786148974054
###############################

## Part f using transformers


In [17]:
from sklearn.model_selection import train_test_split
phq_scores_data.columns

Index(['Participant_ID', 'PHQ_Score'], dtype='object')

In [11]:
transcripts_df = pd.DataFrame(columns=["Participant_ID", "transcript"])
for id, transcript in combined_transcripts.items():
    temp = pd.DataFrame({"Participant_ID": [id], "transcript": transcript})
    transcripts_df = pd.concat([transcripts_df, temp], ignore_index=True)

In [12]:
transcripts_df = pd.merge(transcripts_df, phq_scores_data, on="Participant_ID", how="inner")
transcripts_df

Unnamed: 0,Participant_ID,transcript,PHQ_Score
0,423,okay and please yes feeling well where are you...,0
1,436,that was big yes I'm doing fine Mexico when I ...,0
2,447,yeah that's perfectly fine I'm feeling great i...,1
3,452,official say bye and then you're going to pres...,1
4,396,okay thank you thanks for coming in today crea...,5
...,...,...,...
129,466,I was that man when do I talk okay okay okay y...,9
130,633,and she's a speech recognition so like recogni...,10
131,484,okay yes I'm doing well I was born in Oakland ...,9
132,491,do I still have to put my cat okay yes how ove...,8


In [13]:
transcripts_df.columns

Index(['Participant_ID', 'transcript', 'PHQ_Score'], dtype='object')

In [14]:
prompt = f"""
    Transcript= {transcripts_df["transcript"][0]} : Depression Severity=0

    Transcript= {transcripts_df["transcript"][1]} : Depression Severity=0

    Transcript= {transcripts_df["transcript"][10]} : Depression Severity=19

    Transcript= {transcripts_df["transcript"][14]} : Depression Severity=7

    Give one score for all transcripts, only respond with the score.
"""

In [15]:
prompt

"\n    Transcript= okay and please yes feeling well where are you from originally Los Angeles the Greater Los Angeles area people diversity and various entertainment and activities fickle weather traffic and litter do you travel I have travel domestically not internationally seeing other places and how people live in the culture I want to hear about one of your trips the last trip that I went to a San Diego butt out from childhood or as an adult family vacations to Baja in Rosarito Beach can you tell me yeah the reason why that memory comes to mind because this weekend is Easter and we usually will go down to East Easter spring break to Rosarito Baja am I have a Bachelors in communication studies because of the open field just doesn't deal with them communication interactively but Performing Arts telecommunications business entrepreneurship so no I'm not at Maxey unemployed what's your dream job I think 200 my business that's my dream my dream job to have my own company I would like to

In [16]:
y_transcripts = transcripts_df.pop('PHQ_Score')
y_transcripts

0       0
1       0
2       1
3       1
4       5
       ..
129     9
130    10
131     9
132     8
133     0
Name: PHQ_Score, Length: 134, dtype: int64

In [18]:
import sys
sys.path.append('/Users/calebkumar/Desktop/Desktop_Calebs_Pro/coding/ml/Mitigating-socio-demographic-bias/code/minGPT')
from mingpt.model import GPT

In [19]:
x_train, x_test, y_train, y_test = train_test_split(transcripts_df, y_transcripts, test_size=0.2, random_state=76)

In [20]:
y_train

29     12
22      3
10     19
73     14
133     0
       ..
79      4
129     9
42      5
26      1
128     0
Name: PHQ_Score, Length: 107, dtype: int64

In [21]:
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [22]:
training_dataset = pd.concat([x_train, y_train], axis=1)
training_dataset

Unnamed: 0,Participant_ID,transcript,PHQ_Score
0,677,I'm going to bring up over to Human you can't ...,12
1,430,looks like we may have okay I will step out wh...,3
2,440,when it's always female yes I'm okay thank you...,19
3,389,and please are you okay sure I'm okay small to...,14
4,657,go ahead and start a camcorder recording and t...,0
...,...,...,...
102,486,already okay great yes I'm feel great I am fro...,4
103,466,I was that man when do I talk okay okay okay y...,9
104,697,okie dokie yeah okay here we go I'm not a ther...,5
105,454,hi I'm Ali thanks for coming in today yes grea...,1


In [23]:
training_dataset.to_csv("./training_dataset.csv", index=False)

In [24]:
def prepare_gpt_dataset(dataset):
    formatted_data = []
    for _, row in dataset.iterrows():
        input_text = f"Transcript: {row['transcript']} PHQ_Score:"
        target_text = f" {row['PHQ_Score']}"
        formatted_data.append({"input": input_text, "target": target_text})
    return formatted_data

formatted_dataset = prepare_gpt_dataset(training_dataset)

# Display the first few formatted examples
formatted_dataset[:5]

[{'input': "Transcript: I'm going to bring up over to Human you can't hi I'm Ellie I'm not a therapist but I'm here and please yes I'm doing fine and yourself where are you from originally I was born in gone for so long I've been down to Georgia for a while but my family is all it from here so that's why I'm here the rudeness of the people I see what you mean do you travel a lot been all over the world driving a funny car down a drag strip and 200 miles an hour what you study basic automotive stuff and I worked at the drag strip already go to school so I got to know quite a few people what are you doing now right now I'm on workman's comp I work for Amvets and I got hurt at work and I'm on workman's comp right now okay what's your dream job I really don't have one at this time pretty much outgoing I'm able to talk to anybody had any time no matter what race they are just I'm one of them lucky persons that can talk to anybody I'll let them do their own thing I just lost my wife three ye

In [25]:
formatted_training_dataset = pd.DataFrame(formatted_dataset)
formatted_training_dataset

Unnamed: 0,input,target
0,Transcript: I'm going to bring up over to Huma...,12
1,Transcript: looks like we may have okay I will...,3
2,Transcript: when it's always female yes I'm ok...,19
3,Transcript: and please are you okay sure I'm o...,14
4,Transcript: go ahead and start a camcorder rec...,0
...,...,...
102,Transcript: already okay great yes I'm feel gr...,4
103,Transcript: I was that man when do I talk okay...,9
104,Transcript: okie dokie yeah okay here we go I'...,5
105,Transcript: hi I'm Ali thanks for coming in to...,1


In [51]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from mingpt.model import GPT
from mingpt.bpe import BPETokenizer
from mingpt.utils import set_seed

# Set up seed for reproducibility
set_seed(3407)

# Define whether to use minGPT or Hugging Face's transformers
use_mingpt = True  # Toggle between minGPT and Hugging Face
model_type = 'gpt2'  # Choose a GPT model size
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

# Load the model

# model = GPT.from_pretrained(model_type)
# tokenizer = BPETokenizer()  # minGPT tokenizer
model = GPT2LMHeadModel.from_pretrained(model_type)
tokenizer = GPT2Tokenizer.from_pretrained(model_type)
model.config.pad_token_id = model.config.eos_token_id
# Suppress warnings

# Move model to device and set it to evaluation mode
model.to(device)
model.eval()


ImportError: cannot import name 'GPTConfig' from 'mingpt.model' (/Users/calebkumar/Desktop/Desktop_Calebs_Pro/coding/ml/Mitigating-socio-demographic-bias/code/minGPT/mingpt/model.py)

In [48]:

def generate(prompt='', num_samples=10, steps=20, do_sample=True):

    # tokenize the input prompt into integer input sequence
    if use_mingpt:
        tokenizer = BPETokenizer()
        if prompt == '':
            # to create unconditional samples...
            # manually create a tensor with only the special <|endoftext|> token
            # similar to what openai's code does here https://github.com/openai/gpt-2/blob/master/src/generate_unconditional_samples.py
            x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
        else:
            x = tokenizer(prompt).to(device)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(model_type)
        if prompt == '':
            # to create unconditional samples...
            # huggingface/transformers tokenizer special cases these strings
            prompt = '<|endoftext|>'
        encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
        x = encoded_input['input_ids']

    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)

    # forward the model `steps` times to get samples, in a batch
    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)

    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('-'*80)
        print(out)


In [52]:
task_prompt = """
Below are examples of transcripts and their corresponding PHQ scores:

Transcript: "I feel hopeless and have no energy." PHQ Score: 15
Transcript: "I'm managing okay, but sometimes I feel a little anxious." PHQ Score: 7
Transcript: "I can't stop crying and don't want to get out of bed." PHQ Score: 18

Now, classify the following transcript:
Transcript: "I’ve been having trouble sleeping and feeling down." Give me your quess PHQ Score:
"""

# Generate prediction
predictions = generate(prompt="Hello", num_samples=1, steps=50)