# Notebook for processing depression labels and features extracted from COVAREP

This notebook processes the data produced by COVAREP feature extraction. First, the questionnaire data and gender of the participants are gathered from three separate files. Then the COVAREP data of all participants are imported (one file per participant), the mean of each feature is calculated for each participant, and a dataframe is created that stores the means of all features for all participants.

In [1]:
import pandas as pd
import numpy as np

### Import and process questionnaire and gender data

In [2]:
# Columns kept from every split file: participant id, the two PHQ8 columns and gender.
colnames = ['Participant_ID', 'PHQ8_Binary', 'PHQ8_Score', 'Gender']

# Train and dev splits are read with their own header row and narrowed to the shared columns.
depression_data_train = pd.read_csv('train_split_Depression_AVEC2017.csv')[colnames]
depression_data_dev = pd.read_csv('dev_split_Depression_AVEC2017.csv')[colnames]

# The test split gets the shared column names imposed on it.
# NOTE(review): with `names=` and no `header=`, pandas treats the first line of the
# file as data; if full_test_split.csv has a header line it ends up as a row of NaN
# after the numeric coercion below -- confirm against the file.
depression_data_test = pd.read_csv('full_test_split.csv', names=colnames)

# Stack the three splits row-wise (order: train, test, dev).
depression_data = pd.concat(
    [depression_data_train, depression_data_test, depression_data_dev],
    axis=0,
)

# Convert every column to a numeric dtype; values that cannot be parsed become NaN.
for numeric_col in colnames:
    depression_data[numeric_col] = pd.to_numeric(depression_data[numeric_col], errors='coerce')

## Import COVAREP datafiles and assign column names 

In [4]:
# Column names for the COVAREP dataframe: 11 named features followed by the
# numbered coefficient families MCEP_0..MCEP_24, HMPDM_0..HMPDM_24 and
# HMPDD_0..HMPDD_12 (74 names in total, in file column order).
column_names = (
    ['F0', 'VUV', 'NAQ', 'QOQ', 'H1H2', 'PSP', 'MDQ', 'peakSlope', 'Rd',
     'Rd_conf', 'creak']
    + ['MCEP_{}'.format(i) for i in range(25)]
    + ['HMPDM_{}'.format(i) for i in range(25)]
    + ['HMPDD_{}'.format(i) for i in range(13)]
)

#Function to import COVAREP data from .csv into a dataframe
def import_csv(patientId):
    """Load one participant's COVAREP rows and attach their questionnaire data.

    Reads ``COVAREP/<patientId>_COVAREP.csv`` (no header row; columns are
    named from ``column_names``), keeps only the rows where ``VUV == 1``
    (the rows where the participant speaks) and adds the participant's
    ``PHQ8_Binary``, ``PHQ8_Score`` and ``Gender`` values from
    ``depression_data`` as constant columns.

    Parameters
    ----------
    patientId : int
        Participant id used both in the file name and to look up the
        ``Participant_ID`` column of ``depression_data``.

    Returns
    -------
    pandas.DataFrame
        The filtered COVAREP rows plus the three added columns. The row
        index is the one the rows had in the participant's file.

    Raises
    ------
    IndexError
        If ``depression_data`` contains no row for ``patientId``.
    """
    data = pd.read_csv("COVAREP/{}_COVAREP.csv".format(patientId), names = column_names, header=None)
    # Take an explicit copy of the filtered rows so that adding columns below
    # works on an independent frame and does not raise SettingWithCopyWarning.
    participant_data = data.loc[data['VUV'] == 1].copy()

    # Look the participant up once instead of re-filtering for every column.
    labels = depression_data.loc[depression_data['Participant_ID'] == patientId]
    if labels.empty:
        raise IndexError("no questionnaire data for participant {}".format(patientId))

    # Append questionnaire data and gender to each row of the participant;
    # a scalar assignment is broadcast to every row.
    for label in ('PHQ8_Binary', 'PHQ8_Score', 'Gender'):
        participant_data[label] = labels[label].values[0]
    return participant_data

In [None]:
# Participant ids in the 300-492 range that have no data and are skipped.
missing_participants = {342, 394, 398, 460}

# Collect one dataframe per participant and concatenate once at the end;
# concatenating inside the loop re-copies all previously loaded rows on
# every iteration.
per_participant_frames = []
for patientId in range(300, 493):  # range end is exclusive, so 493 includes participant 492
    if patientId in missing_participants:
        print("no patient") #except patients with no data
    else:
        per_participant_frames.append(import_csv(patientId))

# Dataframe with the COVAREP rows of every participant stacked row-wise.
participants_total_data = pd.concat(per_participant_frames)

participants_total_data


In [7]:
# Export the dataframe with the COVAREP data of all participants to CSV.
# Per the original note, this holds data for each patient every 10 ms.
participants_total_data.to_csv('COVAREP_and_depression_data.csv')

#### Similar to the previous import, but the mean of every feature is calculated for each participant while the .csv data is imported

In [9]:
def import_csv_mean(patientId):
    """Return a one-row dataframe with the feature means of one participant.

    Reads ``COVAREP/<patientId>_COVAREP.csv`` (no header row; columns are
    named from ``column_names``), keeps the rows where ``VUV == 1`` (the
    timesteps where the participant is talking) and averages every column.
    The participant's ``PHQ8_Binary``, ``PHQ8_Score`` and ``Gender`` from
    ``depression_data`` and the ``patientId`` itself are appended as extra
    entries before the result is turned into a single-row dataframe.
    """
    covarep = pd.read_csv("COVAREP/{}_COVAREP.csv".format(patientId), names = column_names, header=None)
    talking = covarep.loc[covarep['VUV'] == 1]
    feature_means = talking.mean()  # Series: one mean per column

    # Questionnaire rows of this participant; the first match is used.
    questionnaire = depression_data.loc[depression_data['Participant_ID'] == patientId]
    for label in ('PHQ8_Binary', 'PHQ8_Score', 'Gender'):
        feature_means[label] = questionnaire[label].values[0]
    feature_means['patientId'] = patientId

    # Series -> single-column frame -> transpose into a single row.
    return feature_means.to_frame().T

# Collect the one-row dataframe of every participant and concatenate once,
# instead of growing the dataframe with a concat on every iteration.
mean_rows = []
for patientId in range(300, 493):
    if patientId in (342, 394, 398, 460):
        print("no patient")  # participants with no data are skipped
    else:
        mean_rows.append(import_csv_mean(patientId))

# Every one-row frame keeps its own index label, so the stacked index is not
# unique; the 'patientId' column identifies the participant of each row.
participants_data_mean = pd.concat(mean_rows)

#export new dataframe with means for each participant (189 rows)
participants_data_mean.to_csv('COVAREP_and_depression_data_mean.csv') 



no patient
no patient
no patient
no patient


In [10]:
# Check the dataframe of per-participant means: print its index range,
# column names, non-null counts and dtypes.
participants_data_mean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189 entries, 0 to 0
Data columns (total 78 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   F0           189 non-null    float64
 1   VUV          189 non-null    float64
 2   NAQ          189 non-null    float64
 3   QOQ          189 non-null    float64
 4   H1H2         189 non-null    float64
 5   PSP          189 non-null    float64
 6   MDQ          189 non-null    float64
 7   peakSlope    189 non-null    float64
 8   Rd           189 non-null    float64
 9   Rd_conf      189 non-null    float64
 10  creak        189 non-null    float64
 11  MCEP_0       189 non-null    float64
 12  MCEP_1       189 non-null    float64
 13  MCEP_2       189 non-null    float64
 14  MCEP_3       189 non-null    float64
 15  MCEP_4       189 non-null    float64
 16  MCEP_5       189 non-null    float64
 17  MCEP_6       189 non-null    float64
 18  MCEP_7       189 non-null    float64
 19  MCEP_8    