**Imports**

In [1]:
# Mount Google Drive (Colab only) so the '/content/drive/...' paths used by the
# later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import nltk
# Download the NLTK stop-word corpus that `stopwords.words('english')` reads
# later in the notebook.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import numpy as np
import pandas as pd
import gc
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from smart_open import open
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from gensim.models.keyedvectors import KeyedVectors

**Data Preprocessing**

In [4]:
# Reading the dev (dataset_1), test (dataset_2) and train (dataset_3) split files.
# Only the first two columns of each file are kept: later cells use column 0 as
# the participant id (it names the transcript file) and column 1 as the label.
dataset_1 = np.array(pd.read_csv('/content/drive/My Drive/ICAIML_Project/Datasets/diacwoz/dev_split_Depression_AVEC2017.csv',delimiter=',',encoding='utf-8'))[:, 0:2]
dataset_2 = np.array(pd.read_csv('/content/drive/My Drive/ICAIML_Project/Datasets/diacwoz/full_test_split.csv',delimiter=',',encoding='utf-8'))[:, 0:2]
dataset_3 = np.array(pd.read_csv('/content/drive/My Drive/ICAIML_Project/Datasets/diacwoz/train_split_Depression_AVEC2017.csv',delimiter=',',encoding='utf-8'))[:, 0:2]

In [5]:
# Stack the dev, test and train rows (in that order) into a single table.
full_dataset = np.concatenate((dataset_1, dataset_2, dataset_3))

In [6]:
def checkBoolVal(dataset, index):
    """Look up the value stored next to a key in a two-column table.

    Scans `dataset` from the top and returns column 1 of the first row whose
    column 0 equals `index`. Returns 0 when no row matches.
    """
    return next((row[1] for row in dataset if row[0] == index), 0)

In [7]:
# initializing all variables
Data = []
Y = []
Data_test = []
Y_test = []
index = -1  # not read anywhere in the visible notebook; kept in case another cell uses it


def _load_split(split_rows, folder):
    """Load the transcript and the label of every participant listed in one split.

    Parameters
    ----------
    split_rows : sequence of rows whose column 0 is the participant id.
    folder : name of the sub-folder of the dataset directory that holds the
        `<id>_TRANSCRIPT.csv` files of this split.

    Returns
    -------
    (transcripts, labels) : two lists of equal length. `transcripts[n]` holds
    columns 2 and 3 of the tab-separated transcript file and `labels[n]` is the
    value `checkBoolVal` finds for the same participant in `full_dataset`.

    A participant whose transcript cannot be read is reported with `print` and
    left out of BOTH lists, so data and labels always stay aligned.
    """
    transcripts = []
    labels = []
    for row in split_rows:
        try:
            file_name = "/content/drive/My Drive/ICAIML_Project/Datasets/diacwoz/" + folder + "/" + str(int(row[0])) + "_TRANSCRIPT.csv"
            transcript = np.array(pd.read_csv(file_name, delimiter='\t', encoding='utf-8'))[:, 2:4]
        except Exception as e:
            # Best effort: report the problem and skip this participant entirely.
            print(e)
            continue
        # Only record the label once the transcript is known to be usable.
        transcripts.append(transcript)
        labels.append(checkBoolVal(full_dataset, row[0]))
    return transcripts, labels


# getting transcript files for each participant
# The training set is the train split followed by the dev split.
for folder, split_rows in (("train_data", dataset_3), ("dev_data", dataset_1)):
    split_transcripts, split_labels = _load_split(split_rows, folder)
    Data.extend(split_transcripts)
    Y.extend(split_labels)

Data_test, Y_test = _load_split(dataset_2, "test_data")

In [8]:
# Labels become NumPy arrays. Data2 / Data2_test hold, for every transcript, the
# list of second-column entries taken from the rows whose first column is
# "Participant".
Y = np.array(Y)
Y_test = np.array(Y_test)

# for training data (row 0 of each transcript is never examined)
Data2 = [
    [row[1] for row in transcript[1:] if row[0] == "Participant"]
    for transcript in Data
]

# for test data, same rule
Data2_test = [
    [row[1] for row in transcript[1:] if row[0] == "Participant"]
    for transcript in Data_test
]


In [9]:
# Drop the references to the raw transcript tables; the cells below only use
# Data2 / Data2_test.
Data = []
Data_test = []

# running garbage collection to free up memory
gc.collect()   

# Wrap the per-participant utterance lists in object arrays so that `.shape[0]`
# can be used as the participant count further down.
Data2 = np.array(Data2, dtype=object)
Data2_test = np.array(Data2_test, dtype=object)

# loading GoogleNews keyed vectors bin file
# NOTE(review): this path spells the Drive folder 'MyDrive' while every other
# path in the notebook uses 'My Drive' - confirm both resolve on the target runtime.
model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ICAIML_Project/Datasets/GoogleNews-vectors-negative300.bin', binary=True)

# getting the stop words
# A set is used, so the membership test in rem_stopwords is a hash lookup.
stop_words = set(stopwords.words('english'))

In [10]:
def threshold(Y_pred, t):
    """Binarise scores: entries below `t` become 0, every other entry becomes 1.

    Returns a NumPy array with one label per entry of `Y_pred`.
    """
    return np.array([0 if score < t else 1 for score in Y_pred])

In [11]:
def rem_stopwords(sent):
    """Return the words of `sent`, in their original order, that are not in the
    module-level `stop_words` collection."""
    return [word for word in sent if word not in stop_words]

In [13]:
# upsampling training data
def upsample(X_train,Y_train):
    """Balance a binary data set by duplicating randomly chosen class-1 samples.

    While class 0 outnumbers class 1, a random class-1 sample is written into a
    random position of the data set and the sample that used to sit there is
    moved to the end. Every original sample is therefore kept, and the
    duplicates end up scattered through the result.

    Parameters
    ----------
    X_train : array-like, indexed along axis 0 by sample.
    Y_train : one-dimensional array-like of 0/1 labels, one per sample.

    Returns
    -------
    (X, Y) : new NumPy arrays. When class 1 is not the minority, or there is no
    class-1 sample to copy, they contain exactly the input samples.

    Raises
    ------
    ValueError : if `X_train` and `Y_train` have different lengths.
    """
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    if len(X_train) != len(Y_train):
        raise ValueError("X_train and Y_train must contain the same number of samples")

    positive_idx = np.flatnonzero(Y_train == 1)
    deficit = int(np.count_nonzero(Y_train == 0)) - positive_idx.size

    # Work on sample indices and materialise the arrays once at the end.
    order = list(range(len(X_train)))

    if positive_idx.size > 0:
        for _ in range(max(deficit, 0)):
            # np.random.randint excludes its upper bound, so the bounds are the
            # full sizes: every class-1 sample and every position can be drawn,
            # and a single class-1 sample no longer raises.
            pick = positive_idx[np.random.randint(0, positive_idx.size)]
            slot = np.random.randint(0, len(order))
            order.append(order[slot])
            order[slot] = pick

    order = np.array(order, dtype=np.intp)
    return X_train[order], Y_train[order]

In [14]:
max_ws = 20      # words kept per utterance
max_sent = 250   # utterances kept per participant

# preprocessing the train data
# matrix[k, i, j] receives the 300-dimensional word vector of word j of
# utterance i of participant k. Entries stay zero for padding, skipped
# utterances and words that have no vector.
matrix = np.zeros((Data2.shape[0], max_sent, max_ws, 300))
# max_sent_len records the largest utterance count seen; it is not read
# anywhere else in the visible notebook.
max_sent_len = 0
sent = ""
for k in range(Data2.shape[0]):
    if(max_sent_len < len(Data2[k])):
        max_sent_len = len(Data2[k])
        sent = Data2[k]
    for i in range(min(max_sent, len(Data2[k]))):
        utterance = Data2[k][i]
        # Entries that are not text (e.g. missing values) have nothing to split.
        if not isinstance(utterance, str):
            continue
        sent = rem_stopwords(utterance.split(" "))
        for j in range(min(max_ws, len(sent))):
            w = sent[j]
            # Consecutive spaces produce empty tokens; leave their slot at zero.
            if not w:
                continue
            # removing the < and > from the words
            has_closing = w.find('>') != -1
            if w[0] == '<':
                w = w[1:-1] if has_closing else w[1:]
            elif has_closing:
                w = w[0:-1]
            try:
                matrix[k][i][j] = model[w]
            except KeyError:
                # Word is not in the embedding vocabulary: keep the zero vector.
                # Any other error (e.g. `model` no longer holding the word
                # vectors) is allowed to surface instead of silently yielding
                # an all-zero tensor.
                continue

In [15]:
# preprocessing the test data
# separated due to memory constraints
# matrix_test[k, i, j] receives the 300-dimensional word vector of word j of
# utterance i of test participant k. Entries stay zero for padding, skipped
# utterances and words that have no vector.
# max_len_sent records the largest utterance count seen; it is not read
# anywhere else in the visible notebook.
max_len_sent = 0
matrix_test = np.zeros((Data2_test.shape[0], max_sent, max_ws, 300))
for k in range(Data2_test.shape[0]):
    if(max_len_sent < len(Data2_test[k])):
        max_len_sent = len(Data2_test[k])
        sent = Data2_test[k]
    for i in range(min(max_sent, len(Data2_test[k]))):
        utterance = Data2_test[k][i]
        # Entries that are not text (e.g. missing values) have nothing to split.
        if not isinstance(utterance, str):
            continue
        sent = rem_stopwords(utterance.split(" "))
        for j in range(min(max_ws, len(sent))):
            w = sent[j]
            # Consecutive spaces produce empty tokens; leave their slot at zero.
            if not w:
                continue
            # removing the < and > from the words
            has_closing = w.find('>') != -1
            if w[0] == '<':
                w = w[1:-1] if has_closing else w[1:]
            elif has_closing:
                w = w[0:-1]
            try:
                matrix_test[k][i][j] = model[w]
            except KeyError:
                # Word is not in the embedding vocabulary: keep the zero vector.
                # Any other error (e.g. `model` no longer holding the word
                # vectors) is allowed to surface instead of silently yielding
                # an all-zero tensor.
                continue

# Rebind the large intermediates to empty lists so the collector can reclaim
# them before upsample() builds new arrays.
# After this point `model` and `stop_words` are empty lists: rem_stopwords then
# filters nothing and word lookups no longer work, so the preprocessing cells
# cannot be re-run without first re-running the cell that loads them.
Data2 = []
Data2_test = []
model = []
stop_words = []

# running garbage collection to free up memory
gc.collect()

# upsampling training and test data
# NOTE(review): the test set is upsampled as well, so the classification report
# below is computed on a class distribution that contains duplicated class-1
# samples - confirm this is intended.
matrix, Y = upsample(matrix,Y)
matrix_test, Y_test = upsample(matrix_test,Y_test)

**CNN Class**

In [None]:
class CNN:
  """Binary classifier made of two Conv2D/MaxPooling2D stages and a dense head.

  Both convolutions use kernels of height 1 (sizes (1, 5) and (1, 3)) and both
  pooling layers use pools of height 1, so filtering and pooling only happen
  along the second axis of each input sample. With `channels_last`, the final
  axis of the input is treated as the channel axis.
  """

  def __init__(self, input_shape=None):
    """Build and compile the network.

    input_shape: shape of ONE sample, e.g. (250, 20, 300). When omitted it is
      taken from axes 1-3 of the module-level `matrix`, which is what the
      notebook relied on implicitly.
    """
    if input_shape is None:
      input_shape = (matrix.shape[1], matrix.shape[2], matrix.shape[3])
    classifier = Sequential()
    classifier.add(Conv2D(150, (1, 5), input_shape = tuple(input_shape), activation = 'relu', data_format="channels_last"))
    classifier.add(MaxPooling2D(pool_size = (1, 3)))
    classifier.add(Conv2D(75, (1, 3), activation = 'relu', data_format="channels_last"))
    classifier.add(MaxPooling2D(pool_size = (1, 2)))
    classifier.add(Flatten())
    classifier.add(Dense(units = 128, activation = 'relu'))
    # Single sigmoid unit: the output is a score in [0, 1] for class 1.
    classifier.add(Dense(units = 1, activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    self.classifier = classifier

  def fitModel(self, X, Y, epoch, class_weight=None):
    """Train the network on (X, Y) for `epoch` epochs.

    class_weight: optional {label: weight} mapping forwarded to Keras' fit;
      defaults to {0: 0.4, 1: 0.7}.
    """
    if class_weight is None:
      class_weight = {0: 0.4, 1: 0.7}
    self.classifier.fit(X, Y, epochs = epoch, class_weight=class_weight)

  def predictModel(self, X, t=0.45):
    """Return 0/1 labels for X: scores below `t` (default 0.45) map to 0, all others to 1."""
    return threshold(self.classifier.predict(X), t)
  
# NOTE: `model` held the word2vec vectors earlier in the notebook; from here on
# the name refers to the CNN classifier.
model = CNN()
# Train for 5 epochs on the training tensor and its labels.
model.fitModel(matrix, Y, 5)
# Thresholded 0/1 predictions for the test tensor.
Y_Pred = model.predictModel(matrix_test)
print(classification_report(Y_test, Y_Pred))