In [1]:
import keras
import math
import numpy as np
import os
import pandas as pd
import re
from sklearn import preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from textblob import TextBlob


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

import umap
import plotly.io as plt_io
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator

In [65]:
# Load the survey export: ';'-separated, first row is the header.
path = os.path.join(os.getcwd(), 'data', 'response_format_cleaned_ds1.csv')
responses = pd.read_csv(path, sep=';', header=0)

# Drop the first column by position. A non-inplace drop keeps the cell
# idempotent with respect to the freshly read frame.
responses = responses.drop(columns=responses.columns[[0]])

# Drop rows whose 'dep_all_words' contains the all-missing placeholder.
# regex=False matches the placeholder literally; na=False makes a genuinely
# empty cell evaluate to False instead of NaN, which `~` cannot negate.
responses = responses[
    ~responses['dep_all_words'].str.contains("NA NA NA NA NA", regex=False, na=False)
].reset_index(drop=True)

# Number of responses that remain after filtering.
NBR_RESPONSES = len(responses)

responses

Unnamed: 0,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,sequence1,seqOne,Dep5words[Word1],...,wor_all_selected,wor_all_selected1,minidep_scale,minidep_diagnose,depression_episodes,miniGAD_scale,miniGAD_symptoms_scale,miniGAD_diagnose,minidiagnose_category,minidiagnose_category_number
0,434.0,2020-08-07 11:46:22,15.0,en,6.593644e+08,2020-08-07 11:38:22,2020-08-07 11:46:22,2.0,1.0,motivated,...,NA NA happy NA NA NA NA NA NA NA NA NA NA care...,happy carefree satisfied ...,0.0,0,0,0,0,0,NoDi,0
1,184.0,2020-08-07 11:58:36,15.0,en,2.803892e+08,2020-08-07 11:34:31,2020-08-07 11:58:36,2.0,1.0,connected,...,anxious NA NA NA NA NA NA NA NA NA NA tense NA...,anxious tense fearful sad fe...,3.0,0,0,8,5,0,NoDi,0
2,330.0,2020-08-07 11:51:54,15.0,en,6.770686e+07,2020-08-07 11:36:32,2020-08-07 11:51:54,1.0,1.0,Yes,...,anxious NA NA NA NA NA worried NA NA NA NA NA ...,anxious worried scared sad mon...,7.0,0,5,9,5,0,NoDi,0
3,630.0,2020-08-07 13:22:42,15.0,en,1.176643e+09,2020-08-07 12:55:26,2020-08-07 13:22:42,3.0,1.0,minor,...,anxious NA NA NA NA concerned NA NA NA NA NA t...,anxious concerned tense scared ...,3.0,0,5,8,5,0,NoDi,0
4,400.0,2020-08-07 12:04:52,15.0,en,1.012492e+09,2020-08-07 11:37:19,2020-08-07 12:04:52,1.0,1.0,family,...,NA NA NA NA NA concerned NA NA NA NA NA tense ...,concerned tense sad tired ...,4.0,0,2,7,4,1,GAD,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,32.0,2020-08-07 12:02:58,15.0,en,1.363161e+09,2020-08-07 11:45:11,2020-08-07 12:02:58,3.0,1.0,Worried,...,anxious NA NA NA NA NA NA NA NA NA NA NA scare...,anxious scared uneasy fearful ...,9.0,1,1,0,0,0,Dep,1
958,344.0,2020-08-07 13:45:57,15.0,en,1.283747e+08,2020-08-07 13:32:04,2020-08-07 13:45:57,3.0,1.0,Tired,...,NA NA NA nervous NA NA worried NA NA NA NA NA ...,nervous worried stress anxie...,7.0,1,7,11,6,1,BOTH,3
959,297.0,2020-08-07 13:11:46,15.0,en,9.695994e+08,2020-08-07 12:52:04,2020-08-07 13:11:46,3.0,1.0,Unsettled,...,NA NA NA NA NA NA NA NA stressed NA NA NA scar...,stressed scared fearful anxi...,7.0,1,2,10,5,1,BOTH,3
960,241.0,2020-08-07 12:51:07,15.0,en,1.976520e+09,2020-08-07 12:28:54,2020-08-07 12:51:07,2.0,1.0,Despair,...,anxious NA NA NA NA NA worried NA stressed NA ...,anxious worried stressed uneasy ...,8.0,1,1,11,6,1,BOTH,3


In [96]:
# Restrict `responses` to the two word columns that serve as raw text input.
dep_words_columns = ['dep_all_words', 'dep_all_selected1']
df_words_raw = responses[dep_words_columns]

In [72]:
"""
Gets the true depression (PHQtot) and worry (GADtot) scale scores and the diagnosis labels from `responses`, replacing NaN scale values.
"""
def replace_nan(y_array):
    """Replace NaN entries with the rounded mean of their column, in place.

    Parameters
    ----------
    y_array : numpy.ndarray
        Numeric array. A 1-D array is treated as a single column; for a 2-D
        array the mean is taken per column (axis 0).

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in, with NaNs overwritten.
        A column that consists only of NaN has a NaN mean and stays NaN.
    """
    nan_mask = np.isnan(y_array)
    if not nan_mask.any():
        # Nothing to impute; also avoids the empty-slice warning from nanmean.
        return y_array

    # Mean over the non-NaN values, rounded to a whole number.
    col_mean = np.around(np.nanmean(y_array, axis=0), decimals=0)

    if y_array.ndim == 1:
        # A 1-D input has one scalar mean; there is no column index to look up.
        y_array[nan_mask] = col_mean
    else:
        # Second index array = column of each NaN, used to pick that column's mean.
        y_array[nan_mask] = np.take(col_mean, np.nonzero(nan_mask)[1])
    return y_array

def get_binary_y():
    """Return the binary diagnosis targets read from `responses`.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_dep_diag, y_wor_diag)``: values of the 'minidep_diagnose' and
        'miniGAD_diagnose' columns. Each is selected with a one-element column
        list, so each array keeps a trailing axis of length 1.
    """
    # Depression diagnosis first, then worry diagnosis.
    y_dep_diag = responses[['minidep_diagnose']].values
    y_wor_diag = responses[['miniGAD_diagnose']].values
    return y_dep_diag, y_wor_diag

def get_scale_y():
    """Return the numerical depression and worry scale totals with NaNs imputed.

    Reads the 'PHQtot' and 'GADtot' columns of `responses`, passes each through
    `replace_nan`, and flattens the result.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_dep_scale, y_wor_scale)`` as 1-D arrays.
    """
    # Single-element column lists keep each selection 2-D before it is handed
    # to replace_nan; the result is flattened afterwards.
    y_wor_scale = replace_nan(responses[['GADtot']].values).flatten()
    y_dep_scale = replace_nan(responses[['PHQtot']].values).flatten()

    return y_dep_scale, y_wor_scale

In [12]:
"""
Regroups the PHQ-9 scale into 5 classes. 
"""
def reclass_scale(num_array):
    """Regroup scale totals into 5 ordinal classes.

    Class boundaries: [0, 5) -> 0, [5, 10) -> 1, [10, 15) -> 2, [15, 20) -> 3,
    everything else -> 4.

    Parameters
    ----------
    num_array : iterable of numbers
        Scale totals, one per response.

    Returns
    -------
    numpy.ndarray
        One class label per input value.
    """

    def new_class(num):
        # Half-open intervals leave no gap between classes, so a fractional
        # score such as 4.5 lands in class 0 instead of falling through to 4.
        if 0 <= num < 5:
            return 0
        elif 5 <= num < 10:
            return 1
        elif 10 <= num < 15:
            return 2
        elif 15 <= num < 20:
            return 3
        else:
            # As before, anything outside [0, 20) — including negative or NaN
            # input — is mapped to the last class.
            return 4

    return np.array([new_class(x) for x in num_array])

In [13]:
"""
Tokenizes string and lemmatizes tokens. Computes sentiment analysis on all words.
"""
# English stop words as a set, for membership tests in clean_text.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance, reused for every token in clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Tokenize one text cell into lowercased, lemmatized, stop-word-free words.

    Parameters
    ----------
    text : str or any other value
        Raw cell content. Anything that is not a string (NaN, None, numbers)
        is treated as "no text".

    Returns
    -------
    list of str
        The cleaned words. Always a list, so results can be concatenated with
        ``+``; empty for non-string input or when tokenization fails.
    """
    if not isinstance(text, str):
        return []

    try:
        tokens = word_tokenize(text)
        # Keep purely alphabetic tokens; lowercase before lemmatizing.
        words = [lemmatizer.lemmatize(w.lower()) for w in tokens if w.isalpha()]
        # Stop words are removed after lemmatization.
        return [w for w in words if w not in stop_words]
    except Exception as e:
        # Best effort: report the problem but still return a list, not None.
        print(e)
        return []
    
def aggregate_cell(text):
    """Apply `clean_text` to every element of a pandas Series.

    Parameters
    ----------
    text : pandas.Series
        Series of raw cell values (used as the row callback of
        ``DataFrame.apply(..., axis=1)`` in this notebook).

    Returns
    -------
    pandas.Series
        Same index, each element replaced by the result of `clean_text`.
    """
    return text.apply(clean_text)

# Defining a sentiment analyser function
def sentiment_analyser(wordlist):
    """Compute the TextBlob polarity of each word list.

    Parameters
    ----------
    wordlist : pandas.Series
        Each element is a list of words; the words are joined with single
        spaces before being scored.

    Returns
    -------
    pandas.DataFrame
        One column (labelled 0) holding the polarity per row, on the same
        index as `wordlist`.
    """
    # Score each row as a plain float, then wrap the result once. This avoids
    # building a Series per row and relying on apply() to expand them.
    polarity = wordlist.apply(lambda Words: TextBlob(' '.join(Words)).sentiment.polarity)
    return polarity.to_frame(name=0)

In [133]:
# Build the per-response word table in one chain: clean both word columns,
# then attach the combined word list, its polarity and the target columns.
# Keyword order matters: 'polarity' reads the 'all_words' column created
# just before it.
df_words = (
    df_words_raw
    .apply(aggregate_cell, axis=1)
    .assign(
        all_words=lambda frame: frame['dep_all_words'] + frame['dep_all_selected1'],
        polarity=lambda frame: sentiment_analyser(frame['all_words']),
        dep_class=reclass_scale(get_scale_y()[0]),
        dep_diagnosis=get_binary_y()[0],
        minidiagnose_category=responses['minidiagnose_category'],
        minidiagnose_category_number=responses['minidiagnose_category_number'],
    )
)

In [103]:
# Print every response's combined word list, prefixed by its position.
for i, row in enumerate(df_words['all_words']):
    print(i, row)

0 ['motivated', 'learning', 'passionate', 'enthusiastic', 'happy', 'joyful', 'cheerful', 'hopeful', 'relaxed', 'active']
1 ['connected', 'engaged', 'alive', 'alert', 'strong', 'content', 'hopeful', 'optimistic', 'active', 'loving']
2 ['yes', 'somewhat', 'extremely', 'partially', 'unhappy', 'lonely', 'stressed', 'depressed', 'angry']
3 ['minor', 'unwilling', 'unwanting', 'depressive', 'unbothered', 'sad', 'blue', 'lonely', 'depressed', 'angry']
4 ['family', 'work', 'money', 'bill', 'stressed', 'tired', 'anxious', 'worried', 'stressed']
5 ['happy', 'joyful', 'excited', 'hopeful', 'pleased', 'joyful', 'cheerful', 'optimistic', 'blessed', 'love']
6 ['edgy', 'low', 'slow', 'tense', 'tired', 'satisfied', 'peaceful', 'calm', 'stressed']
7 ['downhearted', 'sad', 'melancholy', 'blue', 'unhappy', 'anxious', 'unhappy', 'blue', 'depressed']
8 ['low', 'sad', 'distressed', 'numb', 'lucid', 'tired', 'anxious', 'worried', 'blue']
9 ['react', 'thought', 'act', 'wake', 'live', 'hopeful', 'optimistic', '

In [126]:
# Spot-check the response at position 19: scale totals next to the diagnosis columns.
responses.iloc[19][['PHQtot','GADtot', 'minidep_diagnose','miniGAD_diagnose','minidiagnose_category']]

PHQtot                    4.0
GADtot                    4.0
minidep_diagnose            0
miniGAD_diagnose            0
minidiagnose_category    NoDi
Name: 19, dtype: object

In [116]:
# List every column name of `responses`, one per line.
for col in responses.columns:
    print(col)

id
submitdate
lastpage
startlanguage
seed
startdate
datestamp
sequence1
seqOne
Dep5words[Word1]
Dep5words[Word2]
Dep5words[Word3]
Dep5words[Word4]
Dep5words[Word5]
Wor5words[SQ01]
Wor5words[SQ02]
Wor5words[SQ03]
Wor5words[SQ04]
Wor5words[SQ05]
sequence2
seq2
Dep5phraseorwords[SQ01]
Dep5phraseorwords[SQ02]
Dep5phraseorwords[SQ03]
Dep5phraseorwords[SQ04]
Dep5phraseorwords[SQ05]
Wor5phraseorwords[SQ01]
Wor5phraseorwords[SQ02]
Wor5phraseorwords[SQ03]
Wor5phraseorwords[SQ04]
Wor5phraseorwords[SQ05]
sequence3
seq3
Deptext
Wortext
DepselectCol[happy]
DepselectCol[sad]
DepselectCol[content]
DepselectCol[joyful]
DepselectCol[satisfied]
DepselectCol[tired]
DepselectCol[peaceful]
DepselectCol[cheerful]
DepselectCol[excited]
DepselectCol[calm]
DepselectCol[hopeful]
DepselectCol[down]
DepselectCol[anxious]
DepselectCol[unhappy]
DepselectCol[worried]
DepselectCol[optimistic]
DepselectCol[blue]
DepselectCol[lonely]
DepselectCol[relaxed]
DepselectCol[pleased]
DepselectCol[stressed]
DepselectCol[depres

# Create embeddings using BERT

In [88]:
# Checkpoint name and the transformers classes used to load it.
pretrained_weights = 'bert-large-uncased'
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer

# Load the tokenizer first, then the encoder, both from the same checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
bert_model = model_class.from_pretrained(pretrained_weights)

In [128]:
def get_words_embedding(word_list):
    """Embed each word with BERT and average the results into one vector.

    Every word is encoded as its own sequence (special tokens included). The
    hidden state at position 0 of each sequence is taken, and the mean over
    all words is returned.

    Parameters
    ----------
    word_list : iterable of str
        Words to embed. Must contain at least one word.

    Returns
    -------
    numpy.ndarray
        1-D array: the element-wise mean of the per-word vectors.

    Raises
    ------
    ValueError
        If `word_list` is empty (there is nothing to average).
    """
    tokenized_words = [tokenizer.encode(word, add_special_tokens=True) for word in word_list]
    if not tokenized_words:
        raise ValueError("word_list must contain at least one word to embed")

    max_len = max(len(ids) for ids in tokenized_words)

    # Right-pad every sequence with 0 up to the longest one (same padding
    # value as before; NOTE(review): assumes 0 is the tokenizer's pad id).
    # The mask is built from the true lengths rather than from `padded != 0`,
    # and the dtype is fixed to int64 so it does not depend on the platform.
    padded = np.zeros((len(tokenized_words), max_len), dtype=np.int64)
    mask = np.zeros_like(padded)
    for row, ids in enumerate(tokenized_words):
        padded[row, :len(ids)] = ids
        mask[row, :len(ids)] = 1

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(mask)

    # Inference only: no gradients needed.
    with torch.no_grad():
        last_hidden_states = bert_model(input_ids, attention_mask=attention_mask)

    # First model output, position 0 of every sequence -> one vector per word.
    features = last_hidden_states[0][:, 0, :]
    embeddings_mean = torch.mean(features, dim=0)

    return embeddings_mean.numpy()

In [129]:
def get_single_word_embedding(word):
    """Embed a single word (or string) with BERT.

    Parameters
    ----------
    word : str
        Text to encode; special tokens are added by the tokenizer.

    Returns
    -------
    numpy.ndarray
        2-D array with one row: the hidden state at position 0 of the encoded
        sequence.
    """
    idx_tokens = tokenizer.encode(word, add_special_tokens=True)

    # Batch of one sequence.
    input_ids = torch.tensor([idx_tokens])
    # A single unpadded sequence: attend to every position. Passed by keyword
    # so it cannot be mistaken for segment ids, matching get_words_embedding.
    attention_mask = torch.ones_like(input_ids)

    # Inference only: no gradients needed.
    with torch.no_grad():
        last_hidden_states = bert_model(input_ids, attention_mask=attention_mask)

    features = last_hidden_states[0][:, 0, :]
    return features.numpy()

### Create word embeddings

In [131]:
# One mean BERT embedding per response; get_words_embedding is called once per row.
df_words['embeddings'] = df_words['all_words'].apply(get_words_embedding)

In [132]:
# Expand the per-response embedding vectors into a frame with one column per dimension.
df_word_embeddings = pd.DataFrame(df_words['embeddings'].tolist())
df_word_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.286945,-0.704067,-0.329593,-0.095753,0.371091,0.039494,0.016153,0.567595,0.768584,0.163829,...,-0.081441,0.196476,-1.067253,0.566605,0.014094,0.560107,0.390614,-0.430500,-0.347295,0.436724
1,-0.429299,-0.810017,-0.279851,-0.102155,0.638091,0.057378,0.095978,0.451752,0.929531,0.258261,...,-0.141187,0.178328,-1.285557,0.686991,0.019614,0.510867,0.302861,-0.515970,-0.275769,0.521093
2,-0.242295,-0.729603,-0.106986,-0.045915,0.193038,-0.142833,-0.136139,0.725106,0.958970,0.161761,...,0.066825,-0.016659,-1.174224,0.452059,-0.127458,0.392650,0.342169,-0.473843,-0.385068,0.418821
3,-0.538083,-0.833550,-0.264088,-0.038816,0.300945,-0.043375,0.101197,0.507463,0.696927,0.146248,...,-0.186439,0.009205,-1.278134,0.571358,0.126268,0.591661,0.140507,-0.381265,-0.417684,0.443161
4,-0.541720,-0.685660,-0.122641,-0.141538,0.080809,0.129055,0.246563,0.577789,0.955209,0.093470,...,-0.084468,-0.002326,-1.174331,0.559987,0.000706,0.632813,0.127029,-0.512298,-0.423815,0.524847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,-0.508005,-0.710467,-0.108245,-0.215925,0.178859,0.055168,0.111164,0.498558,1.004555,0.209042,...,-0.283902,-0.095138,-1.130488,0.445158,0.200916,0.622486,0.140622,-0.481370,-0.317252,0.510820
958,-0.436111,-0.633424,-0.158170,-0.057350,0.087761,-0.185637,-0.149170,0.645068,0.920233,0.106716,...,-0.126078,-0.048341,-1.156779,0.483126,0.056184,0.594872,0.283100,-0.575648,-0.270613,0.418653
959,-0.385802,-0.697408,-0.233589,-0.104543,0.157401,0.048896,-0.077120,0.458185,0.770330,0.210073,...,-0.143045,-0.008077,-1.077365,0.531188,0.057279,0.473919,0.093911,-0.430369,-0.332123,0.311898
960,-0.551933,-0.612738,-0.198439,0.020218,0.103712,-0.067198,-0.168283,0.494133,0.859710,0.190728,...,-0.204535,-0.015649,-1.294399,0.497795,0.043635,0.549639,0.229795,-0.578431,-0.233610,0.433388


In [159]:
def words_to_plot(words):
    """Stack the single-word embeddings of `words` into one DataFrame.

    Parameters
    ----------
    words : iterable of str
        Words to embed with `get_single_word_embedding`.

    Returns
    -------
    pandas.DataFrame
        One row per word, in input order, with a fresh 0..n-1 index. An empty
        DataFrame when `words` is empty.
    """
    # Collect all rows first and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and appending inside the loop copied the frame on
    # every iteration.
    rows = [pd.DataFrame(get_single_word_embedding(word)) for word in words]
    if not rows:
        return pd.DataFrame()
    return pd.concat(rows, ignore_index=True)

In [143]:
df_words[['all_words','minidiagnose_category']]

Unnamed: 0,all_words,minidiagnose_category
0,"[motivated, learning, passionate, enthusiastic...",NoDi
1,"[connected, engaged, alive, alert, strong, con...",NoDi
2,"[yes, somewhat, extremely, partially, unhappy,...",NoDi
3,"[minor, unwilling, unwanting, depressive, unbo...",NoDi
4,"[family, work, money, bill, stressed, tired, a...",GAD
...,...,...
957,"[worried, fear, motivation, change, tired, anx...",Dep
958,"[tired, sad, unhappy, tearful, sad, anxious, s...",BOTH
959,"[unsettled, lazy, drowsy, overwhelmed, nervous...",BOTH
960,"[despair, numbness, guilty, hollow, grief, sad...",BOTH


# Plots

In [168]:
# Hover text for the plots: each response's word list joined into one comma-separated string.
responses_words_plot = pd.Series(df_words['all_words']).apply(', '.join)

In [180]:
def create_plot_2d(component1, component2, color_values, words):
    """Show a 2-D scatter of all responses with extra word markers on top.

    The response points come from the module-level
    `reduced_response_embeddings_2d` and their hover text from
    `responses_words_plot`.

    Parameters
    ----------
    component1, component2 :
        x and y coordinates of the extra word markers.
    color_values : numpy.ndarray
        Values used to colour the response points; flattened before use.
    words :
        Hover text for the extra word markers.
    """
    flat_colors = color_values.flatten()

    # When every value equals its boolean cast (only 0/1 values), a two-tone
    # scale is used; otherwise the rainbow scale.
    is_binary = np.array_equal(flat_colors, flat_colors.astype(bool))
    palette = 'Bluered' if is_binary else 'Rainbow'

    # Trace 1: one marker per response, coloured by `flat_colors`.
    response_marker = {
        'size': 13,
        'color': flat_colors,
        'colorscale': palette,
        'showscale': True,
        'line_width': 1,
    }
    response_trace = go.Scatter(
        x=reduced_response_embeddings_2d[:, 0],
        y=reduced_response_embeddings_2d[:, 1],
        text=responses_words_plot,
        hoverinfo='text',
        mode='markers',
        showlegend=False,
        marker=response_marker,
    )

    # Trace 2: the requested words as magenta diamonds.
    word_marker = {
        'size': 15,
        'symbol': 'diamond',
        'color': 'magenta',
        'showscale': False,
        'line_width': 1,
    }
    word_trace = go.Scatter(
        x=component1,
        y=component2,
        text=words,
        hoverinfo='text',
        mode='markers',
        showlegend=False,
        marker=word_marker,
    )

    fig = go.Figure(data=response_trace)
    fig.add_trace(word_trace)

    fig.update_layout(margin={'l': 100, 'r': 100, 'b': 100, 't': 100}, width=900, height=500)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [185]:
def create_plot_3d(component1, component2, component3, color_values, words):
    """Show a 3-D scatter of all responses with extra word markers on top.

    The response points come from the module-level
    `reduced_response_embeddings_3d` and their hover text from
    `responses_words_plot`.

    Parameters
    ----------
    component1, component2, component3 :
        x, y and z coordinates of the extra word markers.
    color_values : numpy.ndarray
        Values used to colour the response points; flattened before use.
    words :
        Hover text for the extra word markers.
    """
    flat_colors = color_values.flatten()

    # When every value equals its boolean cast (only 0/1 values), a two-tone
    # scale is used; otherwise the rainbow scale.
    is_binary = np.array_equal(flat_colors, flat_colors.astype(bool))
    palette = 'Bluered' if is_binary else 'Rainbow'

    # Trace 1: one marker per response, coloured by `flat_colors`.
    response_marker = {
        'size': 8,
        'color': flat_colors,
        'colorscale': palette,
        'opacity': 1,
        'showscale': True,
        'line_width': 1,
    }
    response_trace = go.Scatter3d(
        x=reduced_response_embeddings_3d[:, 0],
        y=reduced_response_embeddings_3d[:, 1],
        z=reduced_response_embeddings_3d[:, 2],
        text=responses_words_plot,
        hoverinfo='text',
        mode='markers',
        showlegend=False,
        marker=response_marker,
    )

    # Trace 2: the requested words as magenta diamonds.
    word_marker = {
        'size': 8,
        'symbol': 'diamond',
        'color': 'magenta',
        'showscale': False,
        'line_width': 1,
    }
    word_trace = go.Scatter3d(
        x=component1,
        y=component2,
        z=component3,
        text=words,
        hoverinfo='text',
        mode='markers',
        showlegend=False,
        marker=word_marker,
    )

    fig = go.Figure(data=[response_trace])
    fig.add_trace(word_trace)

    fig.update_layout(margin={'l': 50, 'r': 50, 'b': 50, 't': 50}, width=900, height=500)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [187]:
# Fit a 2-D UMAP projection on the response embeddings. UMAP is stochastic;
# a fixed random_state makes the layout (and every plot built on it) the same
# after Restart & Run All.
reducer_2d = umap.UMAP(n_neighbors=4, min_dist=2, spread=2, random_state=42).fit(df_word_embeddings.values)
# Project the same embeddings with the fitted reducer; used as the response points in create_plot_2d.
reduced_response_embeddings_2d = reducer_2d.transform(df_word_embeddings.values)

def plot_word_2d(words, color_scale=True):
    """Embed `words`, project them with `reducer_2d` and draw the 2-D plot.

    Parameters
    ----------
    words : list of str
        Words to show as extra markers on top of the response points.
    color_scale : bool, default True
        True colours responses by `df_words['dep_class']`; False colours them
        by `df_words['dep_diagnosis']`.
    """
    projected = reducer_2d.transform(words_to_plot(words).values)

    if color_scale:
        color_coding = df_words['dep_class'].values
    else:
        color_coding = df_words['dep_diagnosis'].values
    # Alternative colouring left by the author: df_words['minidiagnose_category_number'].values

    create_plot_2d(projected[:, 0], projected[:, 1], color_coding, words)

In [188]:
# Fit a 3-D UMAP projection on the response embeddings. UMAP is stochastic;
# a fixed random_state makes the layout (and every plot built on it) the same
# after Restart & Run All.
reducer_3d = umap.UMAP(n_components=3, n_neighbors=4, min_dist=2, spread=2, random_state=42).fit(df_word_embeddings.values)
# Project the same embeddings with the fitted reducer; used as the response points in create_plot_3d.
reduced_response_embeddings_3d = reducer_3d.transform(df_word_embeddings.values)

def plot_word_3d(words, color_scale=True):
    """Embed `words`, project them with `reducer_3d` and draw the 3-D plot.

    Parameters
    ----------
    words : list of str
        Words to show as extra markers on top of the response points.
    color_scale : bool, default True
        True colours responses by `df_words['dep_class']`; False colours them
        by `df_words['dep_diagnosis']`.
    """
    projected = reducer_3d.transform(words_to_plot(words).values)

    if color_scale:
        color_coding = df_words['dep_class'].values
    else:
        color_coding = df_words['dep_diagnosis'].values
    # Alternative colouring left by the author: df_words['minidiagnose_category_number'].values

    create_plot_3d(projected[:, 0], projected[:, 1], projected[:, 2], color_coding, words)

In [190]:
# Words to overlay on the response embeddings as extra markers.
plot_words = ['sad', 'calm', 'suicide']

# color_scale=False colours the responses by df_words['dep_diagnosis'] instead of dep_class.
plot_word_2d(plot_words, color_scale=False)
plot_word_3d(plot_words, color_scale=False)