In [296]:
import keras
import math
import numpy as np
import os
import pandas as pd
import re
from sklearn import preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from textblob import TextBlob


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

import umap
import plotly.io as plt_io
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator

In [573]:
path = os.path.join(os.getcwd(), 'data', 'response_format_cleaned_ds1.csv')
responses = pd.read_csv(path, sep=';', header=0)
responses.drop(responses.columns[[0]], axis=1, inplace=True)

# Drop rows with NA values in 'dep_all_words' column
responses = responses[~responses.dep_all_words.str.contains("NA NA NA NA NA")]
responses = responses.reset_index(drop=True)

NBR_RESPONSES = len(responses.values)

responses

962

In [559]:
dep_text_columns = ['Deptext', 'dep_all_phraces']
dep_words_columns = ['dep_all_words', 'dep_all_selected1']
df_text_raw = responses[dep_text_columns]
df_words_raw = responses[dep_words_columns]

df_words_raw

Unnamed: 0,dep_all_words,dep_all_selected1
0,motivated learning passionate enthusiastic happy,joyful cheerful hopeful relaxed...
1,connected engaged alive alert strong,content hopeful optimistic ...
2,Yes Somewhat Very Extremely Partially,unhappy lonely stressed depr...
3,minor unwilling unwanting depressive unbothered,sad blue lonely depressed a...
4,family work money bills stressed,tired down anxious worried str...
...,...,...
957,Worried Fear Motivation Change Tired,anxious worried optimistic st...
958,Tired Down Sad Unhappy Tearful,sad anxious stressed depress...
959,Unsettled Lazy Drowsy Overwhelmed Nervous,sad anxious unhappy worried lonel...
960,Despair Numbness Guilty Hollow Grief,sad tired down unhappy depres...


In [604]:
"""
Gets true PHQ-9 scale and cleans from NaN values. 
"""
def replace_nan(y_array):
    #Replaceing NaN values with mean value of column - perhaps we should do this differently
    col_mean = np.nanmean(y_array, axis=0)
    col_mean = np.around(col_mean, decimals=0, out=None) 
    inds = np.where(np.isnan(y_array))
    y_array[inds] = np.take(col_mean, inds[1])
    return y_array

def get_binary_y():
    #The categorical variables, whether someone is diagnosed with worry/dep or not
    dep_diag = responses[['minidep_diagnose']]
    wor_diag = responses[['miniGAD_diagnose']]

    #Create y - variables
    y_wor_diag = wor_diag.values
    y_dep_diag = dep_diag.values
    
    return y_dep_diag, y_wor_diag

def get_scale_y(): 
    #The numerical rating scales for depression and worry
    dep_scale = responses[['PHQtot']]
    wor_scale = responses[['GADtot']]
    
    #Checking for NaN values
    dep_scale.isnull().values.any() #true
    wor_scale.isnull().values.any() #true

    #Create y - variables
    y_wor_scale = wor_scale.values
    y_dep_scale = dep_scale.values

    #Replace NaN Values for the numerical scales
    y_wor_scale = replace_nan(y_wor_scale).flatten()
    y_dep_scale = replace_nan(y_dep_scale).flatten()
    
    return y_dep_scale, y_wor_scale

In [56]:
"""
Regroups the PHQ-9 scale into 5 classes. 
"""
def reclass_scale(num_array):
    
    def new_class(num):
        if 0 <= num <= 4: return 0
        elif 5 <= num <= 9: return 1 
        elif 10 <= num <= 14: return 2
        elif 15 <= num <= 19: return 3
        else: return 4
        
    return np.array([new_class(x) for x in num_array])

In [47]:
"""
Tokenizes string and lemmatizes tokens. Computes sentiment analysis on all words.
"""
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if isinstance(text, str):
        try:
            tokens = word_tokenize(text)
            words = [lemmatizer.lemmatize(w.lower()) for w in tokens if w.isalpha()]
            words = [w for w in words if not w in stop_words] #Uncomment if stop words should be excluded
            return words
        except Exception as e: 
            print(e)
    elif math.isnan(text):
        return []
    
def aggregate_cell(text):
    tokens = pd.Series.apply(text, clean_text)
    return tokens

# Defining a sentiment analyser function
def sentiment_analyser(wordlist):
    return wordlist.apply(lambda Words: pd.Series(TextBlob(' '.join(Words)).sentiment.polarity))

In [605]:
df_words = df_words_raw.apply(aggregate_cell, axis=1)
df_words['all_words'] = df_words['dep_all_words'] + df_words['dep_all_selected1']
df_words['polarity'] = sentiment_analyser(df_words['all_words'])
df_words['dep_class'] = reclass_scale(get_scale_y()[0])
df_words['dep_diagnosis'] = get_binary_y()[0]

In [611]:
df_words

Unnamed: 0,dep_all_words,dep_all_selected1,all_words,polarity,dep_class,dep_diagnosis
0,"[motivated, learning, passionate, enthusiastic...","[joyful, cheerful, hopeful, relaxed, active]","[motivated, learning, passionate, enthusiastic...",0.323333,0,0
1,"[connected, engaged, alive, alert, strong]","[content, hopeful, optimistic, active, loving]","[connected, engaged, alive, alert, strong, con...",0.250000,3,0
2,"[yes, somewhat, extremely, partially]","[unhappy, lonely, stressed, depressed, angry]","[yes, somewhat, extremely, partially, unhappy,...",-0.400000,4,0
3,"[minor, unwilling, unwanting, depressive, unbo...","[sad, blue, lonely, depressed, angry]","[minor, unwilling, unwanting, depressive, unbo...",-0.230000,2,0
4,"[family, work, money, bill, stressed]","[tired, anxious, worried, stressed]","[family, work, money, bill, stressed, tired, a...",-0.325000,1,0
...,...,...,...,...,...,...
957,"[worried, fear, motivation, change, tired]","[anxious, worried, optimistic, stressed, love]","[worried, fear, motivation, change, tired, anx...",-0.050000,2,1
958,"[tired, sad, unhappy, tearful]","[sad, anxious, stressed, depressed, angry]","[tired, sad, unhappy, tearful, sad, anxious, s...",-0.458333,4,1
959,"[unsettled, lazy, drowsy, overwhelmed, nervous]","[sad, anxious, unhappy, worried, lonely]","[unsettled, lazy, drowsy, overwhelmed, nervous...",-0.340000,4,1
960,"[despair, numbness, guilty, hollow, grief]","[sad, tired, unhappy, depressed]","[despair, numbness, guilty, hollow, grief, sad...",-0.483333,4,1


# Create embeddings using BERT

In [566]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-large-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [567]:
def get_embedding(word_list):
    tokenized_words = []
    
    for word in word_list:
        tokenized_words.append(tokenizer.encode(word, add_special_tokens=True))
    
    max_len = 0
    for i in tokenized_words:
        if len(i) > max_len:
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized_words])
    attention_mask = np.where(padded != 0, 1, 0)
    
    input_ids = torch.tensor(padded)  
    attention_mask = torch.tensor(attention_mask)

    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    features = last_hidden_states[0][:,0,:]
    embeddings_mean = torch.mean(features, dim=0)
    
    return embeddings_mean.numpy()

In [568]:
def get_single_embedding(word):
    
    idx_tokens = tokenizer.encode(word, add_special_tokens=True)
    
    segments_ids = [1] * len(idx_tokens)
    
    input_ids = torch.tensor([idx_tokens]) 
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():
        last_hidden_states = model(input_ids, segments_tensors)
    
    features = last_hidden_states[0][:,0,:]
    return features.numpy()
    

In [612]:
df_words['embeddings'] = df_words['all_words'].apply(get_embedding)

In [613]:
df_embeddings = pd.DataFrame(list(df_words['embeddings'].values))
df_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.286945,-0.704067,-0.329593,-0.095753,0.371091,0.039494,0.016153,0.567595,0.768584,0.163829,...,-0.081441,0.196476,-1.067253,0.566605,0.014094,0.560107,0.390614,-0.430500,-0.347295,0.436724
1,-0.429299,-0.810017,-0.279851,-0.102155,0.638091,0.057378,0.095978,0.451752,0.929531,0.258261,...,-0.141187,0.178328,-1.285557,0.686991,0.019614,0.510867,0.302861,-0.515970,-0.275769,0.521093
2,-0.242295,-0.729603,-0.106986,-0.045915,0.193038,-0.142833,-0.136139,0.725106,0.958970,0.161761,...,0.066825,-0.016659,-1.174224,0.452059,-0.127458,0.392650,0.342169,-0.473843,-0.385068,0.418821
3,-0.538083,-0.833550,-0.264088,-0.038816,0.300945,-0.043375,0.101197,0.507463,0.696927,0.146248,...,-0.186439,0.009205,-1.278134,0.571358,0.126268,0.591661,0.140507,-0.381265,-0.417684,0.443161
4,-0.541720,-0.685660,-0.122641,-0.141538,0.080809,0.129055,0.246563,0.577789,0.955209,0.093470,...,-0.084468,-0.002326,-1.174331,0.559987,0.000706,0.632813,0.127029,-0.512298,-0.423815,0.524847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,-0.508005,-0.710467,-0.108245,-0.215925,0.178859,0.055168,0.111164,0.498558,1.004555,0.209042,...,-0.283902,-0.095138,-1.130488,0.445158,0.200916,0.622486,0.140622,-0.481370,-0.317252,0.510820
958,-0.436111,-0.633424,-0.158170,-0.057350,0.087761,-0.185637,-0.149170,0.645068,0.920233,0.106716,...,-0.126078,-0.048341,-1.156779,0.483126,0.056184,0.594872,0.283100,-0.575648,-0.270613,0.418653
959,-0.385802,-0.697408,-0.233589,-0.104543,0.157401,0.048896,-0.077120,0.458185,0.770330,0.210073,...,-0.143045,-0.008077,-1.077365,0.531188,0.057279,0.473919,0.093911,-0.430369,-0.332123,0.311898
960,-0.551933,-0.612738,-0.198439,0.020218,0.103712,-0.067198,-0.168283,0.494133,0.859710,0.190728,...,-0.204535,-0.015649,-1.294399,0.497795,0.043635,0.549639,0.229795,-0.578431,-0.233610,0.433388


In [665]:
def add_words_to_embedding(words):
    df_embeddings = pd.DataFrame(list(df_words['embeddings'].values))
    df_words_plot = pd.Series(df_words['all_words']).apply(', '.join)
    
    for word in words: 
        df_embeddings = df_embeddings.append(pd.DataFrame(get_single_embedding(word)), ignore_index=True)
        df_words_plot = df_words_plot.append(pd.Series(word), ignore_index=True)
    
    return df_embeddings, df_words_plot

In [662]:
df_embeddings, df_words_plot = add_word_to_embedding(['love', 'happy'])
df_words_plot

0      motivated, learning, passionate, enthusiastic,...
1      connected, engaged, alive, alert, strong, cont...
2      yes, somewhat, extremely, partially, unhappy, ...
3      minor, unwilling, unwanting, depressive, unbot...
4      family, work, money, bill, stressed, tired, an...
                             ...                        
959    unsettled, lazy, drowsy, overwhelmed, nervous,...
960    despair, numbness, guilty, hollow, grief, sad,...
961    worried, frustrated, helpless, inconsolable, r...
962                                                 love
963                                                happy
Length: 964, dtype: object

# Plots

In [598]:
def create_plot_2d(component1, component2, color_values, words):
    
    # If y-values are binary a better coloring is chosen
    color_values = color_values.flatten()
    if np.array_equal(color_values, color_values.astype(bool)):
        color_scale = 'Bluered'
    else: 
        color_scale = 'Rainbow'
    
    fig = go.Figure(data=go.Scatter(
        x = component1[:NBR_RESPONSES],
        y = component2[:NBR_RESPONSES],
        text=words[:NBR_RESPONSES],
        hoverinfo='text',
        mode='markers',
        showlegend=False,
        marker=dict(
            size=13,
            color=color_values,
            colorscale=color_scale, 
            showscale=True,
            line_width=1
        )
    ))
    
    fig.add_trace(
        go.Scatter(
            x=component1[NBR_RESPONSES:],
            y=component2[NBR_RESPONSES:],
            text=words[NBR_RESPONSES:],
            hoverinfo='text',
            mode='markers',
            showlegend=False,
            marker=dict(
                size=15,
                symbol='diamond',
                color='magenta',
                showscale=False,
                line_width=1
        )))
    
    fig.update_layout(margin=dict( l=100,r=100,b=100,t=100),width=900,height=500)                 
    fig.layout.template = 'plotly_dark'
    
    fig.show()

In [599]:
def create_plot_3d(component1, component2, component3, color_values, words):
    
    # If y-values are binary a better coloring is chosen
    color_values = color_values.flatten()
    if np.array_equal(color_values, color_values.astype(bool)):
        color_scale = 'Bluered'
    else: 
        color_scale = 'Rainbow'
    
    fig = go.Figure(data=[go.Scatter3d(
            x=component1[:NBR_RESPONSES],
            y=component2[:NBR_RESPONSES],
            z=component3[:NBR_RESPONSES],
            text=words[:NBR_RESPONSES],
            hoverinfo='text',
            mode='markers',
            showlegend=False,        
            marker=dict(
                size=8,
                color=color_values,
                colorscale=color_scale,
                opacity=1,
                showscale=True,
                line_width=1
            )
        )])
    
    fig.add_trace(
        go.Scatter3d(
            x=component1[NBR_RESPONSES:],
            y=component2[NBR_RESPONSES:],
            z=component3[NBR_RESPONSES:],
            text=words[NBR_RESPONSES:],
            hoverinfo='text',
            mode='markers',
            showlegend=False,
            marker=dict(
                size=8,
                symbol='diamond',
                color='magenta',
                showscale=False,
                line_width=1
        )))
    fig.update_layout(margin=dict(l=50,r=50,b=50,t=50),width=900,height=500)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [663]:
def plot_word_2d(words, color_scale=True):
    df_embeddings, df_words_plot = add_words_to_embedding(words)
    
    embedding_dep_2D = umap.UMAP(n_neighbors=4, min_dist=2, spread=2).fit_transform(df_embeddings.values)
    color_coding = df_words['dep_class'].values if color_scale else df_words['dep_diagnosis'].values

    create_plot_2d(embedding_dep_2D[:, 0], embedding_dep_2D[:, 1], color_coding, df_words_plot.values)

In [664]:
def plot_word_3d(words, color_scale=True):
    df_embeddings, df_words_plot = add_words_to_embedding(words)
    
    embedding_dep_3D = umap.UMAP(n_components=3, n_neighbors=4, min_dist=2, spread=2).fit_transform(df_embeddings.values)
    color_coding = df_words['dep_class'].values if color_scale else df_words['dep_diagnosis'].values
    
    create_plot_3d(embedding_dep_3D[:, 0], embedding_dep_3D[:, 1], embedding_dep_3D[:, 2], color_coding, df_words_plot.values)

In [675]:
plot_words = ['suicide', 'hate', 'happy']

plot_word_2d(plot_words, color_scale=False) #color_scale changes binary or scale color
plot_word_3d(plot_words, color_scale=False)

# Ridge multiple regression

In [202]:
X = df_embeddings.values
y = df_words['dep_class'].values

In [212]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)
alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,0.5, 1]

In [214]:
for a in alphas:
    ridge_model = Ridge(alpha=a, normalize=True).fit(X_train,y_train) 
    score = ridge_model.score(X_train, y_train)
    pred_y = ridge_model.predict(X_test)
    print(pred_y)
    mse = mean_squared_error(y_test, pred_y) 
    print("Alpha:{0:.6f}, R2:{1:.3f}, MSE:{2:.2f}, RMSE:{3:.2f}".format(a, score, mse, np.sqrt(mse)))

[ 5.89255375e+08 -6.38201245e+06  6.46218600e+07  7.03663366e+08
 -2.97372729e+07 -7.86331451e+08 -6.54341012e+07 -1.83392265e+08
  9.41757067e+08 -4.61561609e+08 -5.49772797e+08  7.07902336e+08
 -1.80386061e+08  8.35758956e+08 -3.61536266e+08  1.73775193e+08
 -3.31970522e+08  4.77654720e+08 -2.06651002e+08 -2.24246596e+08
 -4.66900946e+07 -5.79510510e+08 -4.45734879e+08 -4.53348053e+08
  3.90051710e+08  5.39714809e+07  8.48691426e+07  1.94844222e+07
 -4.72098312e+08  4.84596166e+08 -1.18265521e+08 -1.99725584e+08
  4.13996342e+08  8.52735265e+07 -4.61693251e+08 -4.62066486e+07
  6.30785307e+08 -3.67869692e+08 -2.71665342e+08 -2.84390660e+08
  9.60888319e+08 -2.46389168e+08  4.25178573e+08 -5.62463439e+08
 -1.61885049e+08  7.09200121e+08 -8.75199915e+08  1.12679924e+09
  3.10935516e+08 -4.10571931e+07  2.80717185e+08  1.25513730e+07
 -5.24161037e+08 -6.34385331e+08 -4.84856342e+08 -5.91422942e+08
  1.94041748e+08 -8.66520690e+08  1.23987300e+09  1.40504493e+08
 -3.26223429e+08  1.00995

[-2.59949995e+02  1.29113237e+01 -1.12961426e+02 -4.14001466e+02
  1.55690978e+02  3.95764580e+02  7.51325345e+01  1.14148726e+01
 -5.78307675e+02  2.22209310e+02  3.71377537e+02 -3.18895293e+02
  1.32109764e+02 -3.78068846e+02  2.68693032e+02  3.12279594e+01
  2.33655132e+02 -2.34062458e+02  1.02619564e+02  1.53664617e+02
  1.66167648e+02  2.88340374e+02  2.57971289e+02  2.78028595e+02
 -2.95903368e+02  6.50638574e+01 -1.01243371e+02 -8.80837257e+01
  2.14009991e+02 -1.43047721e+02  8.16575206e+01  1.84271566e+01
 -1.93036090e+02  6.77037063e+01  2.13971016e+02  4.38224401e+01
 -2.29435741e+02  1.86777898e+02 -7.23062162e+01  2.18991599e+02
 -4.42472861e+02  2.10359475e+02 -3.22939441e+02  6.06658923e+01
  2.99792877e+00 -4.91737856e+02  5.64390197e+02 -7.33651335e+02
 -1.83768823e+02 -2.84749793e+02 -2.27612460e+02 -2.39917608e+01
  1.44172503e+02  3.12263816e+02  1.89043109e+02  3.17908369e+02
 -8.61458387e+01  4.34208287e+02 -6.38730109e+02 -3.15505841e+02
  4.29812618e+02 -4.33703

100000