In [2]:
import pandas as pd
import keras
import numpy as np

responses = pd.read_csv('../data/response_format_cleaned_ds1.csv', sep=';', header=0)
responses.drop(responses.columns[[0]], axis=1, inplace=True)
responses.head()

Unnamed: 0,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,sequence1,seqOne,Dep5words[Word1],...,wor_all_selected,wor_all_selected1,minidep_scale,minidep_diagnose,depression_episodes,miniGAD_scale,miniGAD_symptoms_scale,miniGAD_diagnose,minidiagnose_category,minidiagnose_category_number
0,434.0,2020-08-07 11:46:22,15.0,en,659364400.0,2020-08-07 11:38:22,2020-08-07 11:46:22,2.0,1.0,motivated,...,NA NA happy NA NA NA NA NA NA NA NA NA NA care...,happy carefree satisfied ...,0.0,0,0,0,0,0,NoDi,0
1,184.0,2020-08-07 11:58:36,15.0,en,280389200.0,2020-08-07 11:34:31,2020-08-07 11:58:36,2.0,1.0,connected,...,anxious NA NA NA NA NA NA NA NA NA NA tense NA...,anxious tense fearful sad fe...,3.0,0,0,8,5,0,NoDi,0
2,330.0,2020-08-07 11:51:54,15.0,en,67706860.0,2020-08-07 11:36:32,2020-08-07 11:51:54,1.0,1.0,Yes,...,anxious NA NA NA NA NA worried NA NA NA NA NA ...,anxious worried scared sad mon...,7.0,0,5,9,5,0,NoDi,0
3,630.0,2020-08-07 13:22:42,15.0,en,1176643000.0,2020-08-07 12:55:26,2020-08-07 13:22:42,3.0,1.0,minor,...,anxious NA NA NA NA concerned NA NA NA NA NA t...,anxious concerned tense scared ...,3.0,0,5,8,5,0,NoDi,0
4,400.0,2020-08-07 12:04:52,15.0,en,1012492000.0,2020-08-07 11:37:19,2020-08-07 12:04:52,1.0,1.0,family,...,NA NA NA NA NA concerned NA NA NA NA NA tense ...,concerned tense sad tired ...,4.0,0,2,7,4,1,GAD,2


In [None]:
"""
Only for understanding data and visualize a response example.
Prints column name and response of patient at row 0.
"""
for res, col in zip(responses.iloc[0], responses.columns):
    print("{} -> {}".format(col, res))

In [103]:
"""
Using 5-gram contexts from the database, a co-occurrence (word by word) matrix was set up, 
where the rows contained the 120,000 most common words in the n-gram database and the columns 
consisted of the 10,000 most common words in the n-gram database.

The variable 'space' is a matrix of the semantic space with dimentions reduced to 512.
"""
space = pd.read_csv('../data/spaceEnglish1.csv', encoding= 'unicode_escape')
space.set_index('words', inplace=True)
space.drop(space.columns[[0]], axis=1, inplace=True)
space.dropna(inplace=True)
space = space[~space.index.duplicated(keep='first')]

In [105]:
space.head()

Unnamed: 0_level_0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X503,X504,X505,X506,X507,X508,X509,X510,X511,X512
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
was,-0.234071,-0.278211,-0.100658,-0.26957,-0.115498,-3.8e-05,-0.036835,0.024037,-0.003974,0.006582,...,-0.011414,0.018075,-0.020312,0.001287,0.024483,0.012867,0.021265,0.016368,0.024858,-0.020382
not,-0.28323,-0.338776,-0.141085,-0.243715,-0.236692,-0.033354,-0.099906,0.053253,-0.025582,-0.040372,...,-0.028472,0.048824,-0.025452,0.007828,0.027658,-0.022135,0.023037,0.005371,-0.001482,-0.024063
by,-0.251058,-0.327183,-0.203889,-0.283337,-0.124522,-0.006537,0.015371,0.131667,-0.130597,0.055605,...,-0.00155,0.027915,-0.012646,-0.005019,0.075544,0.014663,0.013489,-0.022636,0.010127,-0.027951
that,-0.281888,-0.346746,-0.171006,-0.266698,-0.208917,-0.019832,-0.035404,0.044301,-0.076601,0.021328,...,0.019319,0.042742,0.001747,0.019198,0.022598,0.034352,0.033282,-0.006843,0.027052,-0.013623
of,-0.25653,-0.335434,-0.229791,-0.25607,-0.12002,0.01708,0.078004,0.112134,-0.073805,0.098183,...,0.012012,0.00547,-0.022383,-0.042172,-0.00343,-0.013435,0.003697,-0.022769,0.024873,0.010061


In [127]:
"""
Cleans the string from punctuations and removes all words which are not represented in the semantic space. 
"""

import re
import math

words_in_space = set(space.index.values)

def clean_text(text):
    if isinstance(text, str):
        try:
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            text = list(set(text.split()))
            cleaned_words = [w for w in text if w in words_in_space] # TODO: Hantera ord som inte finns i spacet. Nu ignoreras dem.
            return cleaned_words
        except Exception as e: 
            print(e)
    elif math.isnan(text):
        return []
    

In [122]:
"""
Controlling for artifacts relating to frequently occurring words.

1) Calculate, from Google N-gram, a frequency weighted average of all semantic representations in the space.
   (So that the weighting is proportional to how frequently the words occur in Google N-gram.)
2) Subtract this mean prior to aggregating each word, and then add to the final value.
"""

space_mean = pd.Series.to_numpy(space.mean())

def aggregating_words(responses):
    res_arr = np.zeros(512)
    
    for word in responses:
        word_arr = pd.Series.to_numpy(space.loc[word])
        res_arr = res_arr + (word_arr - space_mean)
    
    res_arr += space_mean    
    res_arr = res_arr / res_arr.sum() # Normalizing aggregated vector
    return res_arr

In [132]:
def aggregate_cell(text):
    words_in_cell = pd.Series.apply(text, clean_text)
    cell_vectors = pd.Series.apply(words_in_cell, aggregating_words)
    return cell_vectors

In [137]:
dep_columns = ['Deptext', 'dep_all_phraces', 'dep_all_words', 'dep_all_selected1']
df_dep_responses = responses[dep_columns]

df_dep_aggregated = df_dep_responses.apply(aggregate_cell, axis=1)

In [136]:
wor_columns = ['Wortext', 'wor_all_phraces', 'wor_all_words', 'wor_all_selected1']
df_wor_responses = responses[wor_columns]

df_wor_aggregated = df_wor_responses.apply(aggregate_cell, axis=1)

In [131]:
df_dep_aggregated.iloc[:10]

Unnamed: 0,Deptext,dep_all_phraces,dep_all_words,dep_all_selected1
0,"[0.14620488346512742, 0.20439917711775293, 0.0...","[-0.7590918166513144, -0.7996448245182226, 0.6...","[-0.12735496139054883, -0.08134255488745973, 0...","[-0.10631367041145828, -0.0761176566484352, 0...."
1,"[0.1687325994480793, 0.22817746128469954, -0.0...","[0.3150959567419946, 0.4207709520192084, 0.015...","[1.481708549565079, 1.68527947556706, -1.02609...","[-0.44645774520185727, -0.3896723426287448, 0...."
2,"[0.15851603167923164, 0.217751607989601, 0.031...","[0.09205027035822108, 0.12041302636339311, 0.0...","[0.12709153308204266, 0.14295118020024067, -0....","[-0.07663349903279251, -0.0006333442681840852,..."
3,"[0.15762474074250946, 0.21672014178412558, 0.0...","[0.2704033968315661, 0.35429957613545476, -0.0...","[0.025633109762595925, 0.008765453890638987, -...","[-0.07360932328901602, -0.03903701225966791, 0..."
4,"[0.1919698469397765, 0.26635971533780395, 0.01...","[0.39916260826011757, 0.4636167724424406, -0.1...","[0.10400074631268061, 0.11652565658484361, -0....","[53.49798528434978, 36.20381427223155, -104.12..."
5,"[0.16020033435596978, 0.22476856909904844, 0.0...","[0.4163829239997853, 0.5368902458559512, -0.04...","[-0.026319931002123525, -0.008734687972281292,...","[-0.09433512754889445, -0.04938581939782907, 0..."
6,"[0.24168212896922375, 0.33902324777852244, 0.0...","[0.13731923979741933, 0.17728122055888806, 0.0...","[-0.21575673903918435, -0.17254878886225866, 0...","[-0.07639443687669706, -0.06505044300142827, 0..."
7,"[0.19748763475130368, 0.2696508766614911, 0.02...","[0.13360152439614426, 0.1815627422093893, 0.02...","[-0.19561638585825628, -0.14930168873878924, 0...","[0.28285568876115946, 0.2521117267827112, -0.3..."
8,"[0.20163162925816497, 0.2766626351045351, 0.00...","[-0.4567649142131939, -0.17522813017334934, 0....","[-0.1390705297657966, -0.08380077038092038, 0....","[-2.0148557305068096, -1.7268435631282695, 3.1..."
9,"[0.16036779308098764, 0.22456358894876233, 0.0...","[0.23138802445768988, 0.31897402139246045, 0.0...","[-0.3710269064624872, -0.38723656128234063, 0....","[-0.23176359942669628, -0.20155104925319123, 0..."


In [138]:
df_wor_aggregated.iloc[:10]

Unnamed: 0,Wortext,wor_all_phraces,wor_all_words,wor_all_selected1
0,"[0.148989815726339, 0.20678471111840982, 0.023...","[0.16848163704578223, 0.2110408595467317, 0.00...","[0.08512889256007851, 0.06930940294720918, -0....","[-0.03628907508059879, -0.028415450913131562, ..."
1,"[0.16658035037273555, 0.22419803669426439, -0....","[2.926113550483038, 3.775815574093923, -0.2913...","[-0.5054823889786448, -0.1719085052464011, 0.2...","[-0.04345398189988287, -0.020184341988825117, ..."
2,"[0.14912997482144705, 0.2077058877329334, 0.03...","[0.10662762521250932, 0.12498854698322398, -0....","[0.07113400315024145, 0.07405232367513158, -0....","[-0.2445014187941481, -0.12076884155682255, 0...."
3,"[0.14166991480328237, 0.19702991519885324, 0.0...","[-0.20939610364178612, -0.02262574654514594, 0...","[-0.06529720047146928, 0.0462673323090299, 0.2...","[-0.1487217599810065, -0.09152232772736152, 0...."
4,"[0.18670963544273275, 0.25676537028535223, 0.0...","[0.26847344035296594, 0.33163575806082485, 0.0...","[-0.08879341781163708, -0.07424974126664868, 0...","[-0.06327165709289424, -0.04194209704023101, 0..."
5,"[0.2411979563481076, 0.3323859887436076, 0.034...","[0.46890125346292644, 0.587621833904448, -0.04...","[-0.30424776300405126, -0.04949671606636581, 0...","[-0.04199190272889661, -0.024059956123692574, ..."
6,"[0.24362128776903585, 0.3363286804922321, 0.01...","[0.14751288417729552, 0.19693854765932112, 0.0...","[0.11098324236286285, 0.036409039030220754, -0...","[0.3334432322058594, 0.26953732272642733, -0.2..."
7,"[0.16857844964441268, 0.2354066342337168, 0.03...","[0.15824374599203705, 0.20497218134088435, -0....","[-0.2617804024662385, 0.007203681762336978, 0....","[0.06378269752602976, 0.008016117756634652, -0..."
8,"[0.18522021167783884, 0.2544669083751666, 0.00...","[0.11635905622328928, 0.018585133221237265, -0...","[0.06984923510555667, 0.0065180639121361, -0.1...","[0.08754686825446154, 0.06462605258728957, -0...."
9,"[0.2046066832838274, 0.28211646776321403, 0.02...","[0.1755492642411605, 0.2157095366202667, -0.03...","[-0.10305678157119323, -0.04645025593640229, 0...","[-0.317957439407505, -0.17534088139707735, 0.5..."


In [38]:
#TODO: Ändra så att varje response cell aggregeras individuellt. Därefter concatineras alla 512 x 8 vectorer
#      dimensionerna på vectorn reduceras med en SVD eller PCA. Sen kan de två resulterande dep & wor vectorerna
#      läggas in i sina respective dataframes 

### Important variables 

Deptext = Depression text-response <br> 
Wortext = Worry text-response

dep_all_phraces = Depression all phraces responses <br> 
wor_all_phraces = Worry all phraces responses

dep_all_word = Depression all descriptive word responses <br>
wor_all_words = Worry all descriptive word responses

dep_all_selected1 = All selected depression word responses <br>
wor_all_selected1 = All selected worry word responses

<hr style="border:1px solid gray"> </hr>

CESDtot = Center for Epidemiological Studies Depression (CESD) <br>
PHQtot = PHQ-9 = Patient Helath Questionnaire = a depression scale

GADtot = GAD-7 = Generalized anxiety disorder scale <br>
PSWQtot = Penn State Worry Questionniare 

<hr style="border:1px solid gray"> </hr>

miniGAD_diagnose = Self-reported MINI (structured interview) GAD diangose <br>
minidep_diagnose = Self-reported MINI (structured interview) MDD (depression) diangose

# Using the Semantic Representations in Analyses

In [47]:

resp_dep_scale=responses[['minidep_scale']]
resp_GAD_scale=responses[['miniGAD_scale']]
#Checking for NaN values
resp_dep_scale.isnull().values.any() #true
resp_GAD_scale.isnull().values.any() #false

y_anxiety= resp_GAD_scale.values
y_dep=resp_dep_scale.values 

#Replaceing NaN values with mean value of column - perhaps we should do this differently
col_mean = np.nanmean(y_dep, axis=0)
col_mean=np.around(col_mean, decimals=0, out=None) #rounding 
#Find indices that you need to replace
inds = np.where(np.isnan(y_dep))
#Place column means in the indices. Align the array using take
y_dep[inds] = np.take(col_mean, inds[1])


pandas.core.frame.DataFrame

In [48]:
"""
Semantic-numeric correlations. 
Analyzing the relationship between semantic responses and a numerical variable
"""

"""Predicting the corresponding numeric rating scales on the basis of these representations by means 
of multiple linear regression analyses """

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

regr = LinearRegression()

x_dep = response_space_dep
x_anx = response_space_anx
#splitting data into training and testing dataset

X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(x_dep, y_dep, test_size=0.2,random_state=0)
regr.fit(X_train_dep, y_train_dep) 

# Predicting the test set results
y_pred_dep = regr.predict(X_test_dep)

ValueError: Found input variables with inconsistent numbers of samples: [975, 976]

In [None]:
"""When the predicted variable is categorical, multinomial logistic regression is used."""

