#### Importing and cleaning dataset

In [14]:
import pandas as pd
import keras
import numpy as np
import os

#Get relative directory to find path
path = os.path.join(os.getcwd(), 'data', 'response_format_cleaned_ds1.csv')
responses = pd.read_csv(path, sep=';', header=0)
responses.drop(responses.columns[[0]], axis=1, inplace=True)
responses.head()


Unnamed: 0,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,sequence1,seqOne,Dep5words[Word1],...,wor_all_selected,wor_all_selected1,minidep_scale,minidep_diagnose,depression_episodes,miniGAD_scale,miniGAD_symptoms_scale,miniGAD_diagnose,minidiagnose_category,minidiagnose_category_number
0,434.0,2020-08-07 11:46:22,15.0,en,659364400.0,2020-08-07 11:38:22,2020-08-07 11:46:22,2.0,1.0,motivated,...,NA NA happy NA NA NA NA NA NA NA NA NA NA care...,happy carefree satisfied ...,0.0,0,0,0,0,0,NoDi,0
1,184.0,2020-08-07 11:58:36,15.0,en,280389200.0,2020-08-07 11:34:31,2020-08-07 11:58:36,2.0,1.0,connected,...,anxious NA NA NA NA NA NA NA NA NA NA tense NA...,anxious tense fearful sad fe...,3.0,0,0,8,5,0,NoDi,0
2,330.0,2020-08-07 11:51:54,15.0,en,67706860.0,2020-08-07 11:36:32,2020-08-07 11:51:54,1.0,1.0,Yes,...,anxious NA NA NA NA NA worried NA NA NA NA NA ...,anxious worried scared sad mon...,7.0,0,5,9,5,0,NoDi,0
3,630.0,2020-08-07 13:22:42,15.0,en,1176643000.0,2020-08-07 12:55:26,2020-08-07 13:22:42,3.0,1.0,minor,...,anxious NA NA NA NA concerned NA NA NA NA NA t...,anxious concerned tense scared ...,3.0,0,5,8,5,0,NoDi,0
4,400.0,2020-08-07 12:04:52,15.0,en,1012492000.0,2020-08-07 11:37:19,2020-08-07 12:04:52,1.0,1.0,family,...,NA NA NA NA NA concerned NA NA NA NA NA tense ...,concerned tense sad tired ...,4.0,0,2,7,4,1,GAD,2


In [15]:
"""
Only for understanding data and visualize a response example.
Prints column name and response of patient at row 0.
"""
for res, col in zip(responses.iloc[0], responses.columns):
    print("{} -> {}".format(col, res))

id -> 434.0
submitdate -> 2020-08-07 11:46:22
lastpage -> 15.0
startlanguage -> en
seed -> 659364415.0
startdate -> 2020-08-07 11:38:22
datestamp -> 2020-08-07 11:46:22
sequence1 -> 2.0
seqOne -> 1.0
Dep5words[Word1] -> motivated
Dep5words[Word2] -> learning
Dep5words[Word3] -> passionate
Dep5words[Word4] -> enthusiastic
Dep5words[Word5] -> happy
Wor5words[SQ01] -> motivated
Wor5words[SQ02] -> enthusiastic
Wor5words[SQ03] -> learning
Wor5words[SQ04] -> exercise
Wor5words[SQ05] -> nutrition
sequence2 -> 3.0
seq2 -> 2.0
Dep5phraseorwords[SQ01] -> happy
Dep5phraseorwords[SQ02] -> eager to learn
Dep5phraseorwords[SQ03] -> restful sleep
Dep5phraseorwords[SQ04] -> motivated
Dep5phraseorwords[SQ05] -> joyful
Wor5phraseorwords[SQ01] -> content
Wor5phraseorwords[SQ02] -> improving
Wor5phraseorwords[SQ03] -> learning
Wor5phraseorwords[SQ04] -> motivated
Wor5phraseorwords[SQ05] -> better than before
sequence3 -> 1.0
seq3 -> 3.0
Deptext -> Over the past two weeks, I have not been depressed.  I hav

#### Importing and cleaning semantic space 

In [16]:
"""
Using 5-gram contexts from the database, a co-occurrence (word by word) matrix was set up, 
where the rows contained the 120,000 most common words in the n-gram database and the columns 
consisted of the 10,000 most common words in the n-gram database.

The variable 'space' is a matrix of the semantic space with dimentions reduced to 512.
"""
path = os.path.join(os.getcwd(), 'data', 'spaceEnglish1.csv')
space = pd.read_csv(path, encoding= 'unicode_escape')
space.set_index('words', inplace=True)
space.drop(space.columns[[0]], axis=1, inplace=True)
space.dropna(inplace=True)
space = space[~space.index.duplicated(keep='first')]

In [17]:
space.head()

Unnamed: 0_level_0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X503,X504,X505,X506,X507,X508,X509,X510,X511,X512
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
was,-0.234071,-0.278211,-0.100658,-0.26957,-0.115498,-3.8e-05,-0.036835,0.024037,-0.003974,0.006582,...,-0.011414,0.018075,-0.020312,0.001287,0.024483,0.012867,0.021265,0.016368,0.024858,-0.020382
not,-0.28323,-0.338776,-0.141085,-0.243715,-0.236692,-0.033354,-0.099906,0.053253,-0.025582,-0.040372,...,-0.028472,0.048824,-0.025452,0.007828,0.027658,-0.022135,0.023037,0.005371,-0.001482,-0.024063
by,-0.251058,-0.327183,-0.203889,-0.283337,-0.124522,-0.006537,0.015371,0.131667,-0.130597,0.055605,...,-0.00155,0.027915,-0.012646,-0.005019,0.075544,0.014663,0.013489,-0.022636,0.010127,-0.027951
that,-0.281888,-0.346746,-0.171006,-0.266698,-0.208917,-0.019832,-0.035404,0.044301,-0.076601,0.021328,...,0.019319,0.042742,0.001747,0.019198,0.022598,0.034352,0.033282,-0.006843,0.027052,-0.013623
of,-0.25653,-0.335434,-0.229791,-0.25607,-0.12002,0.01708,0.078004,0.112134,-0.073805,0.098183,...,0.012012,0.00547,-0.022383,-0.042172,-0.00343,-0.013435,0.003697,-0.022769,0.024873,0.010061


#### Methods for cleaning and aggregating semantic responses

In [18]:
"""
Cleans the string from punctuations and removes all words which are not represented in the semantic space. 
"""

import re
import math

words_in_space = set(space.index.values)

def clean_text(text):
    if isinstance(text, str):
        try:
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            text = list(set(text.split()))
            cleaned_words = [w for w in text if w in words_in_space] # TODO: Hantera ord som inte finns i spacet. Nu ignoreras dem.
            return cleaned_words
        except Exception as e: 
            print(e)
    elif math.isnan(text):
        return []
    

In [19]:
"""
Controlling for artifacts relating to frequently occurring words.

1) Calculate, from Google N-gram, a frequency weighted average of all semantic representations in the space.
   (So that the weighting is proportional to how frequently the words occur in Google N-gram.)
2) Subtract this mean prior to aggregating each word, and then add to the final value.
"""

space_mean = pd.Series.to_numpy(space.mean())

def aggregating_words(responses):
    res_arr = np.zeros(512)
    
    for word in responses:
        word_arr = pd.Series.to_numpy(space.loc[word])
        res_arr = res_arr + (word_arr - space_mean)
    
    res_arr += space_mean    
    res_arr = res_arr / res_arr.sum() # Normalizing aggregated vector
    return res_arr

In [20]:
def aggregate_cell(text):
    words_in_cell = pd.Series.apply(text, clean_text)
    cell_vectors = pd.Series.apply(words_in_cell, aggregating_words)
    return cell_vectors

#### Extracting semantic responses and representing the answer to each question in the semantic space. 

In [21]:
dep_columns = ['Deptext', 'dep_all_phraces', 'dep_all_words', 'dep_all_selected1']
df_dep_responses = responses[dep_columns]

df_dep_aggregated = df_dep_responses.apply(aggregate_cell, axis=1)

In [22]:
wor_columns = ['Wortext', 'wor_all_phraces', 'wor_all_words', 'wor_all_selected1']
df_wor_responses = responses[wor_columns]

df_wor_aggregated = df_wor_responses.apply(aggregate_cell, axis=1)

In [23]:
df_dep_aggregated.iloc[:10]

Unnamed: 0,Deptext,dep_all_phraces,dep_all_words,dep_all_selected1
0,"[0.14620488346512744, 0.20439917711775296, 0.0...","[-0.7590918166513154, -0.7996448245182239, 0.6...","[-0.12735496139054883, -0.08134255488745973, 0...","[-0.10631367041145826, -0.07611765664843519, 0..."
1,"[0.16873259944807936, 0.22817746128469954, -0....","[0.3150959567419946, 0.42077095201920844, 0.01...","[1.4817085495650797, 1.6852794755670606, -1.02...","[-0.44645774520185744, -0.3896723426287449, 0...."
2,"[0.15851603167923162, 0.217751607989601, 0.031...","[0.09205027035822107, 0.12041302636339309, 0.0...","[0.12709153308204268, 0.14295118020024072, -0....","[-0.07663349903279247, -0.0006333442681840848,..."
3,"[0.15762474074250946, 0.2167201417841256, 0.02...","[0.2704033968315661, 0.35429957613545476, -0.0...","[0.025633109762595925, 0.008765453890638987, -...","[-0.07360932328901602, -0.0390370122596679, 0...."
4,"[0.19196984693977662, 0.26635971533780406, 0.0...","[0.39916260826011746, 0.4636167724424405, -0.1...","[0.10400074631268061, 0.11652565658484358, -0....","[53.497985284343585, 36.203814272227355, -104...."
5,"[0.16020033435596975, 0.2247685690990484, 0.03...","[0.4163829239997853, 0.5368902458559512, -0.04...","[-0.026319931002123518, -0.008734687972281292,...","[-0.09433512754889448, -0.049385819397829094, ..."
6,"[0.24168212896922367, 0.33902324777852244, 0.0...","[0.13731923979741936, 0.1772812205588881, 0.03...","[-0.2157567390391844, -0.17254878886225866, 0....","[-0.07639443687669704, -0.06505044300142826, 0..."
7,"[0.1974876347513036, 0.269650876661491, 0.0217...","[0.13360152439614426, 0.1815627422093893, 0.02...","[-0.19561638585825614, -0.14930168873878913, 0...","[0.28285568876115935, 0.25211172678271115, -0...."
8,"[0.20163162925816505, 0.2766626351045351, 0.00...","[-0.45676491421319454, -0.1752281301733496, 0....","[-0.1390705297657966, -0.08380077038092038, 0....","[-2.0148557305068096, -1.7268435631282695, 3.1..."
9,"[0.16036779308098767, 0.22456358894876235, 0.0...","[0.23138802445768988, 0.3189740213924604, 0.02...","[-0.37102690646248687, -0.38723656128234035, 0...","[-0.2317635994266962, -0.2015510492531911, 0.2..."


In [24]:
df_wor_aggregated.iloc[:10]

Unnamed: 0,Wortext,wor_all_phraces,wor_all_words,wor_all_selected1
0,"[0.14898981572633896, 0.20678471111840982, 0.0...","[0.16848163704578217, 0.21104085954673169, 0.0...","[0.08512889256007851, 0.06930940294720918, -0....","[-0.03628907508059879, -0.028415450913131562, ..."
1,"[0.16658035037273553, 0.22419803669426439, -0....","[2.926113550483023, 3.7758155740939063, -0.291...","[-0.5054823889786438, -0.17190850524640067, 0....","[-0.04345398189988287, -0.020184341988825117, ..."
2,"[0.14912997482144708, 0.20770588773293341, 0.0...","[0.1066276252125093, 0.12498854698322402, -0.0...","[0.07113400315024147, 0.0740523236751316, -0.0...","[-0.2445014187941481, -0.12076884155682258, 0...."
3,"[0.14166991480328242, 0.19702991519885332, 0.0...","[-0.20939610364178599, -0.022625746545145924, ...","[-0.06529720047146925, 0.046267332309029886, 0...","[-0.1487217599810065, -0.09152232772736152, 0...."
4,"[0.1867096354427328, 0.25676537028535223, 0.02...","[0.2684734403529659, 0.3316357580608248, 0.000...","[-0.08879341781163712, -0.0742497412666487, 0....","[-0.06327165709289424, -0.041942097040231, 0.1..."
5,"[0.2411979563481076, 0.3323859887436077, 0.034...","[0.46890125346292655, 0.587621833904448, -0.04...","[-0.30424776300405115, -0.0494967160663658, 0....","[-0.041991902728896614, -0.024059956123692577,..."
6,"[0.2436212877690359, 0.336328680492232, 0.0172...","[0.14751288417729555, 0.19693854765932112, 0.0...","[0.11098324236286283, 0.03640903903022074, -0....","[0.3334432322058593, 0.2695373227264273, -0.26..."
7,"[0.1685784496444126, 0.23540663423371672, 0.03...","[0.15824374599203708, 0.20497218134088446, -0....","[-0.26178040246623874, 0.007203681762336984, 0...","[0.0637826975260297, 0.008016117756634647, -0...."
8,"[0.1852202116778388, 0.2544669083751666, 0.008...","[0.1163590562232893, 0.01858513322123727, -0.3...","[0.06984923510555666, 0.0065180639121360994, -...","[0.08754686825446148, 0.06462605258728954, -0...."
9,"[0.2046066832838274, 0.28211646776321414, 0.02...","[0.1755492642411605, 0.2157095366202667, -0.03...","[-0.10305678157119322, -0.046450255936402285, ...","[-0.3179574394075049, -0.17534088139707735, 0...."


In [29]:
def concat_vectors(row):
    return np.concatenate(row.values).ravel()

df_dep_concat = pd.DataFrame(list(df_dep_aggregated.apply(concat_vectors, axis=1)))
df_wor_concat = pd.DataFrame(list(df_wor_aggregated.apply(concat_vectors, axis=1)))

### Concatinating the semantic representations for each question into one multidimensional response vector
Each participant's semantic responses are now represented by two 2048 dimension vectors. 
One for depression-words and one for anxiety-words

In [26]:
df_wor_concat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.14899,0.206785,0.023243,0.112792,0.074571,-0.008851,0.013691,-0.043377,0.014825,-0.008417,...,0.00946,0.002733,-0.035797,0.019894,0.002703,0.011657,0.029119,0.024678,0.048164,-0.010254
1,0.16658,0.224198,-0.004458,0.113733,0.087324,-0.004111,0.010961,-0.010853,0.018607,-0.00632,...,0.040995,0.030989,0.062659,0.02489,0.012699,0.026413,-0.019568,-0.009488,-0.010183,0.028609
2,0.14913,0.207706,0.032671,0.094477,0.078758,-0.005968,0.03049,-0.038519,0.021922,-0.024786,...,0.189723,0.225984,-0.004859,0.033828,-0.060299,0.032765,0.060161,0.088519,-0.097915,0.075118
3,0.14167,0.19703,0.028099,0.093292,0.078866,-0.010142,0.025955,-0.040232,0.01691,-0.006348,...,0.054622,0.051137,0.07948,-0.059947,-0.065747,0.134321,-0.01707,0.023734,0.068587,0.040797
4,0.18671,0.256765,0.023149,0.118886,0.098646,-0.017863,0.022382,-0.038842,0.013801,-0.023079,...,-0.028358,0.058178,0.012247,-0.027454,-0.03545,0.003896,-0.008417,0.008029,0.002994,0.041565


#### Performing principal component analysis to reduce the number of dimensions for the concatinated semantic vectors

In [30]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardizing the features
X_dep = df_dep_concat.values
X_wor = df_wor_concat.values

X_dep = StandardScaler().fit_transform(X_dep)
X_wor = StandardScaler().fit_transform(X_wor)

pca = PCA(n_components=512)
pca_vector_dep = pca.fit_transform(X_dep)
pca_vector_wor = pca.fit_transform(X_wor)

response_space_dep = pd.DataFrame(data = pca_vector_dep)
response_space_wor = pd.DataFrame(data = pca_vector_wor)

In [31]:
response_space_dep.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-1.282794,-0.524196,-1.027801,-0.641494,-0.910719,0.250604,-0.360248,-1.8561,0.603526,0.031079,...,0.14296,-0.590871,0.140121,0.038419,0.187508,-0.295742,-0.040504,-0.105643,0.151323,-0.328827
1,-1.490762,-0.513754,-0.589719,-0.764626,-0.829384,-0.013092,-0.17867,-0.772768,0.515342,-0.665212,...,0.058596,-0.000172,0.41734,-0.474495,0.288218,-0.057047,0.12304,-0.499483,0.233025,0.293262
2,-1.40772,-0.359152,-0.565727,-0.510021,-0.573567,-0.330673,-0.049134,-1.812871,-0.152629,-0.314469,...,-0.222353,-0.292018,-0.113159,0.249132,0.044381,0.059403,0.215932,-0.558951,-0.03122,0.012624
3,-1.408584,-0.495786,-0.432052,-0.398641,-0.200414,0.965685,-0.364464,-2.239018,0.119474,-0.321005,...,-0.097304,-0.038147,0.12473,-0.110798,0.041609,0.099065,0.032272,0.095782,0.003446,0.13924
4,198.330622,-17.27884,3.14381,8.079438,24.829126,-3.035996,1.942622,1.719282,0.200212,-0.086043,...,-0.035102,-0.123306,0.118074,-0.133421,0.202534,0.33223,-0.041061,-0.123947,0.016772,-0.120972


### Important variables 

Deptext = Depression text-response <br> 
Wortext = Worry text-response

dep_all_phraces = Depression all phraces responses <br> 
wor_all_phraces = Worry all phraces responses

dep_all_word = Depression all descriptive word responses <br>
wor_all_words = Worry all descriptive word responses

dep_all_selected1 = All selected depression word responses <br>
wor_all_selected1 = All selected worry word responses

<hr style="border:1px solid gray"> </hr>

CESDtot = Center for Epidemiological Studies Depression (CESD) <br>
PHQtot = PHQ-9 = Patient Helath Questionnaire = a depression scale

GADtot = GAD-7 = Generalized anxiety disorder scale <br>
PSWQtot = Penn State Worry Questionniare 

<hr style="border:1px solid gray"> </hr>

miniGAD_diagnose = Self-reported MINI (structured interview) GAD diangose <br>
minidep_diagnose = Self-reported MINI (structured interview) MDD (depression) diangose

# Using the Semantic Representations in Analyses

### The dependent variable for worry and depression

In [None]:
"""
Semantic-numeric correlations. 
Analyzing the relationship between semantic responses and a numerical variable
"""

#The numerical rating scales for depression and worry
dep_scale = responses[['PHQtot']]
wor_scale = responses[['GADtot']]

#Checking for NaN values
dep_scale.isnull().values.any() #true
wor_scale.isnull().values.any() #true

def replace_nan(y_array):
    #Replaceing NaN values with mean value of column - perhaps we should do this differently
    col_mean = np.nanmean(y_array, axis=0)
    col_mean = np.around(col_mean, decimals=0, out=None) #rounding 
    #Find indices that you need to replace
    inds = np.where(np.isnan(y_array))
    #Place column means in the indices. Align the array using take
    y_array[inds] = np.take(col_mean, inds[1])
    
    return y_array

#Create y - variables
y_wor = wor_scale.values
y_dep = dep_scale.values

#Replace NaN Values for the numerical scales
y_wor = replace_nan(y_wor).flatten()
y_dep = replace_nan(y_dep).flatten()

Import packages needed for analysis

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import median_absolute_error, mean_absolute_error, mean_squared_error


# Analysis using plain linear regression 

### Depression

In [None]:
"""Predicting the corresponding numeric rating scales on the basis of these representations by means 
of multiple linear regression analyses """

#x_dep = df_dep_concat #To use full concatinated vectors with 2048 dimensions (before PCA)
x_dep = response_space_dep #To use vectors with reduced dimensions (after PCA)

#Splitting data into training and testing dataset for depression data
X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(x_dep, y_dep, test_size=0.2, random_state=0)

regr_dep = LinearRegression().fit(X_train_dep, y_train_dep)

# Predicting the test set results
y_pred_dep = regr_dep.predict(X_test_dep) 


In [None]:
#TODO -  add table so we can compare the scores 

In [None]:
print("R2 score for training data:  ", regr_dep.score(X_train_dep, y_train_dep)) #R2 score for training data is 0.8
print("R2 score for testing data:  ", regr_dep.score(X_test_dep, y_test_dep)) #R2 score for test data is -223.13 

print("Root Mean Squared Error (RMSE):  ", np.sqrt(metrics.mean_squared_error(y_test_dep, y_pred_dep)))
print("Intercept:  ", regr_dep.intercept_)
#print(regr_dep.coef_)

# ...... Probably overfitted (judging by the R2 values) !!

### Worry

In [None]:
#Multiple linear regression analysis for anxiety data 

#x_wor = df_wor_concat #To use full concatinated vectors with 2048 dimensions (before PCA)
x_wor = response_space_wor #To use vectors with reduced dimensions (after PCA)

#Splitting data into training and testing dataset for depression data
X_train_wor, X_test_wor, y_train_wor, y_test_wor = train_test_split(x_wor, y_wor, test_size=0.2, random_state=0)

regr_wor = LinearRegression()
regr_wor.fit(X_train_wor, y_train_wor)

# Predicting the test set results
y_pred_wor = regr_wor.predict(X_test_wor) 

In [None]:
print("R2 score for worry training data:  ", regr_wor.score(X_train_wor, y_train_wor)) #R2 score for training data is 0.67
print("R2 score for worry testing data:  ", regr_wor.score(X_test_wor, y_test_wor)) #R2 score for test data is -10
print("Root Mean Squared Error (RMSE):  ", np.sqrt(mean_squared_error(y_test_wor, y_pred_wor)))
print("Intercept:  ", regr_wor.intercept_)

print("Median absolute error for test data: ", median_absolute_error(y_pred_wor, y_test_wor))
print("Median absolute error for training data: ", median_absolute_error(regr_wor.predict(X_train_wor), y_train_wor))

#print(regr.coef_)

# ...... Probably overfitted (judging by the R2 values)

# Examining optimal number of dimensions
### Using the method described in the article combined with plain linear regression

In [None]:
"""In practice, this was simply achieved by adding 1, 
then multiplying by 1.3 and finally rounding to the nearest integer 
(e.g., 1, 3, 5, 8, where the next number of dimen- sions to be tested are the first 12; 
in other words ([8 􏰃 1] 􏰍 1.3).
In previous research, we have found this sequence to be valid and computationally efficient"""


In [None]:
#initializing values
n, min_err = 1, np.inf

# df_wor_concat => To use full concatinated vectors with 2048 dimensions (before PCA)
# response_space_wor => To use vectors with reduced dimensions (after PCA)

new_x=df_wor_concat

while n < len(df_wor_concat.columns):
    
    #split data into train and test data
    X_train_wor, X_test_wor, y_train_wor, y_test_wor = train_test_split(new_x, y_wor, test_size=0.2, random_state=0)
    y_train_wor = y_train_wor.flatten() 
    y_test_wor = y_test_wor.flatten()
    
    regr_wor = LinearRegression()
    
    #fit data
    regr_wor.fit(X_train_wor, y_train_wor)
    
    #make prediction 
    y_pred_wor = regr_wor.predict(X_test_wor) 
    
    #test prediction with mean squared error 
    err = mean_squared_error(y_test_wor, y_pred_wor)
    
    #compare err with current min err to then later choose the nbr of dimensions that give minimum mean squared error
    if err < min_err:
        min_err = err
        dimensions=n
        r2 = regr_wor.score(X_test_wor, y_test_wor)
    
    n=round((n+1)*1.3)
    new_x=df_wor_concat.iloc[:,:n]  

print('Nbr of dimensions to use : ', dimensions) #According to this we should use 56 dimensions in the next steps?
print(min_err)
print(r2)

In [None]:
"Using the depression data to test how many dimensions to keep"

y_dim = y_dep
dimension, min_error = 0, np.inf
i = 1

# df_wor_concat => To use full concatinated vectors with 2048 dimensions (before PCA)
# response_space_wor => To use vectors with reduced dimensions (after PCA)

while i < len(df_dep_concat.columns):
    x_dim = df_dep_concat.iloc[:,:i]
    X_train_dim, X_test_dim, y_train_dim, y_test_dim = train_test_split(x_dim, y_dim, test_size=0.2, random_state=0)

    regr_dim = LinearRegression()
    
    regr_dim.fit(X_train_dim, y_train_dim)
    y_pred_dim = regr_dim.predict(X_test_dim) 
    
    mse = mean_squared_error(y_test_dim, y_pred_dim)
    
    if mse < min_error: 
        min_error = mse
        dimension = i
        r2 = regr_dim.score(X_test_dim, y_test_dim)
    
    #print(i, mse)
    i = round((i + 1)*1.3)
    
print('\nNbr of dimensions to use: ', dimension)
print(min_error, r2)



# Results examining optimal number of dimensions

#### Depression

Technique | Original dimension | R2 score | MSE | Nbr dimensions
----- | --- | --- | --- | --- 
**(i+1)*1.3** from article w/ **linReg** | PCA red vec: 512 | 0.040 | 51.80 | 17
**(i+1)*1.3** from article w/ **linReg** | Full concat: 2048	 | 0.136 | 46.62 | 56
**(i+1)*1.3** from article w/ **leave 10% CV** | Full concat: 2048 | -0.025 | 6.828 | 74



#### Anxiety

Technique | Original dimension | R2 score | MSE | Nbr dimensions
----- | --- | --- | --- | --- 
**(i+1)*1.3** from article w/ **linReg** | PCA red vec: 512 | 0.131 | 31.42 | 56
**(i+1)*1.3** from article w/ **linReg** | Full concat: 2048 | 0.131 | 31.42 | 98
**(i+1)*1.3** from article w/ **leave 10% CV** | Full concat: 2048 | -0.112 | 7.169 | 56


# Cross validation techniques

## Examining optimal number of dimensions using method described in article combined with leave 10% cross validation

### Depression

In [None]:
#Leave 10% out cross validation while testing the number of dimensions that will give the best over all score. 

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

nbr_folds = round(len(responses)*0.1)
cv = KFold(n_splits=nbr_folds, random_state=1, shuffle=True)

reg = LinearRegression()
dimension, min_error = 0, -np.inf
i = 1

#df_dep_concat      => To use full concatinated vectors with 2048 dimensions (before PCA)
#response_space_dep => To use vectors with reduced dimensions (after PCA)

while i < len(df_dep_concat.columns):
    x_dep_cv = df_dep_concat.iloc[:,:i]
    
    #Options for scoring: 'r2', 'neg_root_mean_squared_error' 
    scores = cross_val_score(reg, x_dep_cv, y_dep, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
    
    if scores.mean() > min_error:
        min_error = scores.mean()
        dimension = i
        
    i = round((i + 1)*1.3)
    
print("Best score: {} with dimension: {}".format(min_error, dimension)) #Using dep data => 74 dimensions

### Anxiety

In [None]:
#Leave 10% out cross validation while testing the number of dimensions that will give the best over all score. 

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

nbr_folds = round(len(responses)*0.1)
cv = KFold(n_splits=nbr_folds, random_state=1, shuffle=True)

reg = LinearRegression()
dimension, min_error = 0, -np.inf
i = 1

#df_wor_concat      => To use full concatinated vectors with 2048 dimensions (before PCA)
#response_space_wor => To use vectors with reduced dimensions (after PCA)

while i < len(df_wor_concat.columns):
    x_wor_cv = df_wor_concat.iloc[:,:i]
    
    #Options for scoring: 'r2', 'neg_root_mean_squared_error' 
    scores = cross_val_score(reg, x_wor_cv, y_dep, scoring='r2', cv=cv, n_jobs=-1)
    
    if scores.mean() > min_error:
        min_error = scores.mean()
        dimension = i
        
    i = round((i + 1)*1.3)
    
print("Best score: {} with dimension: {}".format(min_error, dimension)) #Using wor data => 56 dimensions

#### Leave 10% out cross validation on PCA reduced vectors

In [None]:
#Leave 10% out cross validation using PCA reduced vectors

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

nbr_folds = round(len(responses)*0.1)
cv = KFold(n_splits=nbr_folds, random_state=1, shuffle=True)

reg = LinearRegression()

#df_dep_concat      => To use full concatinated vectors with 2048 dimensions (before PCA)
x_dep_cv = response_space_dep #=> To use vectors with reduced dimensions (after PCA)

#Options for scoring: 'r2', 'neg_root_mean_squared_error' 
scores = cross_val_score(reg, x_dep_cv, y_dep, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
print(scores.mean())

#### Leave one out cross validation

In [None]:
#LOOCV 
#new=x_wor.copy()
#new.columns = new.columns.str.replace('X', '')
from sklearn.model_selection import LeaveOneOut
cv = LeaveOneOut()

regr = LinearRegression()
scores = cross_val_score(regr, X_wor, y_wor, scoring='neg_mean_absolute_error', cv=cv) #, n_jobs=-1)


In [None]:
scores=np.absolute(scores)
print(' %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [None]:
#LeavePOut(p=1)
from sklearn.model_selection import LeavePOut

In [None]:
#StratifiedKFold.

In [None]:
#ShuffleSplit 