import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight
import re

In [None]:
# Enumerate GPUs in PCI bus order so that the index used below is stable.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose only the device with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# NOTE(review): presumably read by a Keras add-on package to select tf.keras - confirm.
os.environ['TF_KERAS'] = '1'

# only reserve 1 GPU

In [None]:
import tensorflow as tf
# tf.version

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
     Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute#, CuDNNLSTM
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [None]:
import tensorflow as tf
tf.test.is_gpu_available()

In [None]:
my_seed = 1234
# tf.set_random_seed(my_seed)

In [None]:
logger = logging.getLogger('training')
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more stdout handler each time and print every
# message several times.
if not any(isinstance(h, logging.StreamHandler) and getattr(h, 'stream', None) is sys.stdout
           for h in logger.handlers):
    logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

# Read data

In [None]:
def read_subject_writings(subject_file):
    """Parse one subject XML file into a list of per-writing dicts.

    The file content is passed through ``clean_content`` before being handed
    to the XML parser. Every ``WRITING`` element yields one dict holding the
    subject id under ``'subject'`` plus ``'title'``, ``'text'`` and ``'date'``
    for whichever of the ``TITLE`` / ``TEXT`` / ``DATE`` children are present
    (if a tag is repeated, the last occurrence wins).

    If the ``ID`` element is missing or empty, a diagnostic is printed and the
    writings are returned with ``'subject'`` set to ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: if the cleaned content is still not
            well-formed XML.
    """
    with open(subject_file) as sf:
        contents = clean_content(sf.read())

    root = ET.fromstring(contents)

    try:
        subject = root.findall('ID')[0].text.strip()
    except (IndexError, AttributeError):
        # IndexError: no <ID> element; AttributeError: <ID/> with no text.
        print('Cannot extract ID', contents[:500], '\n-------\n')
        # Without this binding the loop below would fail with a NameError.
        subject = None

    writings = []
    for w in root.iter('WRITING'):
        subject_writings = {'subject': subject}
        for key, tag in (('title', 'TITLE'), ('text', 'TEXT'), ('date', 'DATE')):
            for element in w.findall(tag):
                subject_writings[key] = element.text
        writings.append(subject_writings)
    return writings
def clean_content(orig_content):
    """Escape characters that would break XML parsing of a subject file.

    Steps, in order:
      * Windows line endings are normalised to ``\\n``;
      * every ``&`` becomes ``&amp;``;
      * a ``>`` at the very start of the content becomes ``&gt;``;
      * a ``<`` that cannot open a tag becomes ``&lt;``: at the end of the
        content, before a newline, a space or a digit, before any character
        that is neither a word character nor ``/``, and finally before any
        character that is neither an upper-case ASCII letter nor ``/``.

    The net effect is that only ``<`` followed by ``A``-``Z`` or ``/`` is
    kept as markup.
    """
    content = orig_content.replace('\r\n', '\n')
    content = content.replace("&", "&amp;")
    # No re.MULTILINE: "^" and "$" anchor to the whole content, not each line.
    content = re.sub(r"^>", "&gt;", content)
    content = re.sub(r"<$", "&lt;", content)
    content = content.replace("<\n", "&lt;\n")
    content = content.replace("< ", "&lt; ")
    # Raw strings: "\d" and "\w" in a normal literal are invalid escapes.
    content = re.sub(r"<(?=\d)", "&lt;", content)
    content = re.sub(r"<(?=[^\w/])", "&lt;", content)
    content = re.sub(r"<(?=[^A-Z/])", "&lt;", content)
    return content

In [None]:
# root_dir = '/home/anasab/' 
root_dir = '/home/anasab/'

In [None]:
datadir_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TEST_DATA/'
datadir_test_T2 = root_dir + 'eRisk/data/eRISK2021_T3_training_data/training BDI/2020 data'
datadir_test2021 = root_dir + 'eRisk/data/eRisk2021_T3_Collection/clean'
# labels_file_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/Depression Questionnaires_anon.txt'
labels_file_test_T2 = root_dir + 'eRisk//data/eRISK2021_T3_training_data/training BDI/2020 data/Depression Questionnaires_anon.txt'
nr_questions = 21

In [None]:
def read_texts(datadir_T2,
                labels_file_T2=None):
    """Read every subject XML file in a directory into one DataFrame.

    Args:
        datadir_T2: directory to scan; only entries whose name ends in
            ``xml`` are parsed (with ``read_subject_writings``).
        labels_file_T2: optional whitespace-separated file whose first column
            is the subject id, followed by one answer per question (the
            number of questions is the module-level ``nr_questions``).

    Returns:
        ``(writings_df, labels_df)``. When a labels file is given,
        ``labels_df`` is indexed by subject and its ``label<i>`` columns are
        joined onto ``writings_df``; otherwise ``labels_df`` is empty.
    """
    writings = []
    for subject_file in os.listdir(datadir_T2):
        if not subject_file.endswith('xml'):
            continue
        print(subject_file)
        writings.extend(read_subject_writings(os.path.join(datadir_T2, subject_file)))
    writings_df = pd.DataFrame(writings)

    labels_df = pd.DataFrame()
    if labels_file_T2:
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        labels_df = pd.read_csv(labels_file_T2,
                                delimiter=r'\s+',
                                names=['subject'] + ['label%i' % i for i in range(nr_questions)])
        labels_df = labels_df.set_index('subject')
        writings_df = writings_df.join(labels_df, on='subject')

    return writings_df, labels_df

In [None]:
writings_df_test, labels_df_test = read_texts(datadir_test_T2, labels_file_test_T2)
writings_df_test2021, _ = read_texts(datadir_test2021)
# Cached frames with precomputed features. Context managers close the pickle
# files instead of leaving the handles to the garbage collector.
with open('data/writings_df_T2_liwc.pkl', 'rb') as pkl_file:
    writings_df = pickle.load(pkl_file)
# writings_df_test = pickle.load(open('data/writings_df_t2_test_liwc.pkl', 'rb'))
# NOTE: this replaces the writings_df_test parsed from XML two lines above.
with open('data/writings_df_T2_test_liwc_wdays.pkl', 'rb') as pkl_file:
    writings_df_test = pickle.load(pkl_file)

In [None]:
# Replaces writings_df with another cached frame; the file is closed on exit.
with open('data/writings_df_T2_wsymanto.pkl', 'rb') as pkl_file:
    writings_df = pickle.load(pkl_file)

In [None]:
writings_df_test.writing_days

In [None]:
# writings_df = pd.concat([writings_df, writings_df_test])
# writings_df_test = writings_df_test.join(labels_df_test, on='subject')
writings_df_test.groupby('subject').count()
writings_df_test.writing_days

In [None]:
writings_df_test2021.groupby('subject').count()
# writings_df_test2021
# pickle.dump(writings_df_test2021, open('data/writings_df_t3_2021_test_liwc.pkl', 'wb+'))

In [None]:
len(set(writings_df_test.subject))

## Preprocess text

In [None]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    """Lower-case ``t`` and split it into runs of word characters."""
    lowered = t.lower()
    return tokenizer.tokenize(lowered)
tt = TweetTokenizer()
sw = stopwords.words("english")
def tokenize_tweets(t, tokenizer=tt, stop=True):
    """Tokenize lower-cased text and keep only purely alphabetic tokens.

    Args:
        t: text to tokenize.
        tokenizer: object with a ``tokenize(str)`` method.
        stop: when true (the default) stop words are KEPT; when false, tokens
            found in the English stop-word list ``sw`` are removed.

    Returns:
        List of tokens consisting only of the letters a-z.
    """
    alphabetic = [token for token in tokenizer.tokenize(t.lower())
                  if re.match("^[a-z]*$", token)]
    if stop:
        return alphabetic
    return [token for token in alphabetic if token not in sw]

def tokenize_fields(writings_df):
    """Add token and length columns for the title and text of each writing.

    For each of ``title`` and ``text`` this adds, in place:
      * ``tokenized_<field>``: ``tokenize_tweets`` output for non-empty
        strings, ``None`` otherwise;
      * ``<field>_len``: number of tokens, or ``None`` when there are none.

    Returns the same (mutated) DataFrame.
    """
    def _tokens_or_none(value):
        # isinstance also accepts str subclasses such as numpy.str_.
        return tokenize_tweets(value) if isinstance(value, str) and value else None

    def _len_or_none(tokens):
        return len(tokens) if isinstance(tokens, list) and tokens else None

    for field in ('title', 'text'):
        writings_df['tokenized_' + field] = writings_df[field].apply(_tokens_or_none)
        writings_df[field + '_len'] = writings_df['tokenized_' + field].apply(_len_or_none)
    return writings_df

In [None]:
writings_df_test2021 = tokenize_fields(writings_df_test2021)

In [None]:
writings_df = tokenize_fields(writings_df)
writings_df_test = tokenize_fields(writings_df_test)

# Extract features

In [None]:
def merge_tokens(row):
    """Concatenate a row's text tokens and title tokens into a new list.

    ``row`` needs ``tokenized_text`` and ``tokenized_title`` attributes. A
    field that is not a non-empty list/tuple (``None``, an empty list, or a
    float NaN placeholder) contributes nothing. Text tokens come first.
    """
    tokens = []
    for part in (row.tokenized_text, row.tokenized_title):
        # A NaN placeholder is truthy, so a plain truthiness test would let
        # it through and `tokens += nan` would raise a TypeError.
        if isinstance(part, (list, tuple)) and part:
            tokens += part
    return tokens
# Row-wise merge of text and title tokens for each of the three frames.
for tokens_frame in (writings_df_test2021, writings_df_test, writings_df):
    tokens_frame['all_tokens'] = tokens_frame.apply(lambda row: merge_tokens(row), axis=1)

In [None]:
def load_NRC(nrc_path):
    """Load a word-level emotion lexicon into ``{emotion: set_of_words}``.

    Each non-blank line must hold three whitespace-separated fields: word,
    emotion and an integer flag. A word is added to an emotion's set only
    when the flag is non-zero; an emotion whose flags are all zero still
    gets an (empty) entry. Emotions keep their order of first appearance.

    Raises:
        ValueError: if a line does not have exactly three fields or the flag
            is not an integer.
    """
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            members = emotion_words.setdefault(emotion, set())
            if int(label):
                members.add(word)
    return emotion_words

nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())

In [None]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    """Count the tokens that belong to ``emotion`` in the NRC lexicon.

    Args:
        tokens: list of tokens; ``None`` or an empty list gives ``None``.
        emotion: key into the module-level ``nrc_lexicon``.
        relative: if true, return the share of matching tokens; otherwise
            the raw count.
    """
    if not tokens:
        return None
    lexicon_words = nrc_lexicon[emotion]
    hits = sum(1 for t in tokens if t in lexicon_words)
    if relative:
        return hits / len(tokens)
    return hits

from functools import partial
# One relative-frequency column per lexicon emotion, first for the 2021 test
# frame and then for the 2020 test frame.
for emotion_frame in (writings_df_test2021, writings_df_test):
    for emotion in emotions:
        emotion_scorer = partial(extract_emotions, emotion=emotion, relative=True)
        emotion_frame[emotion] = emotion_frame['all_tokens'].apply(emotion_scorer)

In [None]:
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine", "myself"}, relative=True):
    """Count how many tokens are in ``pronouns``.

    Returns 0 for ``None`` or an empty token list. With ``relative`` true the
    count is divided by the number of tokens; otherwise the raw count is
    returned.
    """
    if not tokens:
        return 0
    total = len(tokens)
    hits = sum(1 for t in tokens if t in pronouns)
    return hits / total if relative and total else hits

In [None]:
from functools import partial
writings_df_test2021['pronouns'] = writings_df_test2021['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
from liwc_readDict import readDict

liwc = readDict('/home/anasab/resources/liwc.dic')
# liwc is iterated as (word, category) pairs; categories keeps one entry per pair.
categories = [category for (_word, category) in liwc]
set(categories)
# Group the dictionary words by category, preserving file order.
liwc_dict = {}
for (word, category) in liwc:
    liwc_dict.setdefault(category, []).append(word)

In [None]:
def encode_liwc_categories(tokens, category_words, relative=True):
    """Count the tokens that match at least one word of a LIWC category.

    A token matches a dictionary word when it equals the word, equals the
    part of the word before its first apostrophe, or - for words ending in
    ``*`` - starts with the word minus that ``*``. Each token is counted at
    most once.

    Args:
        tokens: list of string tokens; ``None`` or empty gives ``None``.
        category_words: iterable of dictionary words for one category.
        relative: if true, return the share of matching tokens; otherwise
            the raw count.
    """
    if not tokens:
        return None
    words = list(category_words)
    # Set + prefix tuple built once, instead of scanning every word per token.
    exact = set(words)
    exact.update(word.split("'")[0] for word in words)
    # endswith() is safe for an empty entry, unlike indexing word[-1].
    prefixes = tuple(word[:-1] for word in words if word.endswith('*'))

    category_cnt = sum(1 for t in tokens if t in exact or t.startswith(prefixes))
    if relative:
        return category_cnt / len(tokens)
    return category_cnt

In [None]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
# One relative-frequency column per LIWC category; existing columns are kept.
for categ in liwc_dict:
    if categ in writings_df_test2021.columns:
        continue
    print("Encoding for category %s..." % categ)
    categ_scorer = partial(encode_liwc_categories,
                           category_words=liwc_dict[categ],
                           relative=True)
    writings_df_test2021[categ] = writings_df_test2021['all_tokens'].apply(categ_scorer)


In [None]:
writings_df.text_len.describe()


In [None]:
writings_df.title_len.describe()

In [None]:
writings_df.groupby('subject').count().title.describe()

In [None]:
writings_df_test2021.columns

In [None]:
def encode_labels(labels):
    '''Convert questionnaire answers to ints: "ia" becomes i and "ib" becomes -i.

    Values that ``int()`` accepts are converted directly. Otherwise the last
    character of the string form selects the sign and the first character of
    the value is the magnitude. Values that fit neither form are logged as a
    warning and left out, so the result can be shorter than the input.
    '''
    encoded_labels = []
    for raw in labels:
        try:
            value = int(raw)
        except Exception:
            logger.debug("Encoding label %s\n" % raw)
            suffix = str(raw)[-1]
            if suffix == 'a':
                value = int(raw[0])
            elif suffix == 'b':
                value = -int(raw[0])
            else:
                logger.warning("Coult not encode label %s\n" % raw)
                continue
        encoded_labels.append(value)
    return encoded_labels

In [None]:
# Encode the answer columns of every frame that actually has them.
for labelled_df in (writings_df, writings_df_test, writings_df_test2021):
    for i in range(21):
        label_col = 'label%d' % i
        if label_col not in labelled_df.columns:
            # writings_df_test2021 is read without a labels file, so it has
            # no label columns; skip instead of raising a KeyError.
            continue
        labelled_df[label_col] = labelled_df[label_col].apply(lambda l: encode_labels([l])[0])

#### Stopwords

In [None]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    """Return a 0/1 vector marking which of ``stopwords`` occur in ``tokens``.

    The vector has one entry per stop word, in list order. ``None`` or an
    empty token list gives an all-zero vector.
    """
    if not tokens:
        return [0 for _ in stopwords]
    return [1 if stopword in tokens else 0 for stopword in stopwords]

# aggregate

In [None]:
# Aggregate by users
def aggregate_df(writings_df):
    """Collapse a per-writing frame into one row per subject.

    Per subject: texts and titles are joined with spaces, token lists and
    lengths are summed, label columns (when ``label1`` is present) take their
    minimum, and every LIWC category / emotion column that exists is
    averaged. Afterwards the title is appended to the text and the title
    length is added to the text length.

    Reads the module-level ``categories`` and ``emotions`` lists.
    """
    def _join_with_spaces(values):
        return " ".join(values)

    filled_df = writings_df.fillna(value={'text': '', 'title': ''})

    column_functions = {
        'text': _join_with_spaces,
        'title': _join_with_spaces,
        'tokenized_text': 'sum',
        'tokenized_title': 'sum',
        'text_len': 'sum',
        'title_len': 'sum',
    }
    if 'label1' in filled_df.columns:
        for i in range(21):
            column_functions['label%i' % i] = 'min'
    for feature in list(set(categories)) + emotions:
        if feature in filled_df.columns:
            column_functions[feature] = 'mean'

    per_user = filled_df.groupby('subject').aggregate(column_functions)
    per_user = per_user.fillna("")
    per_user['text'] = per_user['text'] + " " + per_user['title']
    per_user['text_len'] = per_user['text_len'] + per_user['title_len']
    return per_user

In [None]:
# Aggregate by users
writings_df_per_user = aggregate_df(writings_df)
# writings_df_test_per_user = aggregate_df(writings_df_test2021)
writings_df_test_per_user = aggregate_df(writings_df_test)

In [None]:
writings_df_test_per_user.columns

In [None]:
writings_df_per_user.text_len.describe()

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [None]:
def get_avg_embedding(writings_df, subject, column):
    """Average the vectors stored in ``column`` over one subject's rows."""
    subject_mask = writings_df['subject'] == subject
    as_arrays = writings_df[subject_mask][column].apply(lambda l: np.array(l))
    return as_arrays.values.mean()

In [None]:
# Per-subject mean of the text and title embedding columns.
embedding_subjects = set(writings_df.subject.values)
avg_embeddings_text = {}
avg_embeddings_title = {}
for s in embedding_subjects:
    avg_embeddings_text[s] = get_avg_embedding(writings_df, s, 'use_embeddings_text')
for s in embedding_subjects:
    avg_embeddings_title[s] = get_avg_embedding(writings_df, s, 'use_embeddings_title')

In [None]:
series_embeddings_text = pd.Series(avg_embeddings_text)
series_embeddings_text.name = 'avg_embeddings_text'
series_embeddings_title = pd.Series(avg_embeddings_title)
series_embeddings_title.name = 'avg_embeddings_title'

In [None]:
# Attach the per-subject average embeddings as two new columns.
# NOTE(review): `writings_per_user_df` is not assigned at module level in the
# cells visible here (it only exists as a local inside aggregate_df); the
# per-user frames above are named `writings_df_per_user` and
# `writings_df_test_per_user` - confirm which frame is meant.
writings_per_user_df = writings_per_user_df.join(series_embeddings_text, on='subject')
writings_per_user_df = writings_per_user_df.join(series_embeddings_title, on='subject')

In [None]:
writings_per_user_df.columns.values

In [None]:
writings_per_user_df.join?

### User embeddings

In [None]:
writings_df_per_user['subject'] = writings_df_per_user.index
writings_df_test_per_user['subject'] = writings_df_test_per_user.index

In [None]:
# Context managers close the pickle files instead of leaking the handles.
with open('data/user_embeddings_t2_2020_model115.pkl', 'rb') as embeddings_file:
    all_embeddings_dict = pickle.load(embeddings_file)
with open('data/user_embeddings_t2_test_2020_model115.pkl', 'rb') as embeddings_file:
    all_embeddings_dict2 = pickle.load(embeddings_file)
# all_embeddings_dict.update(all_embeddings_dict2)

In [None]:
all_embeddings_dict_test = all_embeddings_dict2

In [None]:
len(all_embeddings_dict)

In [None]:
# all_embeddings_dict_test = pickle.load(open('data/user_embeddings_t3_test2021_model115.pkl', 'rb'))
# len(all_embeddings_dict_test.keys())

In [None]:
writings_df_per_user['user_embeddings_avg'] = writings_df_per_user.subject.apply(
    lambda s: np.mean(all_embeddings_dict[s], axis=0))

In [None]:
len(writings_df_per_user)

In [None]:
writings_df_test_per_user['user_embeddings_avg'] = writings_df_test_per_user.subject.apply(
    lambda s: np.mean(all_embeddings_dict_test[s], axis=0))

In [None]:
writings_df_test_per_user['user_embeddings_avg']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# writings_embeddings['use_similarity'] = writings_embeddings['use_embeddings_text'].apply()

In [None]:
len([c for c in categories] + emotions)
# len(writings_df_test_per_user[feature_cols].values.tolist()[0])
len(categories)

In [None]:
# features = [np.random.rand(75) for i in range(20)]
# features = writings_per_user_df[list(categories) + emotions + ["pronouns"]]
# features = writings_per_user_df['avg_embeddings_title'].values.tolist()

# User embeddings as features
# features = writings_df_per_user['user_embeddings_avg'].values.tolist()

# LIWC/emotions as features
feature_cols = [c for c in set(categories)] + emotions
# The next line overrides the one above: only the emotion columns are used.
feature_cols = emotions
# Training feature matrix consumed by cross_validation() below.
features = writings_df_per_user[feature_cols]
# 'allcats' keeps each user's feature row as a plain list in a single column.
writings_df_per_user['allcats'] = writings_df_per_user[feature_cols].values.tolist()
writings_df_test_per_user['allcats'] = writings_df_test_per_user[feature_cols].values.tolist()

def cross_validation(folds=2):
    """Return the mean cross-validated SVC score over the 21 questions.

    For each question an RBF-kernel SVC (C=5) is scored with ``folds``-fold
    cross-validation on the module-level ``features`` against the matching
    ``label<i>`` column of ``writings_df_per_user``.
    """
    total_score = 0
    for l in range(21):
        labels = writings_df_per_user['label%d' % l].values
        model = SVC(kernel='rbf', C=5)
        cvscores = cross_val_score(model, features, labels, cv=folds)
        total_score += sum(cvscores)/folds
    return total_score/21

print(cross_validation())

In [None]:
features['allcats']= features.values.tolist()
feature_cols

In [None]:
def results_for_label(features, l, train_examples=16):
    """Fit an SVC on the first ``train_examples`` users and test on the rest.

    Labels come from column ``label<l>`` of ``writings_df_per_user``. Prints
    the question index, predictions, held-out labels and training labels, and
    returns an element-wise comparison of held-out labels and predictions.
    """
    labels = writings_df_per_user['label%d' % l].values
    train_labels, heldout_labels = labels[:train_examples], labels[train_examples:]
    svmmodel = SVC()
    svmmodel.fit(features[:train_examples], train_labels)
    predictions = svmmodel.predict(features[train_examples:])
    print(l, predictions, heldout_labels, train_labels)
    return heldout_labels == predictions

In [None]:
# One boolean hit vector (over held-out users) per question.
cumresults = []
for l in range(21):
    results = results_for_label(features, l)
    cumresults.append(results)

nrques = 21
nrusers = len(cumresults[0])
# Number of correctly answered questions per held-out user.
correct_per_user = dict.fromkeys(range(nrusers), 0)
for per_question_hits in cumresults:
    for u, answ in enumerate(per_question_hits):
        if answ:
            correct_per_user[u] += 1

for u, n_correct in correct_per_user.items():
    print("u", u, n_correct/nrques)
print("AHR", sum(correct_per_user.values())/nrusers/nrques)

## KNN

In [None]:
np.array(features).shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Names of all per-user columns that hold questionnaire answers.
label_set = [column for column in writings_df_per_user.columns
             if column.startswith('label')]
label_set

In [None]:
def classifier_for_label(label_col, subjects, n=5, user_level=True, feature_col='user_embeddings_avg'):
    """Fit a k-nearest-neighbours classifier for one questionnaire column.

    Args:
        label_col: name of the label column in ``writings_df_per_user``.
        subjects: subject ids to train on.
        n: number of neighbours.
        user_level: if true, use one example per user taken from
            ``feature_col``; otherwise one example per entry of
            ``all_embeddings_dict[subject]``, each labelled with the user's
            label.
        feature_col: per-user feature column (only used when ``user_level``).

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    if user_level:
        selected_users = writings_df_per_user[writings_df_per_user['subject'].isin(subjects)]
        X = [list(l) for l in selected_users[feature_col].values]
        print(len(X), len(X[0]))
        y = selected_users[label_col].values
        print(len(y))
    else:
        X = []
        y = []
        for s in subjects:
            subject_rows = writings_df_per_user[writings_df_per_user['subject']==s]
            label = subject_rows[label_col].values[0]
            for e in all_embeddings_dict[s]:
                X.append(e)
                y.append(label)

    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X, y)
    return neigh


In [None]:
n = 5
all_subjects = list(all_embeddings_dict)
training_subjects = all_subjects
test_subjects = all_embeddings_dict_test.keys()
# One user-level KNN per question, trained on the 'allcats' lexicon features.
knns = {}
for label in range(21):
    label_name = 'label%d' % label
    knns[label_name] = classifier_for_label(label_name, subjects=training_subjects,
                                            n=n, user_level=True,
                                            feature_col='allcats')

In [None]:
len(writings_df_test_per_user[writings_df_test_per_user['subject']=='subject1009'][feature_col].values[0])

In [None]:
writings_df_test_per_user.subject

In [None]:
# Predict Using embeddings

from collections import Counter
predictions_avg = {}
predictions_maj = {}
predictions_avglab = {}
true_labels = {}

def label_to_str(l, q=0):
    """Render a numeric answer as its questionnaire label string.

    The value is rounded to the nearest integer first. Negative values become
    ``"<n>b"``; positive values for questions 15 and 17 become ``"<n>a"``;
    everything else is the plain integer. NaN maps to ``'0'``, matching the
    later definition of this helper in the notebook.
    """
    if np.isnan(l):
        return '0'
    rounded = int(round(l))
    if rounded < 0:
        return str(-rounded) + 'b'
    # Test the rounded value: testing the raw value turned 0.3 into "0a".
    if q in [15, 17] and rounded > 0:
        return str(rounded) + 'a'
    return str(rounded)

# For every test subject and question, collect three KNN-based predictions:
#   predictions_avg    - prediction for the mean of the subject's embeddings
#   predictions_maj    - most frequent prediction over the individual embeddings
#   predictions_avglab - mean of the individual predictions
# NOTE(review): the cell above fits `knns` with feature_col='allcats', while the
# queries here are embedding vectors - confirm the classifiers were trained on
# matching features before running this cell.
for subject in test_subjects:
    predictions_avg[subject] = []
    predictions_maj[subject] = []
    predictions_avglab[subject] = []
    true_labels[subject] = []
    for l in range(21):
        label = "label%d" % l
        print("\n" + subject)
        # Fall back to 0 when the frame carries no column for this question.
        if label in writings_df_test_per_user.columns:
            true_label = writings_df_test_per_user[writings_df_test_per_user['subject']==subject][label].values[0]
        else:
            true_label = 0
        print("True label:", true_label)
        average_pred = knns[label].predict([np.mean(all_embeddings_dict_test[subject], axis=0)])[0]
        print("Average pred:", average_pred)
        all_preds = knns[label].predict(all_embeddings_dict_test[subject])
        majority_label = Counter(all_preds).most_common(1)[0][0]
        print("Majority label:", majority_label)
        print("All preds:", all_preds)
        average_lab_pred = np.mean(all_preds)
        true_labels[subject].append(label_to_str(true_label, l))

        # Convert the numeric predictions to label strings.
        average_pred = label_to_str(average_pred, l)
        average_lab_pred = label_to_str(average_lab_pred, l)
        majority_label = label_to_str(majority_label, l)
            
        predictions_avg[subject].append(average_pred)
        predictions_maj[subject].append(majority_label)
        predictions_avglab[subject].append(average_lab_pred)


In [None]:
# Predict using lexicon features
from collections import Counter
predictions = {}
feature_col = 'allcats'

def predict(writings_df_test_per_user, test_subjects):
    """Predict all 21 answers for each test subject with the fitted KNNs.

    Fills the module-level ``predictions`` and ``true_labels`` dicts (keyed
    by subject) and returns ``predictions``. Features come from column
    ``feature_col`` of ``writings_df_test_per_user``; classifiers from the
    module-level ``knns``. A missing label column yields a true label of 0.
    """

    def label_to_str(l, q=0):
        # Round once and branch on the rounded value, so a small positive
        # value on questions 15/17 gives "0" rather than the invalid "0a".
        rounded = int(round(l))
        if rounded < 0:
            return str(-rounded) + 'b'
        if q in [15, 17] and rounded > 0:
            return str(rounded) + 'a'
        return str(rounded)

    for subject in test_subjects:
        predictions[subject] = []
        true_labels[subject] = []
        # The subject's row and features are the same for every question.
        subject_rows = writings_df_test_per_user[writings_df_test_per_user['subject']==subject]
        test_features = subject_rows[feature_col].values.tolist()[0]
        for l in range(21):
            label = "label%d" % l
            print("\n" + subject)
            if label in writings_df_test_per_user.columns:
                true_label = subject_rows[label].values[0]
            else:
                true_label = 0
            print("True label:", true_label)

            all_preds = knns[label].predict(np.array(test_features).reshape(1,-1))
            print("All preds:", all_preds)
            true_labels[subject].append(str(true_label))

            predictions[subject].append(label_to_str(all_preds[0], l))
    return predictions


In [None]:
predictions = predict(writings_df_test_per_user, test_subjects)

In [None]:
predictions

In [None]:
# pickle.dump(predictions_avg, open('t2_predictions_k5_alltexts_userembeddings101_avg_dev.pkl', 'wb+'))
# pickle.dump(predictions_maj, open('t2_predictions_k5_alltexts_userembeddings101_maj_dev.pkl', 'wb+'))
# pickle.dump(predictions_avglab, open('t2_predictions_k5_alltexts_userembeddings101_avglab_dev.pkl', 'wb+'))
# pickle.dump(true_labels, open('t2_true_labels_dev.pkl', 'wb+'))

## Correlations over time

In [None]:
def get_evolution_series(df, emotion, subject, rolling_window, date_field='writing_days'):
    """Return one subject's rolling-mean series of a feature over time.

    The subject's rows are restricted to a fixed set of metadata columns plus
    the emotion and LIWC category columns, grouped by ``date_field`` and
    averaged; the ``emotion`` column of that result is then smoothed with a
    rolling mean of width ``rolling_window``.

    Reads the module-level ``emotions`` and ``categories``.
    """
    metadata_columns = ['text', 'text_len', 'subject', 'date', 'date_day', 'writing_days',
                        'writing_days_reverse', 'depression_mention', 'diagnosis']
    category_columns = [c for c in set(categories) if c in df.columns
                        and c not in emotions]
    subject_rows = df[df['subject']==subject]
    selected = subject_rows[metadata_columns + emotions + category_columns]
    per_period_means = selected.groupby(date_field).mean()
    return per_period_means[emotion].rolling(rolling_window).mean()
#                                 ].apply(lambda c: np.log(c) if c>0 else 0
#      

In [None]:
from scipy.stats import pearsonr
def corr_users(writings_df, subject1, subject2, emotion, window=1):
    """Best sliding-window Pearson correlation between two subjects' series.

    The subject with fewer distinct ``writing_days`` supplies the template
    series (on a tie, ``subject2``); it is slid along the other subject's
    series and the maximum correlation over all alignments is returned.
    Missing values in either series are replaced by 0 first.

    Args:
        writings_df: per-writing frame with ``subject`` and ``writing_days``.
        subject1, subject2: subject ids to compare.
        emotion: feature column to compare.
        window: rolling-mean width passed to ``get_evolution_series``.
    """
    hist_len1 = len(writings_df[writings_df['subject']==subject1].groupby('writing_days'))
    hist_len2 = len(writings_df[writings_df['subject']==subject2].groupby('writing_days'))

    if hist_len1 < hist_len2:
        template_subject, scanned_subject = subject1, subject2
    else:
        template_subject, scanned_subject = subject2, subject1

    CORR_VALS = np.array(
        get_evolution_series(writings_df, emotion, template_subject, window).fillna(0).values)
    scanned = get_evolution_series(writings_df, emotion, scanned_subject, window).fillna(0).values

    def get_correlation(vals):
        return pearsonr(vals, CORR_VALS)[0]

    df = pd.DataFrame(dict(x=scanned))
    correlations = df.rolling(window=len(CORR_VALS)).apply(get_correlation)
    # .max() is a Series labelled by column name; use .iloc for the position,
    # since integer keys on a label-indexed Series are deprecated.
    return correlations.max().iloc[0]

In [None]:
def sim_users(writings_df, subject1, subject2, emotions):
    """Return ``corr_users`` for the two subjects, one value per feature."""
    similarities = []
    for e in emotions:
        similarities.append(corr_users(writings_df, subject1, subject2, e))
    return similarities

In [None]:
sim_users(writings_df_all, 'subject3993', 'subject5791' , ['i', 'we', 'ipron'] + emotions)

In [None]:
sim_users(writings_df_all, 'subject1426', 'subject1426', ['i', 'we', 'ipron'] + emotions)

In [None]:
writings_df_all = pd.concat([writings_df, writings_df_test])
# similarity_matrix_vectors[s1][s2] is the list of per-feature correlations
# between s1 and s2, for every ordered pair of distinct subjects.
all_subject_ids = set(writings_df_all.subject)
pair_feature_names = list(set(categories)) + emotions
similarity_matrix_vectors = {}
for subject1 in all_subject_ids:
    row_of_similarities = {}
    similarity_matrix_vectors[subject1] = row_of_similarities
    for subject2 in all_subject_ids:
        if subject1 != subject2:
            print(subject1, subject2)
            sim = sim_users(writings_df_all, subject1, subject2, pair_feature_names)
            print(sim)
            row_of_similarities[subject2] = sim

In [None]:
# Feature names, in the same order as the entries of each similarity vector.
features = list(set(categories)) + emotions
# The context manager flushes and closes the file, which
# pickle.dump(..., open(...)) left to the garbage collector.
with open('correlations_time_similarity_vectors_features.pkl', 'wb+') as features_file:
    pickle.dump(features, features_file)
# pickle.dump(similarity_matrix_vectors, open('correlations_time_similarity_matrix_vectors_all_T3trainvalid.pkl', 'wb+'))

In [None]:
# writings_df_all_per_user = pd.concat([writings_df_per_user, writings_df_test_per_user])
writings_df_all_per_user

In [None]:
# similarity_matrix_vectors2[s1][s2] = (correlations, diffs), where diffs[i]
# is the absolute difference between the two users' aggregated values of
# features[i].
similarity_matrix_vectors2 = {}
for s1 in similarity_matrix_vectors:
    print(s1)
    similarity_matrix_vectors2[s1] = {}
    # Row lookups are hoisted: they do not depend on the feature index.
    rows_s1 = writings_df_all_per_user[writings_df_all_per_user['subject']==s1]

    for s2 in similarity_matrix_vectors[s1]:
        rows_s2 = writings_df_all_per_user[writings_df_all_per_user['subject']==s2]
        pair_correlations = similarity_matrix_vectors[s1][s2]
        diffs = []
        for i in range(len(pair_correlations)):
            f = features[i]
            diffs.append(abs(rows_s1[f].values[0] - rows_s2[f].values[0]))
        similarity_matrix_vectors2[s1][s2] = (pair_correlations, diffs)

In [None]:
[features[i] for i in [23,28,14]]
features.index('ipron')

In [None]:
def label_to_str(l, q=0):
    """Render a numeric answer as its questionnaire label string.

    NaN maps to ``'0'``. Otherwise the value is rounded to the nearest
    integer: negative values become ``"<n>b"``, positive values for questions
    15 and 17 become ``"<n>a"``, and everything else is the plain integer.
    """
    if np.isnan(l):
        return '0'
    rounded = int(round(l))
    if rounded < 0:
        return str(-rounded) + 'b'
    # Test the rounded value: testing the raw value turned 0.3 into "0a".
    if q in [15, 17] and rounded > 0:
        return str(rounded) + 'a'
    return str(rounded)


# One list of scores per metric, appended to once per evaluated test split.
results = {'DCHR': [], 'ACR': [], 'ADODL': [], 'AHR': []}
# Maximum number of neighbours considered per test subject.
k = 15
# Neighbours are used up to the first one whose similarity falls below this.
sim_thresh=1
features_index = range(len(set(categories))+len(emotions)) # we consider just these features
# NOTE(review): the indices below address positions in `features`, which is
# built from list(set(categories)); set ordering of strings can differ between
# interpreter sessions, so confirm what they select (see the cell printing
# [features[i] for i in features_index]).
prons_index = [23,43,0,9]
oth=[28,14]
emotions_index = list(range(64,74))
# Overrides the full range assigned above.
features_index =  prons_index + oth + emotions_index 
# 
import random
# test_subjects_sets = [random.sample(set(writings_df_all.subject), 45) for r in range(10)]
# A single split: every subject of the test frame.
test_subjects_sets = [set(writings_df_test.subject)]


# Similarity-weighted nearest-neighbour prediction of every answer, scored
# with EriskScoresT3 once per test split.
for test_subjects in test_subjects_sets:
    predictions = {}
    true_labels = {}
    print(test_subjects)
    for subject in test_subjects:
        predictions[subject] = []
        true_labels[subject] = []

        # t[1] is the correlation, and t[2] is the difference in prevalence.
        # The ranking does not depend on the question, so it is built once
        # per subject instead of once per (subject, question).
        best_subjects = sorted(
            [(s, np.array(c[0])[features_index], np.array(c[1])[features_index])
             for s, c in similarity_matrix_vectors2[subject].items()
             if s not in test_subjects],
            # Alternative key: np.nanmean(t[1]-t[2])
            key=lambda t: np.nanmean(-t[2]), reverse=True)

        closest_subjects = [t[0] for t in best_subjects[:k]]
        # Alternatives: np.nanmean(1-t[2]) or np.nanmean(t[1])
        closest_correlations = [np.nanmean(t[1] + (1-t[2]))
                                for t in best_subjects[:k]]

        # cutoff = number of leading neighbours with similarity >= sim_thresh
        # (at least one). It starts at the full length, so a list with no
        # neighbour below the threshold is not shortened by one.
        cutoff = len(closest_correlations)
        for i, c in enumerate(closest_correlations):
            if c < sim_thresh:
                cutoff = i
                break
        cutoff = max(cutoff, 1)
        chosen_subjects = closest_subjects[:cutoff]
        chosen_weights = closest_correlations[:cutoff]

        for l in range(21):
            label = "label%d" % l
            print("\n" + subject)
            if label in writings_df_all_per_user.columns:
                true_label = writings_df_all_per_user[writings_df_all_per_user['subject']==subject][label].values[0]
            else:
                true_label = 0
            print("True label:", true_label)

            # Look labels up per neighbour, in similarity order. Selecting
            # with .isin() returns rows in frame order, which paired labels
            # with the wrong weights.
            label_by_subject = dict(zip(writings_df_all_per_user['subject'],
                                        writings_df_all_per_user[label]))
            neighbour_labels = np.array([label_by_subject[s] for s in chosen_subjects])
            print('closest', cutoff, chosen_subjects, chosen_weights, neighbour_labels)

            if not chosen_subjects:
                # No training neighbour available for this subject.
                all_preds = 0
            else:
                all_preds = np.average(abs(neighbour_labels), weights=chosen_weights)
            print("All preds:", all_preds)
            true_labels[subject].append(label_to_str(true_label, l))
    #         true_labels[subject].append(str(true_label))

            pred = label_to_str(all_preds, l)

            predictions[subject].append(pred)
    metrics = EriskScoresT3()
    res = metrics.compute(
    predictions = predictions.values(),
    references = true_labels.values())
    for m in res.keys():
        results[m].append(res[m])

print(results)
print({k:np.average(results[k]) for k in results})

In [None]:
def my_kernel(s1, s2):
    """Similarity between two subjects from the precomputed pair vectors.

    ``s1`` and ``s2`` are sequences whose first element is a subject id.
    Identical ids give 0.5. Otherwise the two vectors stored in
    ``similarity_matrix_vectors2[s1[0]][s2[0]]`` are restricted to
    ``features_index`` and combined element-wise as ``1 - d + c/2``, so the
    result is an array with one entry per selected feature.

    NOTE(review): the cell that builds similarity_matrix_vectors2 stores
    (correlations, diffs), so ``d`` holds the correlations and ``c`` the
    differences - confirm the names/signs are not swapped.
    NOTE(review): a callable SVC kernel is presumably expected to return a
    Gram matrix for two sample matrices rather than a per-feature vector -
    confirm before enabling the commented-out ``clf.fit`` call.
    """
    if s1[0]==s2[0]:
        return 0.5
    # First stored vector of the pair, restricted to the selected features.
    d = np.array(similarity_matrix_vectors2[s1[0]][s2[0]][0])[features_index]
    # Second stored vector of the pair, restricted to the selected features.
    c = np.array(similarity_matrix_vectors2[s1[0]][s2[0]][1])[features_index]
    return 1-d+c/2
    
from sklearn import svm
# Index selecting which 'label%d' column is used as the target below.
l = 0
# Build (but do not fit) an SVM classifier that uses the custom kernel.
clf = svm.SVC(kernel=my_kernel)
# One single-element list per distinct subject that is not in test_subjects.
train_subjects = [
    [subject_id]
    for subject_id in set(writings_df_all['subject'])
    if subject_id not in test_subjects
]
# For each training subject, the value of the selected label column in the
# first row belonging to that subject.
train_labels = [
    writings_df_all.loc[writings_df_all['subject'] == wrapped_id[0], 'label%d' % l].values[0]
    for wrapped_id in train_subjects
]
# clf.fit(train_subjects, train_labels)
# my_kernel(['subject4779'],['subject2903'])
# len(train_subjects)
print(train_subjects[0], train_subjects[1])

In [None]:
# Hand-picked positions into `features`.
# NOTE(review): presumably the pronoun-related features -- confirm.
prons_index = [23,43,0,9]

# Combine the index groups into the selection used by the similarity code.
# `oth` and `emotions_index` are defined elsewhere in the notebook; this
# assumes they are lists so that `+` concatenates -- TODO confirm.
features_index =  prons_index + oth + emotions_index 
# Notebook display: the entries of `features` at the selected positions.
[features[i] for i in features_index]

## Evaluation and predictions

In [None]:
import datasets    
import numpy as np

# Short description shown in the metric's info; it lists the metrics that
# EriskScoresT3 actually returns.
_DESCRIPTION_T3 = """\
Metrics for measuring the performance of prediction models for eRisk 2021 Task 3.
Includes questionnaire-based performance metrics: average hit rate (AHR),
average closeness rate (ACR), average difference between overall depression
levels (ADODL) and depression category hit rate (DCHR).
"""

# No citation is provided for this metric.
_CITATION = ""

# Usage documentation (inputs, outputs, worked example) shown in the metric's info.
_KWARGS_DESCRIPTION_T3 = """
Calculates how good are predictions of answers given to the depression assessment questionnaire,
given some references, using different metrics.
Each prediction and reference is expected to be of length 21, corresponding to the 21 questions.

Args:
    predictions: list of predictions to score, one for each user. Each prediction
        should be a list of 21 strings in {0, 1, 2, 3, 1a, 1b, 2a, 2b, 3a, 3b}.
    references: list of references for each prediction, one for each user. Each
        reference should be a list of 21 strings in {0, 1, 2, 3, 1a, 1b, 2a, 2b, 3a, 3b}.

Returns:
    AHR: average hit rate - ratio of cases where the automatic questionnaire
        has exactly the same answer as the real questionnaire, averaged over users.
    ACR: average closeness rate - closeness between the real and the automated answer
        (taking into account that answers are on an ordinal scale), relative to the
        total number of possible answers for the question, averaged over users.
    ADODL: average difference between overall depression levels - closeness of the
        overall depression levels, each computed as the sum of the answers given
        for all questions (depression levels are integers between 0 and 63),
        averaged over users.
    DCHR: depression category hit rate - fraction of cases where the automated
        questionnaire led to a depression category that is equivalent to the
        depression category obtained from the real questionnaire
        (among 4 possible categories: minimal/mild/moderate/severe depression).

Examples:
    >>> t3_metric = EriskScoresT3()
    >>> results = t3_metric.compute(
    ...     predictions = [['0', '1', '2', '3', '1a', '1b', '2a', '2b', '3', '2',
    ...                 '0', '1', '2', '3', '1a', '1b', '2a', '2b', '3', '2', '3b'],
    ...               ['0', '1', '2', '3', '1a', '1b', '2a', '2b', '3', '2',
    ...                 '1', '2', '3', '3a', '1', '1', '2', '2a', '3b', '2', '3b']],
    ...     references = [['0', '0', '0', '3', '1', '1', '2a', '2b', '3', '2',
    ...                 '1', '1', '1', '3', '1a', '1b', '2', '2', '3', '2', '3b'],
    ...              ['0', '1', '2', '3', '1', '1', '2', '2', '3', '2',
    ...                 '1', '2', '3', '3', '1', '1', '2', '2', '3', '2', '3']]
    ...     )
    >>> print(results)
    {'AHR': 0.6190476190476191, 'ACR': 0.9603174603174602, 'ADODL': 0.9761904761904762, 'DCHR': 1.0}
"""

def _depression_category(score):
    """Return the depression category name for a total questionnaire score.

    Scores 0-9 map to 'minimal', 10-18 to 'mild', 19-29 to 'moderate' and
    30-63 to 'severe' (all bounds inclusive).

    Raises:
        ValueError: if the score falls in none of the bands.
    """
    bands = (
        (0, 9, 'minimal'),
        (10, 18, 'mild'),
        (19, 29, 'moderate'),
        (30, 63, 'severe'),
    )
    for lowest, highest, category in bands:
        if lowest <= score <= highest:
            return category
    raise ValueError("Invalid score for depression questionnaire: %d" % score)

def _score(reference):
    """Return the total score of one questionnaire.

    Each answer is a string whose first character is its ordinal level
    (so '2a' counts as 2); the levels of all answers are summed.
    An empty questionnaire scores 0.
    """
    # Only the leading character carries the level; any suffix is ignored.
    return sum(int(answer[0]) for answer in reference)



def _dl(l1, l2, max_dl=63):
    """Return how close two depression levels are, as a fraction of max_dl.

    The result is ``(max_dl - |l1 - l2|) / max_dl``: 1.0 for identical
    levels, 0.0 when they differ by ``max_dl``.
    """
    return (max_dl - abs(l1 - l2)) / max_dl
    

class EriskScoresT3(datasets.Metric):
    """Questionnaire-level metrics: AHR, ACR, ADODL and DCHR.

    Inputs are per-user lists of answer strings. Exact-match metrics compare
    the full strings; ordinal metrics use only the first character of each
    answer, converted to int (so '2a' counts as 2).

    Predictions and references must pair up one-to-one: a differing number
    of users, or of answers for a user, raises ValueError. An empty list of
    users (or a user with no answers) raises ZeroDivisionError, because every
    metric is an average.

    NOTE(review): `datasets.Metric` is deprecated in recent `datasets`
    releases in favour of the `evaluate` library -- confirm the pinned version.
    """

    def _info(self):
        """Describe the metric and declare both inputs as sequences of strings."""
        return datasets.MetricInfo(
            description=_DESCRIPTION_T3,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION_T3,
            features=datasets.Features({
                'predictions': datasets.Sequence(datasets.Value('string')),
                'references': datasets.Sequence(datasets.Value('string'))
            }),
            codebase_urls=[],
            reference_urls=[],
        )

    @staticmethod
    def _paired(predicted, expected, what):
        """Return ``zip(predicted, expected)`` after checking equal lengths.

        Args:
            predicted: sequence coming from the predictions.
            expected: sequence coming from the references.
            what: plural noun naming the items, used in the error message.

        Raises:
            ValueError: if the two sequences differ in length (a bare ``zip``
                would silently drop the surplus items).
        """
        if len(predicted) != len(expected):
            raise ValueError(
                "Mismatched number of %s: %d predicted vs %d in the reference"
                % (what, len(predicted), len(expected)))
        return zip(predicted, expected)

    def _ahr(self, predictions, references):
        """Average hit rate: per-user fraction of exactly equal answers, averaged over users."""
        hr_per_user = [
            sum(p == r for p, r in self._paired(prediction, reference, 'answers'))
            / len(prediction)
            for prediction, reference in self._paired(predictions, references, 'users')
        ]
        return sum(hr_per_user) / len(predictions)

    def _cr(self, prediction, reference, nr_answers=4):
        """Closeness rate of one user.

        For every question the closeness is ``(mad - ad) / mad`` where ``ad``
        is the absolute difference of the two ordinal levels and
        ``mad = nr_answers - 1`` is the largest possible difference; the
        result is the mean over the user's questions.
        """
        # Maximum absolute difference is the same for every question.
        mad = nr_answers - 1
        closeness_scores = [
            # Consider only first letter and convert to int
            (mad - abs(int(p[0]) - int(r[0]))) / mad
            for p, r in self._paired(prediction, reference, 'answers')
        ]
        return sum(closeness_scores) / len(prediction)

    def _acr(self, predictions, references):
        """Average closeness rate: `_cr` averaged over users."""
        cr_per_user = [
            self._cr(prediction, reference)
            for prediction, reference in self._paired(predictions, references, 'users')
        ]
        return sum(cr_per_user) / len(predictions)

    def _adodl(self, predictions, references):
        """Average, over users, of `_dl` applied to the predicted and real total scores."""
        level_differences = [
            _dl(_score(prediction), _score(reference))
            for prediction, reference in self._paired(predictions, references, 'users')
        ]
        return sum(level_differences) / len(predictions)

    def _dchr(self, predictions, references):
        """Fraction of users whose predicted total score falls in the same
        depression category as their real total score."""
        hits = sum(
            _depression_category(_score(prediction)) == _depression_category(_score(reference))
            for prediction, reference in self._paired(predictions, references, 'users')
        )
        return hits / len(predictions)

    def _compute(self, predictions, references):
        """Return all four metrics in a dict keyed by metric name."""
        return {
            'AHR': self._ahr(predictions, references),
            'ACR': self._acr(predictions, references),
            'ADODL': self._adodl(predictions, references),
            'DCHR': self._dchr(predictions, references),
        }

In [None]:
# import pickle
# predictions_avg = pickle.load(open('t2_predictions_avg_dev.pkl', 'rb'))
# predictions_maj = pickle.load(open('t2_predictions_maj_dev.pkl', 'rb'))
# true_labels = pickle.load(open('t2_true_labels_dev.pkl', 'rb'))

In [None]:
metrics = EriskScoresT3()
# Pair users by subject id rather than by the iteration order of two separate
# dicts, and refuse to score when they do not cover exactly the same subjects.
if predictions.keys() != true_labels.keys():
    raise ValueError("predictions and true_labels cover different subjects")
metrics.compute(
    predictions=[predictions[subject_id] for subject_id in true_labels],
    references=list(true_labels.values()),
)

In [None]:
metrics = EriskScoresT3()
# Pair users by subject id rather than by the iteration order of two separate
# dicts, and refuse to score when they do not cover exactly the same subjects.
if predictions_avglab.keys() != true_labels.keys():
    raise ValueError("predictions_avglab and true_labels cover different subjects")
metrics.compute(
    predictions=[predictions_avglab[subject_id] for subject_id in true_labels],
    references=list(true_labels.values()),
)

In [None]:
metrics = EriskScoresT3()
# Pair users by subject id rather than by the iteration order of two separate
# dicts, and refuse to score when they do not cover exactly the same subjects.
if predictions_maj.keys() != true_labels.keys():
    raise ValueError("predictions_maj and true_labels cover different subjects")
metrics.compute(
    predictions=[predictions_maj[subject_id] for subject_id in true_labels],
    references=list(true_labels.values()),
)

In [None]:
metrics = EriskScoresT3()
# Pair users by subject id rather than by the iteration order of two separate
# dicts, and refuse to score when they do not cover exactly the same subjects.
if predictions_avg.keys() != true_labels.keys():
    raise ValueError("predictions_avg and true_labels cover different subjects")
metrics.compute(
    predictions=[predictions_avg[subject_id] for subject_id in true_labels],
    references=list(true_labels.values()),
)

In [None]:
# One line per user: the user id, then that user's predicted answers
# separated by single spaces.
for u, preds in predictions.items():
    print("{} {}".format(u, " ".join(preds)))

In [None]:
# Notebook display: number of distinct values in the `subject` column of
# writings_df_test.
len(set(writings_df_test.subject.values))