In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight
import re

In [None]:
# GPU selection for this process, set before TensorFlow is imported in the next cell.
# Enumerate GPUs in PCI bus order and expose only the device with index 1.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# NOTE(review): presumably read by a Keras add-on package to select tf.keras — confirm which one.
os.environ['TF_KERAS'] = '1'

# only reserve 1 GPU

In [None]:
import tensorflow as tf
# tf.version

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
     Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute#, CuDNNLSTM
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [None]:
import tensorflow as tf
# Sanity check that TensorFlow can see a GPU.
# NOTE(review): `tf.test.is_gpu_available` is deprecated in TF 2.x in favour of
# `tf.config.list_physical_devices('GPU')` — confirm the installed version before switching.
tf.test.is_gpu_available()

In [None]:
# Seed intended for reproducibility.
# NOTE(review): none of the visible cells applies `my_seed`; its only use is commented out below.
my_seed = 1234
# tf.set_random_seed(my_seed)

In [None]:
# Logger used by the label-encoding helper below; DEBUG and above go to stdout.
logger = logging.getLogger('training')
# `getLogger` returns the same object on every call, so re-running this cell would
# otherwise attach another StreamHandler and print every message twice.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

# Read data

In [None]:
def read_subject_writings(subject_file):
    """Parse one subject XML file into a list of per-writing dicts.

    The raw file content is passed through ``clean_content`` before parsing so that
    stray ``&``, ``<`` and ``>`` characters do not break the XML parser.

    Parameters
    ----------
    subject_file : str
        Path to the subject's XML file. The root element is expected to hold one
        ``ID`` element and any number of ``WRITING`` elements.

    Returns
    -------
    list of dict
        One dict per ``WRITING`` element with the key ``'subject'`` plus ``'title'``,
        ``'text'`` and ``'date'`` for whichever of ``TITLE``/``TEXT``/``DATE`` exist.
        Empty when the subject ID cannot be read (a diagnostic is printed).

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the cleaned content is still not well-formed XML.
    """
    with open(subject_file) as sf:
        contents = clean_content(sf.read())
    root = ET.fromstring(contents)

    try:
        subject = root.findall('ID')[0].text.strip()
    except (IndexError, AttributeError):
        # No <ID> element, or an empty one. Without an ID the writings cannot be
        # attributed to anyone, so skip the file instead of failing later on an
        # unbound `subject`.
        print('Cannot extract ID', contents[:500], '\n-------\n')
        return []

    writings = []
    field_tags = (('title', 'TITLE'), ('text', 'TEXT'), ('date', 'DATE'))
    for w in root.iter('WRITING'):
        subject_writings = {'subject': subject}
        for key, tag in field_tags:
            # If a tag occurs more than once the last occurrence wins.
            for element in w.findall(tag):
                subject_writings[key] = element.text
        writings.append(subject_writings)
    return writings
def clean_content(orig_content):
    """Escape stray markup characters so a subject file can be parsed as XML.

    Parameters
    ----------
    orig_content : str
        Raw file content.

    Returns
    -------
    str
        The content with Windows line endings normalised to ``\\n``, every ``&``
        replaced by ``&amp;``, and ``<``/``>`` escaped according to the rules below.

    Notes
    -----
    * Every ampersand is escaped, so an input that already contains an entity such
      as ``&amp;`` comes out double-escaped.
    * The ``^`` and ``$`` anchors are used without ``re.MULTILINE``: only a ``>`` at
      the very start and a ``<`` at the very end of the document are affected.
    """
    content = orig_content.replace("\r\n", "\n")
    content = re.sub(r"&", "&amp;", content)
    content = re.sub(r"^>", "&gt;", content)
    # A '<' that ends the document (optionally before one final newline).
    content = re.sub(r"<$", "&lt;", content)
    # A '<' stays markup only when an uppercase ASCII letter or '/' follows it, as in
    # <TEXT> or </TEXT>. This single rule also covers '<' before a newline, a space,
    # a digit or punctuation, which were previously separate substitutions.
    content = re.sub(r"<(?=[^A-Z/])", "&lt;", content)
    return content

In [None]:
# Manual check of the escaping rules on a string mixing stray '>', '&' and '<' with a real tag.
clean_content("> hello& there<\n <~23l <TEXT/> lala")

In [None]:
# Base directory under which the eRisk data and the lexicon resources are looked up.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
root_dir = '/home/anasab/'

In [None]:
# Dataset locations, built by plain string concatenation onto `root_dir`.
# NOTE(review): `root_dir` already ends with '/', so the paths that start with '/' (or
# contain '//') end up with doubled separators; POSIX treats these like a single one.
datadir_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TEST_DATA/'
datadir_test_T2 = root_dir + 'eRisk/data/eRISK2021_T3_training_data/training BDI/2020 data'
datadir_test2021 = root_dir + 'eRisk/data/eRisk2021_T3_Collection'
# labels_file_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/Depression Questionnaires_anon.txt'
labels_file_test_T2 = root_dir + 'eRisk//data/eRISK2021_T3_training_data/training BDI/2020 data/Depression Questionnaires_anon.txt'
# Number of label columns per subject in the questionnaire file (read by `read_texts`).
nr_questions = 21

In [None]:
def read_texts(datadir_T2,
                labels_file_T2=None):
    """Read every subject XML file in a directory, optionally joining questionnaire labels.

    Parameters
    ----------
    datadir_T2 : str
        Directory to scan. Only entries whose name ends with ``xml`` are read.
    labels_file_T2 : str, optional
        Whitespace-separated file with one row per subject: the subject id followed by
        ``nr_questions`` answers (``nr_questions`` is a module-level constant).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``writings_df`` with one row per writing (plus ``label0``..``label<n-1>`` columns
        when a labels file is given) and ``labels_df`` indexed by subject. ``labels_df``
        is an empty frame when no labels file is given.
    """
    writings = []
    labels_df = pd.DataFrame()

    # sorted(): keep the row order independent of the filesystem's listing order.
    for subject_file in sorted(os.listdir(datadir_T2)):
        if not subject_file.endswith('xml'):
            continue
        print(subject_file)
        writings.extend(read_subject_writings(os.path.join(datadir_T2, subject_file)))
    writings_df = pd.DataFrame(writings)

    if labels_file_T2:
        label_columns = ['label%i' % i for i in range(nr_questions)]
        labels_df = pd.read_csv(labels_file_T2,
                                delimiter=r'\s+', names=['subject'] + label_columns)
        labels_df = labels_df.set_index('subject')

        # With no writings there is no 'subject' column to join on.
        if 'subject' in writings_df.columns:
            writings_df = writings_df.join(labels_df, on='subject')

    return writings_df, labels_df

In [None]:
# Labelled 2020 writings and the 2021 test collection (read without a labels file).
writings_df_test, labels_df_test = read_texts(datadir_test_T2, labels_file_test_T2)
writings_df_test2021, _ = read_texts(datadir_test2021)
# Previously saved writings frame. Opened in a `with` block so the handle is closed.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('data/writings_df_T2_liwc.pkl', 'rb') as writings_pickle:
    writings_df = pickle.load(writings_pickle)

In [None]:
# Append the labelled 2020 writings to the preloaded frame, then show per-subject counts.
# NOTE(review): this rebinds `writings_df` from itself, so re-running the cell appends
# `writings_df_test` again — re-run the loading cell first.
writings_df = pd.concat([writings_df, writings_df_test])
writings_df.groupby('subject').count()

In [None]:
# Per-subject count of non-null values in each column of the 2021 test collection.
writings_df_test2021.groupby('subject').count()
# writings_df_test2021
# pickle.dump(writings_df_test2021, open('data/writings_df_t3_2021_test_liwc.pkl', 'wb+'))

## Preprocess text

In [None]:
# Tokenizer that keeps runs of word characters (pattern `\w+`).
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    """Lower-case `t` and split it with the module-level `\\w+` tokenizer."""
    return tokenizer.tokenize(t.lower())
# Tweet tokenizer and English stopword list used by `tokenize_tweets` below.
tt = TweetTokenizer()
sw = stopwords.words("english")
def tokenize_tweets(t, tokenizer=tt, stop=True):
    """Tokenize `t` and keep only purely alphabetic (a-z) tokens.

    Parameters
    ----------
    t : str
        Text to tokenize; it is lower-cased first.
    tokenizer : object with a ``tokenize(str)`` method
        Defaults to the module-level tweet tokenizer.
    stop : bool
        When truthy (the default) stopwords are kept. When falsy, tokens found in
        the module-level stopword list `sw` are dropped.

    Returns
    -------
    list of str
    """
    kept = []
    for candidate in tokenizer.tokenize(t.lower()):
        if re.match("^[a-z]*$", candidate) is None:
            continue
        if stop or candidate not in sw:
            kept.append(candidate)
    return kept

def tokenize_fields(writings_df):
    """Add token and token-count columns for the title and text of each writing.

    Adds, in this order, ``tokenized_title``, ``title_len``, ``tokenized_text`` and
    ``text_len`` to `writings_df` (modified in place and returned). A field that is
    not a non-empty string yields ``None`` tokens; an empty or missing token list
    yields a ``None`` length.
    """
    def _tokens_or_none(value):
        return tokenize_tweets(value) if type(value) == str and value else None

    def _length_or_none(tokens):
        return len(tokens) if type(tokens) == list and tokens else None

    for field in ('title', 'text'):
        tokens_column = 'tokenized_' + field
        writings_df[tokens_column] = writings_df[field].apply(_tokens_or_none)
        writings_df[field + '_len'] = writings_df[tokens_column].apply(_length_or_none)
    return writings_df

In [None]:
# Add the tokenized title/text columns and their lengths to the 2021 test writings.
writings_df_test2021 = tokenize_fields(writings_df_test2021)

In [None]:
def merge_tokens(row):
    """Return a new list with the row's text tokens followed by its title tokens.

    `row` must expose ``tokenized_text`` and ``tokenized_title``; a falsy value
    (``None`` or an empty list) contributes nothing.
    """
    merged = []
    for field_tokens in (row.tokenized_text, row.tokenized_title):
        if field_tokens:
            merged.extend(field_tokens)
    return merged
# One combined token list per writing (text tokens first, then title tokens).
writings_df_test2021['all_tokens'] = writings_df_test2021.apply (lambda row: merge_tokens(row), axis=1)

In [None]:
# NOTE(review): repeats the earlier tokenization cell verbatim; it recomputes the same four columns.
writings_df_test2021 = tokenize_fields(writings_df_test2021)

In [None]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    """Count the tokens listed under `emotion` in the module-level `nrc_lexicon`.

    Parameters
    ----------
    tokens : list of str or None
    emotion : str
        Key into `nrc_lexicon`.
    relative : bool
        When true, return the count divided by the number of tokens.

    Returns
    -------
    float, int or None
        ``None`` when `tokens` is empty or ``None``.

    Raises
    ------
    KeyError
        If `emotion` is not a key of `nrc_lexicon`.
    """
    if not tokens:
        return None
    # Look the word collection up once rather than once per token.
    lexicon_words = nrc_lexicon[emotion]
    nr_emotion_words = sum(1 for t in tokens if t in lexicon_words)
    if relative:
        return nr_emotion_words / len(tokens)
    return nr_emotion_words

from functools import partial
# One column per emotion: the share of each writing's tokens found under that emotion.
# NOTE(review): `emotions` and `nrc_lexicon` are assigned further down, in the "Emotions"
# section, so on a fresh kernel this cell fails unless that section has been run first.
for emotion in emotions:
    writings_df_test2021[emotion] = writings_df_test2021['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))

In [None]:
# Share of first-person pronouns among each writing's tokens.
# NOTE(review): `encode_pronouns` is defined further down ("Personal pronouns" section);
# on a fresh kernel that cell has to run before this one.
writings_df_test2021['pronouns'] = writings_df_test2021['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
from liwc_readDict import readDict

# Read the LIWC dictionary as (word, category) pairs and invert it into
# category -> list of words, keeping the order in which words were read.
liwc = readDict('/home/anasab/resources/liwc.dic')
categories = [category for (_, category) in liwc]
liwc_dict = {}
for (word, category) in liwc:
    liwc_dict.setdefault(category, []).append(word)

In [None]:
def encode_liwc_categories(tokens, category_words, relative=True):
    """Count the tokens that match at least one word of a LIWC category.

    A token matches a lexicon entry when it equals the entry, when the entry ends
    with ``*`` and the token starts with the part before the ``*``, or when it
    equals the part of the entry before its first apostrophe. Each token is counted
    at most once.

    Parameters
    ----------
    tokens : list of str or None
    category_words : iterable of str
        Lexicon entries of one category; it is iterated once per token.
    relative : bool
        When true, return the count divided by the number of tokens.

    Returns
    -------
    float, int or None
        ``None`` when `tokens` is empty or ``None``.
    """
    if not tokens:
        return None

    def _matches(token, word):
        # endswith() rather than word[-1] so an empty lexicon entry cannot raise.
        return (token == word
                or (word.endswith('*') and token.startswith(word[:-1]))
                or token == word.split("'")[0])

    category_cnt = sum(1 for t in tokens
                       if any(_matches(t, word) for word in category_words))
    if relative:
        return category_cnt / len(tokens)
    return category_cnt

In [None]:
%%time
from functools import partial
# One column per LIWC category: the share of each writing's tokens matching that category.
# Categories that already exist as columns are skipped, so re-running only fills the gaps.
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
for categ in liwc_dict.keys():
    if categ in writings_df_test2021.columns:
        continue
    print("Encoding for category %s..." % categ)
    writings_df_test2021[categ] = writings_df_test2021['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


In [None]:
# Summary statistics of the per-writing text length in the training frame.
# NOTE(review): the visible cells only run `tokenize_fields` on the 2021 test frame, so
# `text_len` presumably comes from the pickled frame — confirm.
writings_df.text_len.describe()

In [None]:
# Summary statistics of the per-writing title length in the training frame.
writings_df.title_len.describe()

In [None]:
# Summary statistics of the number of non-null titles per subject.
writings_df.groupby('subject').count().title.describe()

In [None]:
# Columns now present on the 2021 test frame.
writings_df_test2021.columns

# Extract features

#### Emotions

In [None]:
def load_NRC(nrc_path):
    """Load a word-emotion lexicon file as a mapping emotion -> set of words.

    Each non-blank line must hold exactly three whitespace-separated fields,
    ``word emotion label``, with ``label`` an integer. The word is added to the
    emotion's set when the label is non-zero. An emotion whose labels are all zero
    still appears in the result, with an empty set.

    Parameters
    ----------
    nrc_path : str
        Path to the lexicon file.

    Returns
    -------
    dict of str -> set of str

    Raises
    ------
    ValueError
        If a line does not have three fields or its label is not an integer.
    """
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            words_for_emotion = emotion_words.setdefault(emotion, set())
            if int(label):
                words_for_emotion.add(word)
    return emotion_words

# Load the NRC emotion lexicon; `emotions` lists its emotion names in file order.
nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())


In [None]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    """Encode a token list as one value per emotion.

    Parameters
    ----------
    tokens : list of str
    emotion_lexicon : mapping of str -> collection of str
        Words associated with each emotion.
    emotions : sequence of str
        Emotions to encode, in output order.
    relative : bool
        When true each value is the matching-token count divided by the number of
        tokens; otherwise it is the raw count.

    Returns
    -------
    list
        One entry per emotion. All zeros when `tokens` is empty or ``None``; an
        emotion missing from the lexicon keeps the value 0 and a message is printed.
    """
    encoded_emotions = [0 for e in emotions]
    if not tokens:
        # Nothing to count; also avoids dividing by zero in the relative case.
        return encoded_emotions
    for i, emotion in enumerate(emotions):
        try:
            lexicon_words = emotion_lexicon[emotion]
        except KeyError:
            print("Emotion not found.")
            continue
        nr_matches = sum(1 for t in tokens if t in lexicon_words)
        encoded_emotions[i] = nr_matches / len(tokens) if relative else nr_matches
    return encoded_emotions

In [None]:
from liwc_readDict import readDict

# Reload the LIWC dictionary via `root_dir` and show the number of distinct categories.
# NOTE(review): this rebinds `liwc` and `categories` from the earlier LIWC cell;
# `categories` is a set from here on.
liwc = readDict(root_dir + '/resources/liwc.dic')

categories = set([c for (w,c) in liwc])
len(categories)

In [None]:
def encode_labels(labels):
    '''Convert questionnaire answers to signed integers.

    Values accepted by ``int()`` are converted directly. Otherwise the string form
    is inspected: an answer ending in "a" maps to its first digit ("2a" -> 2) and
    an answer ending in "b" maps to minus its first digit ("2b" -> -2).

    Values that fit none of these forms are logged with a warning and skipped, so
    the returned list can be shorter than `labels`.

    Raises ValueError if an "a"/"b" answer does not start with a digit.
    '''
    encoded_labels = []
    for l in labels:
        try:
            encoded_labels.append(int(l))
            continue
        except (ValueError, TypeError, OverflowError):
            logger.debug("Encoding label %s\n" % l)

        # Use the string form throughout so non-string and empty values cannot
        # raise on indexing.
        label_text = str(l)
        if label_text.endswith('a'):
            encoded_labels.append(int(label_text[0]))
        elif label_text.endswith('b'):
            encoded_labels.append(-int(label_text[0]))
        else:
            logger.warning("Coult not encode label %s\n" % l)
    return encoded_labels

In [None]:
# Replace each raw answer in the 21 label columns with its signed-integer encoding.
# NOTE(review): `encode_labels` skips values it cannot encode, so `[0]` raises
# IndexError for such a value (e.g. a missing label).
for i in range(21):
    writings_df['label%d'%i] = writings_df['label%d'%i].apply(lambda l: encode_labels([l])[0])

#### Personal pronouns

In [None]:
# First-person singular pronouns; the same set is the default of `encode_pronouns`.
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine", "myself"}, relative=True):
    """Count how many tokens are in `pronouns`.

    Returns ``np.nan`` for an empty or ``None`` token list; otherwise the count, or
    the count divided by the number of tokens when `relative` is true. The default
    `pronouns` set is only read, never modified.
    """
    if not tokens:
        return np.nan
    nr_pronouns = sum(1 for t in tokens if t in pronouns)
    return nr_pronouns / len(tokens) if relative else nr_pronouns

#### Stopwords

In [None]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    """Return a 0/1 presence vector with one entry per stopword.

    Entry ``i`` is 1 when ``stopwords[i]`` occurs anywhere in `tokens`. An empty or
    ``None`` token list yields all zeros.

    Note: the `stopwords` parameter shadows the nltk `stopwords` module inside this
    function; the default list is captured when the function is defined.
    """
    if not tokens:
        return [0 for s in stopwords]
    # Set membership instead of scanning the token list once per stopword.
    present_tokens = set(tokens)
    return [1 if stopword in present_tokens else 0 for stopword in stopwords]

# Aggregate by user

In [None]:
# Aggregate by users
def aggregate_df(writings_df):
    """Collapse a per-writing frame into one row per subject.

    Missing `text`/`title` values are replaced by '' first. Per subject, texts and
    titles are joined with a space, and the token and length columns are aggregated
    with 'sum'. When the frame has a `label1` column, `label0`..`label20` are
    aggregated with 'min'. Afterwards the joined title is appended to the joined
    text and `title_len` is added to `text_len`.

    Returns a new frame indexed by subject; the caller's frame is not modified.
    """
    writings_df = writings_df.fillna(value={'text': '', 'title':''})
    # NOTE(review): 'sum' on the tokenized columns presumably concatenates the
    # per-writing token lists — confirm how rows holding None are handled.
    column_functions = {'text': lambda t: " ".join(t), 
                                            'title': lambda t: " ".join(t),
                                            'tokenized_text': 'sum',
                                            'tokenized_title': 'sum',
                                            'text_len': 'sum',
                                            'title_len': 'sum'}
    # Only `label1` is checked as the marker that label columns exist.
    if 'label1' in writings_df.columns:
        column_functions.update({'label%i'%i: 'min' for i in range(21)})
    writings_per_user_df = writings_df.groupby('subject').aggregate(column_functions)
    #                                          'subset': 'min'})
    # Remaining missing values in any column become ''.
    writings_per_user_df = writings_per_user_df.fillna("")
    writings_per_user_df['text'] = writings_per_user_df['text'] + " " +  writings_per_user_df['title']
    writings_per_user_df['text_len'] = writings_per_user_df['text_len'] + writings_per_user_df['title_len']
    return writings_per_user_df

In [None]:
# Aggregate by users: one row per subject for the training data and the 2021 test data.
writings_df_per_user = aggregate_df(writings_df)
writings_df_test_per_user = aggregate_df(writings_df_test2021)

In [None]:
# Summary statistics of the combined text + title length per subject.
writings_df_per_user.text_len.describe()

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [None]:
def get_avg_embedding(writings_df, subject, column):
    """Average the values stored in `column` over all rows of one subject.

    Each cell is converted with ``np.array`` and the mean is taken over the
    resulting array of per-row arrays.
    """
    subject_rows = writings_df[writings_df['subject'] == subject]
    row_embeddings = subject_rows[column].apply(lambda l: np.array(l))
    return row_embeddings.values.mean()

In [None]:
# Per-subject average of the text and title embedding columns.
# NOTE(review): assumes `writings_df` has `use_embeddings_text` / `use_embeddings_title`
# columns; no visible cell creates them — presumably they come from the pickled frame.
avg_embeddings_text = {s: get_avg_embedding(writings_df, s, 'use_embeddings_text') 
                       for s in set(writings_df.subject.values)}
avg_embeddings_title = {s: get_avg_embedding(writings_df, s, 'use_embeddings_title') 
                       for s in set(writings_df.subject.values)}

In [None]:
# Wrap the dicts as named Series (indexed by subject) so they can be joined onto a frame.
series_embeddings_text = pd.Series(avg_embeddings_text)
series_embeddings_text.name = 'avg_embeddings_text'
series_embeddings_title = pd.Series(avg_embeddings_title)
series_embeddings_title.name = 'avg_embeddings_title'

In [None]:
# Attach the per-subject average embeddings to the per-user frame.
# The aggregated frame is created above as `writings_df_per_user`. The original cell
# read `writings_per_user_df`, which no visible cell assigns, so it failed with
# NameError on a fresh kernel. The result is still bound to `writings_per_user_df`
# because later cells read that name.
writings_per_user_df = writings_df_per_user.join(series_embeddings_text, on='subject')
writings_per_user_df = writings_per_user_df.join(series_embeddings_title, on='subject')

In [None]:
# Column names of the per-user frame after joining the average embeddings.
writings_per_user_df.columns.values

In [None]:
writings_per_user_df.join?

### User embeddings

In [None]:
# Copy the index (the subject id after the groupby) into a regular `subject` column,
# which the cells below use for lookups.
writings_df_per_user['subject'] = writings_df_per_user.index
writings_df_test_per_user['subject'] = writings_df_test_per_user.index

In [None]:
# Precomputed user embeddings, keyed by subject; the second file is merged into the
# first. Opened in `with` blocks so the file handles are closed.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('data/user_embeddings_t2_2020_model101.pkl', 'rb') as embeddings_file:
    all_embeddings_dict = pickle.load(embeddings_file)
with open('data/user_embeddings_t2_test_2020_model101.pkl', 'rb') as embeddings_file:
    all_embeddings_dict2 = pickle.load(embeddings_file)
all_embeddings_dict.update(all_embeddings_dict2)

In [None]:
# Precomputed user embeddings for the 2021 test subjects; shows how many subjects it holds.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('data/user_embeddings_t3_test2021_model101.pkl', 'rb') as embeddings_file:
    all_embeddings_dict_test = pickle.load(embeddings_file)
len(all_embeddings_dict_test.keys())

In [None]:
# Average each training subject's embeddings along axis 0.
# Raises KeyError if a subject in the frame has no entry in `all_embeddings_dict`.
writings_df_per_user['user_embeddings_avg'] = writings_df_per_user.subject.apply(
    lambda s: np.mean(all_embeddings_dict[s], axis=0))

In [None]:
# Same averaging for the 2021 test subjects, using the test embeddings dict.
writings_df_test_per_user['user_embeddings_avg'] = writings_df_test_per_user.subject.apply(
    lambda s: np.mean(all_embeddings_dict_test[s], axis=0))

In [None]:
# Inspect the averaged test embeddings.
writings_df_test_per_user['user_embeddings_avg']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# writings_embeddings['use_similarity'] = writings_embeddings['use_embeddings_text'].apply()

In [None]:
# Feature list for the SVM experiments: one averaged user embedding per subject, in the
# row order of `writings_df_per_user`. The commented lines are alternative feature sets.
# features = [np.random.rand(75) for i in range(20)]
# features = writings_per_user_df[list(categories) + emotions + ["pronouns"]]
# features = writings_per_user_df['avg_embeddings_title'].values.tolist()
features = writings_df_per_user['user_embeddings_avg'].values.tolist()

def cross_validation(folds=2):
    """Mean cross-validated SVM score over the 21 questionnaire labels.

    For each label column of `writings_df_per_user`, an RBF-kernel SVC (C=5) is
    scored with `folds`-fold cross-validation on the module-level `features`. The
    per-label mean scores are then averaged.
    """
    mean_score_per_label = []
    for question in range(21):
        labels = writings_df_per_user['label%d' % question].values
        classifier = SVC(kernel='rbf', C=5)
        fold_scores = cross_val_score(classifier, features, labels, cv=folds)
        mean_score_per_label.append(sum(fold_scores)/folds)
    return sum(mean_score_per_label)/21

print(cross_validation())

In [None]:
def results_for_label(features, l, train_examples=16):
    """Train an SVC on the first rows and report per-subject hits on the remaining rows.

    Parameters
    ----------
    features : sequence
        One feature vector per subject, in the row order of `writings_df_per_user`.
    l : int
        Index of the label column (``label<l>``).
    train_examples : int
        Number of leading rows used for training; the remaining rows are the test set.

    Returns
    -------
    numpy.ndarray of bool
        True where the prediction equals the true label, one entry per test row.
    """
    # Labels come from `writings_df_per_user`, the frame `features` is built from.
    # The name read here before, `writings_per_user_df`, is not assigned by any
    # visible cell.
    labels = writings_df_per_user['label%d' % l].values
    svmmodel=SVC()
    svmmodel.fit(features[:train_examples], labels[:train_examples])
    predictions = svmmodel.predict(features[train_examples:])
    print(l, predictions, labels[train_examples:], labels[:train_examples])
    return labels[train_examples:]==predictions

In [None]:
# Hold-out evaluation: one hit/miss array per question, each with one entry per test subject.
cumresults = []
for l in range(21):
    results = results_for_label(features, l)
    cumresults.append(results)

# Count, for each test subject, on how many of the questions the prediction was correct.
nrusers = len(cumresults[0])
nrques = 21
correct_per_user = {u: 0 for u in range(nrusers)}
for q, ques in enumerate(cumresults):
    for u, answ in enumerate(cumresults[q]):
        if answ:
            correct_per_user[u] += 1

# Per-subject hit rate, then the average hit rate (AHR) over subjects.
for u in correct_per_user:
    print("u", u, correct_per_user[u]/nrques)
print("AHR", sum(correct_per_user.values())/nrusers/nrques)

## KNN

In [None]:
# Shape of the feature matrix built from the `features` list.
np.array(features).shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Names of the label columns of the per-user training frame.
# Reads `writings_df_per_user`, the frame the KNN cells below use; the name read here
# before, `writings_per_user_df`, is not assigned by any visible cell.
label_set = [column for column in writings_df_per_user.columns
             if column.startswith('label')]
label_set

In [None]:
def classifier_for_label(label_col, subjects, n=5):
    """Fit a k-nearest-neighbours classifier for one questionnaire label.

    Every embedding stored for a subject in `all_embeddings_dict` becomes one
    training example carrying that subject's value of `label_col`, read from
    `writings_df_per_user`.

    Parameters
    ----------
    label_col : str
        Label column name, e.g. ``'label3'``.
    subjects : iterable of str
        Subjects to train on.
    n : int
        Number of neighbours.

    Returns
    -------
    KNeighborsClassifier
        The fitted classifier.
    """
    neigh = KNeighborsClassifier(n_neighbors=n)
    X = []
    y = []
    for s in subjects:
        subject_rows = writings_df_per_user[writings_df_per_user['subject'] == s]
        label = subject_rows[label_col].values[0]
        subject_embeddings = all_embeddings_dict[s]
        # One (embedding, label) training pair per stored embedding.
        y.extend(label for _ in subject_embeddings)
        X.extend(subject_embeddings)

    neigh.fit(X, y)
    return neigh


In [None]:
# Train one KNN classifier per question on every subject that has stored embeddings,
# and predict for the 2021 test subjects.
n=3
all_subjects = list(all_embeddings_dict.keys())
training_subjects = all_subjects
test_subjects = all_embeddings_dict_test.keys()
# Classifiers keyed by label column name ('label0'..'label20').
knns = {}
for label in range(21):
    knns['label%d'%label] = classifier_for_label('label%d'%label, subjects=training_subjects,n=n)

In [None]:
from collections import Counter
def _format_answer(value, question_idx, variant_questions=(15, 17)):
    """Render a signed integer answer in the questionnaire's string notation.

    Negative values become "<abs>b"; positive values on the questions listed in
    `variant_questions` become "<value>a"; everything else is the plain integer.
    """
    value = int(value)
    if value < 0:
        return str(-value) + 'b'
    if value > 0 and question_idx in variant_questions:
        return str(value) + 'a'
    return str(value)


# Three prediction strategies per subject and question:
#   avg    - classify the subject's averaged embedding
#   maj    - classify every embedding, take the most common label
#   avglab - classify every embedding, round the mean of the predicted labels
predictions_avg = {}
predictions_maj = {}
predictions_avglab = {}
true_labels = {}
for subject in test_subjects:
    predictions_avg[subject] = []
    predictions_maj[subject] = []
    predictions_avglab[subject] = []
    true_labels[subject] = []
    for l in range(21):
        label = "label%d" % l
        print("\n" + subject)
        if label in writings_df_test_per_user.columns:
            true_label = writings_df_test_per_user[writings_df_test_per_user['subject']==subject][label].values[0]
        else:
            # No gold labels on the test frame: 0 is only a placeholder.
            true_label = 0
        print("True label:", true_label)
        average_pred = knns[label].predict([np.mean(all_embeddings_dict_test[subject], axis=0)])[0]
        print("Average pred:", average_pred)
        all_preds = knns[label].predict(all_embeddings_dict_test[subject])
        majority_label = Counter(all_preds).most_common(1)[0][0]
        print("Majority label:", majority_label)
        print("All preds:", all_preds)
        # Round first, then choose the suffix from the rounded value. Deciding on
        # the unrounded mean could emit "0a", which is not a valid answer.
        average_lab_pred = round(float(np.mean(all_preds)))
        true_labels[subject].append(str(true_label))

        predictions_avg[subject].append(_format_answer(average_pred, l))
        predictions_maj[subject].append(_format_answer(majority_label, l))
        predictions_avglab[subject].append(_format_answer(average_lab_pred, l))


In [None]:
# Inspect the rounded-mean predictions (subject -> list of 21 answer strings).
predictions_avglab

In [None]:
# pickle.dump(predictions_avg, open('t2_predictions_k5_alltexts_userembeddings101_avg_dev.pkl', 'wb+'))
# pickle.dump(predictions_maj, open('t2_predictions_k5_alltexts_userembeddings101_maj_dev.pkl', 'wb+'))
# pickle.dump(predictions_avglab, open('t2_predictions_k5_alltexts_userembeddings101_avglab_dev.pkl', 'wb+'))
# pickle.dump(true_labels, open('t2_true_labels_dev.pkl', 'wb+'))

In [None]:
import datasets    
import numpy as np

_DESCRIPTION_T3 = """\
Metrics for measuring the performance of prediction models for eRisk 2021 Task 2 and 3.
Include decision-based performance metrics: decision-based F1, lantency-weighted F1, ERDE score.
"""

_CITATION = ""

_KWARGS_DESCRIPTION_T3 = """
Calculates how good are predictions of answers given to the depression assessment questionnaire,
given some references, using different metrics.
Each prediction and reference is expected to be of length 21, corresponding to the 21 questions.
    predictions: list of predictions to score, one for each user. Each prediction
        should be a list of 21 strings in {0, 1, 2, 3, 1a, 1b, 2a, 2b, 3a, 3b}.
    references: list of references for each prediction, one for each user. Each
        reference should be a list of 21 strings in {0, 1, 2, 3, 1a, 1b, 2a, 2b, 3a, 3b}.

Returns:
    AHR: average hit rate - ratio of cases where the automatic questionnaire
        has exactly the same answer as the real questionnaire, averaged over users.
    ACR: average closeness rate - the difference between the real and the automated answer,
        (taking into account answers are on an ordinal scale), relative to the total number
        of possible answers for the question, averaged over users.
    ADODL: average difference between overall depression levels - difference in overall
        depression levels, computed as the sum of the answers given for all questions,
        (depression levels are integers between 0 and 63).
        averaged over users.
    DCHR: fraction of cases where the automated questionnaire led to a depression category
        that is equivalent to the depression category obtained from the real questionnaire
        (among 4 possible categories: minimal/mild/moderate/severe depression). 
    
    
Examples:
    >>> t3_metric = EriskScoresT3()
    >>> results = t3_metric.compute(
        predictions = [['0', '1', '2', '3', '1a', '1b', '2a', '2b', '3', '2', 
                    '0', '1', '2', '3', '1a', '1b', '2a', '2b', '3', '2', '3b'],
                  ['0', '1', '2', '3', '1a', '1b', '2a', '2b', '3', '2', 
                    '1', '2', '3', '3a', '1', '1', '2', '2a', '3b', '2', '3b']],
        references = [['0', '0', '0', '3', '1', '1', '2a', '2b', '3', '2', 
                    '1', '1', '1', '3', '1a', '1b', '2', '2', '3', '2', '3b'],
                 ['0', '1', '2', '3', '1', '1', '2', '2', '3', '2', 
                    '1', '2', '3', '3', '1', '1', '2', '2', '3', '2', '3']]
        )
    >>> print(results)
    >>> {'AHR': 0.6190476190476191,
         'ACR': 0.9603174603174602,
         'ADODL': 0.9761904761904762,
         'DCHR': 1.0}
   
"""

def _depression_category(score):
    if score >= 0 and score <=9:
        return 'minimal'
    if score >= 10 and score <= 18:
        return 'mild'
    if score >= 19 and score <= 29:
        return 'moderate'
    if score >= 30 and score <= 63:
        return 'severe'
    raise ValueError("Invalid score for depression questionnaire: %d" % score)

def _score(reference):
    total_score = 0
    for answer in reference:
        # Consider only first letter and convert to int
        answer_int = int(answer[0])
        total_score += answer_int
    return total_score



def _dl(l1, l2, max_dl=63):
    ad = abs(l1 - l2)
    return (max_dl - ad)/max_dl
    

class EriskScoresT3(datasets.Metric):
    """Questionnaire metrics (AHR, ACR, ADODL, DCHR) packaged as a `datasets.Metric`.

    Predictions and references are sequences with one entry per user; each entry is
    a sequence of answer strings whose first character is the answer's digit.
    """

    def _info(self):
        """Describe the metric and declare both inputs as sequences of strings."""
        feature_spec = datasets.Features({
            'predictions': datasets.Sequence(datasets.Value('string')),
            'references': datasets.Sequence(datasets.Value('string')),
        })
        return datasets.MetricInfo(
            description=_DESCRIPTION_T3,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION_T3,
            features=feature_spec,
            codebase_urls=[],
            reference_urls=[],
        )

    def _ahr(self, predictions, references):
        """Average hit rate: share of exactly matching answer strings, averaged over users."""
        nr_users = len(predictions)
        hit_rates = []
        for u in range(nr_users):
            prediction, reference = predictions[u], references[u]
            hits = sum(given == expected for given, expected in zip(prediction, reference))
            hit_rates.append(hits / len(prediction))
        return sum(hit_rates)/nr_users

    def _cr(self, prediction, reference, nr_answers=4):
        """Closeness rate of one user: mean of (mad - |difference|) / mad over questions."""
        nr_questions = len(prediction)
        # Maximum absolute difference between two answers.
        mad = nr_answers - 1
        closeness_scores = []
        for q in range(nr_questions):
            # Only the leading digit of each answer is compared.
            ad = abs(int(prediction[q][0]) - int(reference[q][0]))
            closeness_scores.append((mad - ad) / mad)
        return sum(closeness_scores)/nr_questions

    def _acr(self, predictions, references):
        """Average closeness rate over users."""
        nr_users = len(predictions)
        closeness_rates = []
        for u in range(nr_users):
            closeness_rates.append(self._cr(predictions[u], references[u]))
        return sum(closeness_rates)/nr_users

    def _adodl(self, predictions, references):
        """Average closeness of the overall depression levels (total scores) over users."""
        nr_users = len(predictions)
        predicted_totals = [_score(p) for p in predictions]
        reference_totals = [_score(r) for r in references]
        closeness = []
        for u in range(nr_users):
            closeness.append(_dl(predicted_totals[u], reference_totals[u]))
        return sum(closeness)/nr_users

    def _dchr(self, predictions, references):
        """Share of users whose predicted depression category equals the reference category."""
        nr_users = len(predictions)
        predicted_categories = [_depression_category(_score(predictions[u]))
                                for u in range(nr_users)]
        reference_categories = [_depression_category(_score(references[u]))
                                for u in range(nr_users)]
        matches = sum(given == expected
                      for given, expected in zip(predicted_categories, reference_categories))
        return matches / nr_users

    def _compute(self, predictions, references):
        """Return all four metrics in a dict keyed by metric name."""
        scores = {}
        scores['AHR'] = self._ahr(predictions, references)
        scores['ACR'] = self._acr(predictions, references)
        scores['ADODL'] = self._adodl(predictions, references)
        scores['DCHR'] = self._dchr(predictions, references)
        return scores

In [None]:
# import pickle
# predictions_avg = pickle.load(open('t2_predictions_avg_dev.pkl', 'rb'))
# predictions_maj = pickle.load(open('t2_predictions_maj_dev.pkl', 'rb'))
# true_labels = pickle.load(open('t2_true_labels_dev.pkl', 'rb'))

In [None]:
# Score the rounded-mean predictions against the stored true labels.
# NOTE(review): when the test frame has no label columns, the prediction cell stores '0'
# placeholders as true labels, so these scores are then not meaningful.
metrics = EriskScoresT3()
metrics.compute(
predictions = predictions_avglab.values(),
references = true_labels.values())

In [None]:
# Score the majority-vote predictions against the stored true labels.
metrics = EriskScoresT3()
metrics.compute(
predictions = predictions_maj.values(),
references = true_labels.values())

In [None]:
# Score the averaged-embedding predictions against the stored true labels.
metrics = EriskScoresT3()
metrics.compute(
predictions = predictions_avg.values(),
references = true_labels.values())

In [None]:
# Print one line per subject: the subject id followed by its space-separated answers.
for u, preds in predictions_avg.items():
    print(u, " ".join(preds))