In [None]:
# import pandas as pd
# import xml.etree.ElementTree as ET
import glob, os
# import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
# import logging
# import sys
# from sklearn.utils import class_weight
# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras import optimizers


# from matplotlib import pyplot as plt
import json
from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.corpus import stopwords
# import re

In [None]:
# GPU / TensorFlow-Hub environment settings, applied in one go.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    # Only reserve 1 GPU. Note: when starting the kernel, this needs to be run
    # for gpu_available to be true.
    # When a cudnn implementation is not found, use '-1' here instead.
    "CUDA_VISIBLE_DEVICES": "0",
    'TFHUB_CACHE_DIR': '/home/anasab/tf_cache',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [None]:
# dataset_type selects the cached corpus to load; a truthy transfer_type makes
# later cells load hyperparameters from a pretrained model.
dataset_type, transfer_type = "selfharm", None

In [None]:
# tf.test.is_gpu_available()

# Read data

### eRisk 2020 T1

In [None]:
root_dir = '/home/anasab/'

# Data sub-directories, keyed by subset.
datadirs_T1_2020 = dict(train=['./data/'], test=['./DATA/'])

# Root directory of each subset (root_dir is prepended as-is).
datadir_root_T1_2020 = dict(
    train=root_dir + '/eRisk/data/eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/',
    test=root_dir + '/eRisk/data/2020/T1/',
)

# Label file names, keyed by subset.
labels_files_T1_2020 = dict(
    train=['golden_truth.txt'],
    test=['T1_erisk_golden_truth.txt'],
)

In [None]:
from read_erisk_data import read_texts_2020
if dataset_type == 'selfharm':
#     writings_df = read_texts_2020(datadir_root_T1_2020,
#                    datadirs_T1_2020,
#                    labels_files_T1_2020,
#                    test_suffix='0000',
#                     chunked_subsets=None)
    # Load the pickled writings table instead of re-reading the raw corpus
    # (see the commented-out read_texts_2020 call above). The file is opened
    # in a `with` block so the handle is closed deterministically.
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open('data/writings_df_%s_all' % dataset_type, 'rb') as writings_file:
        writings_df = pickle.load(writings_file)


## Preprocess text

In [None]:
regtokenizer = RegexpTokenizer(r'\w+')
tweet_tokenizer = TweetTokenizer()
sw = stopwords.words("english")

def tokenize(t, tokenizer=regtokenizer):
    """Lower-case ``t`` and split it into tokens.

    Parameters:
        t: text to tokenize.
        tokenizer: object exposing ``tokenize(str)``; defaults to the
            word-character regexp tokenizer ``regtokenizer``.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the lower-cased text.
    """
    # Use the tokenizer that was passed in; the previous version always used
    # regtokenizer, silently ignoring the argument.
    return tokenizer.tokenize(t.lower())

In [None]:
from feature_encoders import encode_pronouns, encode_emotions, tokenize_fields, encode_stopwords

In [None]:
# Run tokenize_fields over the writings table, using `tokenize` as the tokenizer.
# NOTE(review): this reassigns writings_df, so re-running the cell feeds the
# already-processed frame back in; confirm tokenize_fields tolerates that.
writings_df = tokenize_fields(writings_df, tokenize_fct=tokenize)

## Extract features and encode data

In [None]:
# Feature-related hyperparameters, read from config.json in the working
# directory. The `with` block closes the file once it has been parsed.
with open('config.json') as config_file:
    hyperparams_features = json.load(config_file)


In [None]:
if transfer_type:
    # load_params is otherwise only imported much further down the notebook,
    # so on a fresh kernel this branch would raise NameError without it.
    from load_save_model import load_params

    # Remember the path, because load_params replaces hyperparams_features.
    pretrained_model_path = hyperparams_features['pretrained_model_path']
    hyperparams, hyperparams_features = load_params(pretrained_model_path)
    hyperparams_features['pretrained_model_path'] = pretrained_model_path

In [None]:
from resource_loading import load_NRC, load_LIWC

# NRC emotion lexicon; its keys are used as the list of emotions.
nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())

# LIWC dictionary; its keys are used as the set of categories.
liwc_dict = load_LIWC(root_dir + '/resources/liwc.dic')
liwc_categories = set(liwc_dict.keys())

# Pickled LIWC lookup. Opened in a `with` block so the handle is closed.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
with open("data/liwc_categories_for_vocabulary_erisk_clpsych_stop_20K.pkl", "rb") as liwc_cache_file:
    liwc_words_for_categories = pickle.load(liwc_cache_file)

stopword_list = stopwords.words("english")


### Encode data

In [None]:
from data_loading import load_erisk_data
from resource_loading import load_embeddings

In [None]:
from collections import Counter
# Count how many rows of writings_df fall under each value of the 'subset' column.
Counter(writings_df['subset'].values)

In [None]:

# load_erisk_data returns the per-user data, the subject split and the vocabulary.
user_level_data, subjects_split, vocabulary = load_erisk_data(
    writings_df,
    hyperparams_features=hyperparams_features,
    logger=None,
    by_subset=True,
)

### Data Generator

In [None]:
from DataGenerator import DataGenerator

In [None]:
from model import build_hierarchical_model

In [None]:
# Generator over the 'train' split with hand-picked settings.
d = DataGenerator(
    user_level_data=user_level_data,
    subjects_split=subjects_split,
    set_type='train',
    batch_size=32,
    seq_len=512,
    hyperparams_features=hyperparams_features,
    post_groups_per_user=None,
    posts_per_group=10,
    post_offset=0,
    pronouns=["i", "me", "my", "mine", "myself"],
    compute_liwc=False,
    max_posts_per_user=None,
    shuffle=True,
    keep_last_batch=True,
)

In [None]:
# Size of the 'valid' entry of the subject split.
len(subjects_split['valid'])

## Train

In [None]:
from model import build_hierarchical_model


In [None]:
# Training hyperparameters, read from hyperparams.json in the working
# directory. The `with` block closes the file once it has been parsed.
with open('hyperparams.json') as hyperparams_file:
    hyperparams = json.load(hyperparams_file)
hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

if transfer_type:
#     hyperparams, _ = load_params(hyperparams_features['pretrained_model_path'])
    # With the load_params call above commented out, 'optimizer' has always
    # been set by this point, so this guard currently never fires.
    if 'optimizer' not in hyperparams:
        hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                       decay=hyperparams['decay'])

In [None]:
# %%time
from train import initialize_experiment, train

# experiment = initialize_experiment(hyperparams, nrc_lexicon_path, emotions, hyperparams_features['embeddings_path'], 
#                       dataset_type, transfer_type, hyperparams_features)
# models, history = train(user_level_data, subjects_split, 
#           hyperparams=hyperparams, hyperparams_features=hyperparams_features, 
#               experiment=experiment, dataset_type=dataset_type, transfer_type=transfer_type,stopwords_dim=len(stopword_list),
#               validation_set='valid',
#           version=102, epochs=25, start_epoch=0
#
#                                        )

## Predict

In [None]:
from load_save_model import load_saved_model_weights, load_params

In [None]:
# Checkpoint used for prediction. Previously tried alternatives:
#   'models/lstm_selfharm_hierarchical113'
#   'models/lstm_selfharm_hierarchical107'
hyperparams_features['pretrained_model_path'] = 'models/lstm_selfharm_hierarchical117'


In [None]:
# Read the hyperparameters stored with the checkpoint, attach an Adam
# optimizer, and obtain `model` from load_saved_model_weights.
hyperparams, _ = load_params(hyperparams_features['pretrained_model_path'])
hyperparams['optimizer'] = optimizers.Adam(
    lr=hyperparams['lr'],  # beta_1=0.9, beta_2=0.999, epsilon=0.0001,
    decay=hyperparams['decay'],
)
model = load_saved_model_weights(
    hyperparams_features['pretrained_model_path'],
    hyperparams,
    hyperparams_features,
    h5=True,
)

In [None]:
# Generator over the 'train' split, now using the checkpoint's posts_per_group.
d = DataGenerator(
    user_level_data=user_level_data,
    subjects_split=subjects_split,
    set_type='train',
    batch_size=32,
    seq_len=256,
    hyperparams_features=hyperparams_features,
    post_groups_per_user=None,
    posts_per_group=hyperparams['posts_per_group'],
    post_offset=0,
    pronouns=["i", "me", "my", "mine", "myself"],
    compute_liwc=False,
    max_posts_per_user=None,
    shuffle=True,
    keep_last_batch=True,
)
# predictions = model.predict(d)

In [None]:
# from evaluate_subjects import evaluate_for_subjects
# evaluate_for_subjects(model, subjects_split['test'], 
#                       user_level_data, hyperparams, hyperparams_features, rolling_window=3)


### Server data

In [None]:
from collections import Counter
from feature_encoders import encode_liwc_categories
def load_erisk_server_data(datarounds_json, tokenizer,
                    pronouns = ["i", "me", "my", "mine", "myself"],
                   logger=None):
    """Group server datapoints by user into tokenized and raw texts.

    Parameters:
        datarounds_json: iterable of datapoint dicts. Each must contain a
            "nick" key and may contain "title" and/or "content" strings.
        tokenizer: object exposing ``tokenize(str)`` that returns an iterable
            of tokens.
        pronouns: unused; kept so existing callers keep working.
        logger: unused; kept so existing callers keep working.

    Returns:
        A ``(user_level_texts, subjects_split)`` tuple.
        ``user_level_texts`` maps each nick to
        ``{'texts': [token list per datapoint], 'raw': [raw string per datapoint]}``.
        ``subjects_split`` is ``{'test': [nicks in first-seen order]}``.

    Raises:
        KeyError: if a datapoint has no "nick" key.
    """
    subjects_split = {'test': []}
    user_level_texts = {}
    for datapoint in datarounds_json:
        words = []
        raw_text = ""
        # Title first, then content; the raw strings are concatenated without
        # a separator, exactly as before.
        for field in ("title", "content"):
            if field in datapoint:
                words.extend(tokenizer.tokenize(datapoint[field]))
                raw_text += datapoint[field]

        # The LIWC encoding that used to be computed here was never stored or
        # returned, so that per-datapoint call has been dropped.
        nick = datapoint["nick"]
        if nick not in user_level_texts:
            user_level_texts[nick] = {'texts': [], 'raw': []}
            subjects_split['test'].append(nick)
        user_level_texts[nick]['texts'].append(words)
        user_level_texts[nick]['raw'].append(raw_text)

    return user_level_texts, subjects_split

In [None]:
def read_json_datapoint(jlpath):
    """Read a JSON-lines file into a list of parsed objects.

    Parameters:
        jlpath: path of a file holding one JSON document per line.

    Returns:
        List of the parsed documents, in file order. Lines that are empty or
        contain only whitespace are skipped instead of raising.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(jlpath) as f:
        # A stray blank line (e.g. a trailing newline pair) is not a record.
        return [json.loads(line) for line in f if line.strip()]

In [None]:
import logging, sys
from resource_loading import load_vocabulary
# Server datapoints, concatenated in the order the files are listed.
# NOTE(review): 'data11.jl' is listed twice, so its datapoints appear twice in
# data_erisk; confirm this duplication is intended.
data_erisk = []
for jl_path in ('data11.jl', 'data8.jl', 'data11.jl'):
    data_erisk.extend(read_json_datapoint(jl_path))

erisk_server_data, erisk_server_subjects_split = load_erisk_server_data(
    data_erisk,
    # Raw string: '\w' in a plain string literal is an invalid escape sequence.
    tokenizer=RegexpTokenizer(r'\w+'),
)

In [None]:
from EriskDataGenerator import EriskDataGenerator

# Feed the server data to an EriskDataGenerator as two rounds and predict on it.
generator = EriskDataGenerator(
    hyperparams_features=hyperparams_features,
    seq_len=hyperparams['maxlen'],
    batch_size=hyperparams['batch_size'],
    max_posts_per_user=None,
    posts_per_group=hyperparams['posts_per_group'],
    post_groups_per_user=None,
    shuffle=False,
    return_subjects=True,
    compute_liwc=True,
)
for _ in range(2):  # the same data is added as two consecutive rounds
    generator.add_data_round(data_erisk)
model.predict(generator)

In [None]:
# Print the NLTK English stopword list, one word per line.
print("\n".join(stopwords.words("english")))

In [None]:
# %%time
# erisk_server_data, erisk_server_subjects_split, vocabulary = load_erisk_server_data(data,
#                        tokenizer=regtokenizer,
#                        liwc_words_for_categories=liwc_words_for_categories,
#                     voc_size=hyperparams_features['max_features'],
#                     emotion_lexicon=nrc_lexicon,
#                     emotions=emotions,
#                     user_level=hyperparams_features['user_level'],
#                        vocabulary=vocabulary_dict,
#     #                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
#                     logger=logger)

In [None]:
# Generator over the server data ('test' split); display its `data` attribute.
dg = DataGenerator(
    erisk_server_data,
    erisk_server_subjects_split,
    set_type='test',
    hyperparams_features=hyperparams_features,
    seq_len=hyperparams['maxlen'],
    batch_size=hyperparams['batch_size'],
    max_posts_per_user=None,
    posts_per_group=hyperparams['posts_per_group'],
    post_groups_per_user=None,
    shuffle=False,
    return_subjects=True,
    compute_liwc=True,
)
dg.data

In [None]:
%%time

# Predict on the server data; this generator is built without return_subjects.
server_erisk_predictions = model.predict(
    DataGenerator(
        erisk_server_data,
        erisk_server_subjects_split,
        set_type='test',
        hyperparams_features=hyperparams_features,
        seq_len=hyperparams['maxlen'],
        batch_size=hyperparams['batch_size'],
        max_posts_per_user=None,
        posts_per_group=hyperparams['posts_per_group'],
        post_groups_per_user=None,
        shuffle=False,
        compute_liwc=True,
    )
)


In [None]:
import pandas as pd
# Summary statistics of the flattened server predictions.
pd.Series(server_erisk_predictions.flatten()).describe()

In [None]:
# data_generator = EriskDataGenerator(hyperparams_features=hyperparams_features,
#                             seq_len=hyperparams['maxlen'], batch_size=1,
#                                  max_posts_per_user=None,
#                                 posts_per_group=hyperparams['posts_per_group'],
#                                 post_groups_per_user=None, 
#                                 shuffle=False, return_subjects=True,
#                                         compute_liwc=True)
# Hand-written example rounds in the server's datapoint format.
data_round1 = [
    {
        "redditor": 338,
        "content": "",
        "date": "2014-12-12T04:21:13.000+0000",
        "id": 168996,
        "title": "    Copy the Reindeer",
        "number": 1,
        "nick": "subject8081",
    },
    {
        "redditor": 339,
        "content": "    When I don't have the aisle seat and have to climb over people to use the bathroom. I have a tiny girl bladder.",
        "date": "2013-10-10T13:17:01.000+0000",
        "id": 169297,
        "title": "",
        "number": 1,
        "nick": "subject2621",
    },
    {
        "redditor": 340,
        "content": "    I have a question about being a visitor in Nioh(Random encounters)",
        "date": "2017-05-09T17:01:50.000+0000",
        "id": 169531,
        "title": "    Nioh - Become a visitor",
        "number": 1,
        "nick": "subject992",
    },
]

# Second round: one new post for a user already seen in round 1.
data_round2 = [
    {
        "redditor": 340,
        "content": "    New text",
        "date": "2017-05-09T17:02:50.000+0000",
        "id": 169532,
        "title": "    Nioh - Become a visitor",
        "number": 2,
        "nick": "subject992",
    },
]




# import numpy as np
# def scores_to_alerts(predictions_dict, conservative_alerts=False, 
#                      alert_threshold=0.5, rolling_window=0):
#     '''Generates alerts decisions (1/0) from a dictionary of prediction scores per user
#     Parameters:
#         predictions_dict: dictionary with ordered predictions per user (indexed by user id) 
#         rolling_window: window of rolling average to be computed across prediction scores
#             history for a given user in order to get a "smoothed" prediction for each datapoint
#             If 0, then no rolling average is computed.
#         conservative_alerts: if True, will only emit positive alerts if enough input posts are
#             used for prediction (will only trust predictions based on at least as many posts
#             as were used in one datapoint in the training stage)
#         posts_per_datapoint: integer denoting number of posts per datapoint used in the training stage
#             used in case of conservative_alerts=True
#         alert_threshold: threshold on the score value above which to emit a positive alert
#         Returns: nested dictionary indexed by users, including the original prediction score
#             ('scores' key) and the alert value (1/0) (the 'alerts' key)'''
#     users = predictions_dict.keys()
#     scores_per_user = dict(predictions_dict)
#     def _rolling_average(scores, window):
#         if window < len(scores):
#             return scores
#         rolling_predictions = []
#         rolling_predictions[:rolling_window-1] = scores[:rolling_window-1]
#         rolling_predictions.extend(np.convolve(scores, np.ones(rolling_window), 'valid') / rolling_window)
#         return rolling_predictions
#     if rolling_window:
#         scores_per_user = {u: _rolling_average(scores_per_user[u], rolling_window) for u in users}
#     alerts_per_user = {}
#     for u in users:
#         if conservative_alerts:
#             alerts_per_user[u] = [0 for p in scores_per_user[u]]
#         else:
#             alerts_per_user[u] = [int(p>=alert_threshold) for p in scores_per_user[u]]
#     return {u: {'scores': scores_per_user[u], 'alerts': alerts_per_user[u]} for u in users}



# # model_path = RUNS_MODEL_PATHS[run_nr]
# # hyperparams, hyperparams_features = load_params(model_path)

# # model = load_saved_model_weights(model_path, hyperparams, hyperparams_features, 
# #                                                   h5=True)
# rolling_window=50
# conservative_alerts=False
# alert_threshold=0.5
# data_generator = EriskDataGenerator(hyperparams_features=hyperparams_features,
#                             seq_len=hyperparams['maxlen'], batch_size=hyperparams['batch_size'],
#                                  max_posts_per_user=None,
#                                 posts_per_group=hyperparams['posts_per_group'],
#                                 post_groups_per_user=None, 
#                                 shuffle=False, return_subjects=True,
#                                         compute_liwc=True)

# for data_round in [data_round1, data_round2]:
#     data_generator.add_data_round(data_round)


# predictions_per_user = {}
# for dp in data_generator:
#     prediction = model.predict_step(dp)
#     u = dp[1][0]
#     print(u)
#     if u not in predictions_per_user:
#         predictions_per_user[u] = []
#     predictions_per_user[u].append(prediction.numpy()[0].item())
# alerts_per_user = scores_to_alerts(predictions_per_user, rolling_window=rolling_window,
#     alert_threshold=alert_threshold,
#     conservative_alerts=(conservative_alerts and len(data_rounds) < hyperparams['posts_per_group']))
# print(alerts_per_user)



In [None]:
from predict_erisk import predict, scores_to_alerts
# predict(run_nr=1, data_rounds=[data_round1, data_round2])

In [None]:
# The rounds to score. This name is also read by the conservative_alerts
# expression below, which previously referenced an undefined `data_rounds`
# and would raise NameError as soon as conservative_alerts was set to True.
data_rounds = [data_round1, data_round2]
rolling_window = 0
alert_threshold = 0.5
conservative_alerts = False

data_generator = EriskDataGenerator(hyperparams_features=hyperparams_features,
                                    seq_len=hyperparams['maxlen'], batch_size=1,
                                    max_posts_per_user=None,
                                    posts_per_group=hyperparams['posts_per_group'],
                                    post_groups_per_user=None,
                                    shuffle=False, return_subjects=True,
                                    compute_liwc=True)
for data_round in data_rounds:
    data_generator.add_data_round(data_round)

# Collect one score list per user, in the order the generator yields items.
predictions_per_user = {}
for dp in data_generator:
    prediction = model.predict_step(dp)
    # presumably dp[1] holds the subject ids of the batch (return_subjects=True,
    # batch_size=1); verify against EriskDataGenerator
    u = dp[1][0]
    print(dp[1])
    predictions_per_user.setdefault(u, []).append(prediction.numpy()[0].item())

alerts_per_user = scores_to_alerts(predictions_per_user, rolling_window=rolling_window,
    alert_threshold=alert_threshold,
    conservative_alerts=(conservative_alerts and len(data_rounds) < hyperparams['posts_per_group']))
print(alerts_per_user)
print(predictions_per_user)

In [None]:
# Score the same two example rounds through predict_erisk.predict, using run number 1.
predict(run_nr=1, data_rounds=[data_round1, data_round2])

### Sending results to server!

In [None]:
# subjects = [d['nick'] for d in read_json_datapoint("client/data0.jl")]
# results = {s: 0 for s in subjects}

In [None]:
# def get_next_data(rnd, results):
#     # TODO: send results to get data
#     response = build_response(results)
#     # Send response
#     data = {"..."}
#     # Make sure it's the correct round
#     assert data['number'] == rnd
#     serialize_data(data)
#     return data

In [None]:
# def get_next_data_dummy(rnd, results):
#     return read_json_datapoint("client/data0.jl")

In [None]:
# Results store: one (initially empty) dict per model key; later cells index it
# as results[model_key][round].
# NOTE(review): models_runs is not defined in the visible part of the notebook.
results = dict((key, {}) for key in models_runs)

In [None]:
def get_data_chunk(rnds):
    """Read the stored server data for every round in ``rnds``.

    Parameters:
        rnds: iterable of integer round numbers.

    Returns:
        A list with one entry per round, in the given order; each entry is the
        list returned by read_json_datapoint for ``data_server/data<round>.jl``.
    """
    chunk = []
    for round_nr in rnds:
        chunk.append(read_json_datapoint("data_server/data%d.jl" % round_nr))
    return chunk

In [None]:
%%time
def predict_for_round_chunk(model, hyperparams, hyperparams_features, vocabulary, data_chunk, subjects=[],
                           model_key='', cache_round=None): 
    """Score the users found in ``data_chunk`` and return one score per subject.

    The result dict is seeded from ``load_results(model_key, cache_round)``
    when ``cache_round`` is truthy (so round 0 counts as "no cache"), and
    otherwise with a score of 0 for every entry of ``subjects``. Entries are
    then overwritten with ``model.predict_on_batch`` outputs, stored under the
    key ``"subject" + str(s)``.

    NOTE(review): the load_erisk_server_data call below passes keyword
    arguments (liwc_words_for_categories, voc_size, emotion_lexicon, emotions,
    user_level, vocabulary) and unpacks three return values, whereas the
    load_erisk_server_data defined in this notebook accepts only
    (datarounds_json, tokenizer, pronouns, logger) and returns two values.
    `logger` is also not defined in the visible notebook. This function looks
    written against an older helper API; confirm before relying on it.
    """
    # preload for subjects not occurring in the round with results from previous round
#     results = {s: 0 for s in subjects}
    if cache_round:
        results = load_results(model_key, cache_round)
    else:
        results = {s: 0 for s in subjects}


    erisk_server_data, erisk_server_subjects_split, vocabulary = load_erisk_server_data(data_chunk,
                       tokenizer=regtokenizer,
                       liwc_words_for_categories=liwc_words_for_categories,
                    voc_size=hyperparams_features['max_features'],
                    emotion_lexicon=nrc_lexicon,
                    emotions=emotions,
                    user_level=1,
                       vocabulary=vocabulary,
                    logger=logger)

    # The loop variable `subjects` shadows the `subjects` parameter from here on.
    for features, subjects, _ in DataGenerator(erisk_server_data, erisk_server_subjects_split, 
                                         set_type='test', vocabulary=vocabulary, 
                                       hierarchical=hyperparams['hierarchical'],
                                    seq_len=hyperparams['maxlen'], batch_size=hyperparams['batch_size'],
                                         max_posts_per_user=None,
                                       pad_with_duplication=False,
                                        posts_per_group=hyperparams['posts_per_group'],
                                        post_groups_per_user=None, 
                                         sample_seqs=False, shuffle=False,
                                      return_subjects=True):
        predictions = model.predict_on_batch(features)
        print(len(features[0]), len(subjects), len(predictions), len(results))
        # A later batch for the same subject overwrites the earlier score.
        for i,s in enumerate(subjects):
            results["subject" + str(s)] = predictions[i].item()
    return(results)


In [None]:
def build_response(results, decision_thresh=0.5, model_name='', rnd=0):
    """Turn per-subject scores into a response list and save it as JSON.

    Parameters:
        results: dict mapping subject nick to score.
        decision_thresh: scores greater than or equal to this get decision 1,
            all others decision 0.
        model_name: run identifier used in the output file name.
        rnd: integer round number used in the output file name.

    Returns:
        List of ``{'nick', 'score', 'decision'}`` dicts, in ``results`` order.
        The same list is written to
        ``data_server/response_run<model_name>_rnd<rnd>.json``.
    """
    response = [
        {'nick': subject, 'score': score,
         'decision': 1 if score >= decision_thresh else 0}
        for subject, score in results.items()
    ]
    # Write inside a `with` block so the file is flushed and closed before the
    # function returns (the handle used to be left open).
    with open("data_server/response_run%s_rnd%d.json" % (model_name, rnd), 'w+') as response_file:
        json.dump(response, response_file)
    return response
# build_response(results, 50000)

In [None]:
# def get_ensemble_results(rnd, all_results, model_keys_to_average=['lstm_seq', 'cnn_hierarch']):
#     subjects = [s for s in all_results[model_keys_to_average[0]][rnd]]
#     results_ensemble = {}
#     for sub in subjects:
#         s = 0
#         for k in model_keys_to_average:
#             s += all_results[k][rnd][sub]
#         results_ensemble[sub] = s/len(model_keys_to_average)
#     return results_ensemble

In [None]:
def get_ensemble_results(results_to_average):
    """Average per-subject scores across several result dicts.

    Parameters:
        results_to_average: non-empty list of dicts mapping subject to score.
            The subjects are taken from the first dict; every other dict must
            contain them too.

    Returns:
        Dict mapping each subject of the first dict to the mean of its scores.
    """
    n_results = len(results_to_average)
    averaged = {}
    for subject in results_to_average[0]:
        total = 0
        for scores in results_to_average:
            total = total + scores[subject]
        averaged[subject] = total / n_results
    return averaged

In [None]:
def get_transfer_results(rnd, all_results, model_to_average='lstm_seq', rounds_back=100):
    """Average one model's scores over a window of rounds ending at ``rnd``.

    Parameters:
        rnd: last round of the window; its results define the subject list.
        all_results: dict mapping model key to {round: {subject: score}}.
        model_to_average: key of the model whose rounds are averaged.
        rounds_back: how many rounds before ``rnd`` the window reaches.

    Returns:
        Dict mapping each subject of round ``rnd`` to the mean of its scores
        over the rounds in ``[rnd - rounds_back, rnd]`` that have results.
    """
    per_round = all_results[model_to_average]
    # Rounds of the window for which results exist, oldest first.
    window = [r for r in range(rnd - rounds_back, rnd + 1) if r in per_round]
    averaged = {}
    for subject in per_round[rnd]:
        total = 0
        for r in window:
            total = total + per_round[r][subject]
        averaged[subject] = total / len(window)
    return averaged

In [None]:
def load_results(model_key, rnd):
    """Load the scores saved for one model and round.

    Parameters:
        model_key: key into ``models_runs``, which supplies the run identifier
            used in the file name.
        rnd: integer round number.

    Returns:
        Dict mapping each entry's 'nick' to its 'score', read from
        ``data_server/response_run<run>_rnd<rnd>.json``.
    """
    response_path = "data_server/response_run%s_rnd%d.json" % (models_runs[model_key], rnd)
    with open(response_path) as f:
        response = json.load(f)
    return {entry['nick']: entry['score'] for entry in response}

In [None]:
# results['lstm_seq'][20] = load_results('lstm_seq', 20)
# results['lstm_seq'][40] = 
results  # display the results store; leftover: results['bert'][40] = load_results('bert', 40)

In [None]:
# Rounds covered by this chunk; every model's results are stored under the
# last round of the chunk (end_rnd).
rnds = range(500,600)
decision_thresh = 0.5
data_chunks = get_data_chunk(rnds)
subjects = [d['nick'] for d in read_json_datapoint("data_server/data0.jl")]
end_rnd = rnds[-1]

# Order matters: 'transfer' averages 'lstm_seq' rounds and 'ensemble' averages
# 'bert', 'cnn_hierarch' and 'lstm_seq', so those have to be filled in first.
# for key in ['transfer', 'ensemble']:
for model_key in [
        'lstm_seq',
        'bert',
        'cnn_hierarch',
        'transfer',
        'ensemble',
]:
    print(model_key)
#     if model_key=='lstm_seq':
#         results[model_key][end_rnd]=load_results('lstm_seq', 20)
    if model_key=='cnn_hierarch':
        # NOTE(review): loads the saved round-40 response rather than end_rnd;
        # confirm this is intended.
        results[model_key][end_rnd]=load_results('cnn_hierarch', 40)
    elif model_key=='bert':
        results[model_key][end_rnd]=load_results('bert', end_rnd)
    elif model_key=='ensemble':
        model_keys_to_average=['bert', 'cnn_hierarch', 'lstm_seq']
        # A model is missing when it has no results for *this* round. Checking
        # only for a non-empty dict would let the lookup below raise KeyError
        # when earlier rounds exist but end_rnd does not.
        missing_models = [m for m in model_keys_to_average if end_rnd not in results[m]]
        if len(missing_models)!=0:
            print("Missing models! cannot compute ensemble results", missing_models)
            continue
        results_to_average = [results[m][end_rnd] for m in model_keys_to_average]
#         results[model_key][end_rnd] = get_ensemble_results(rnd, results, 
#                                                 model_keys_to_average)
        results[model_key][end_rnd] = get_ensemble_results(results_to_average)
    ## For now
    elif model_key=='transfer':
        results[model_key][end_rnd]=get_transfer_results(
            end_rnd, results, model_to_average='lstm_seq', rounds_back=60)
#         results[model_key][end_rnd]=results['lstm_seq'][end_rnd]
    ##
    else:
        # Remaining models (here only 'lstm_seq') are run inside their own
        # session and graph. cache_round=499 is the round just before rnds[0].
        with session_collection[model_key].as_default():
            with session_collection[model_key].graph.as_default():
                results[model_key][end_rnd] = predict_for_round_chunk(models_collection[model_key], 
                                              hyperparams_collection[model_key], hyperparams_features, 
                                              vocabulary_dict, 
                                          data_chunks, subjects=subjects, model_key=model_key, cache_round=499)

    # Count positives with the same threshold that build_response applies.
    print(len(results[model_key][end_rnd].values()), "positive:", 
      len([r for r in results[model_key][end_rnd].values() if r >= decision_thresh]))
    response1 = build_response(results[model_key][end_rnd], rnd=end_rnd, 
                               model_name=models_runs[model_key], decision_thresh=decision_thresh)


In [None]:
# Rounds for which 'lstm_seq' results are currently stored.
results['lstm_seq'].keys()

In [None]:
# Summary statistics of the 'transfer' scores stored for round 180.
# NOTE(review): the visible loop only fills the last round of `rnds`, so round
# 180 must come from an earlier run of that cell; confirm it exists.
pd.Series(list(results['transfer'][180].values())).describe()

In [None]:
from scipy.stats import pearsonr
# Pearson correlation between the 'bert' scores of rounds 220 and 180.
# NOTE(review): the two value lists are paired by position, so this assumes
# both dicts list the same subjects in the same order; verify.
pearsonr(list(results['bert'][220].values()), list(results['bert'][180].values()))

In [None]:
# The 'lstm_seq' scores stored for round 160, as a Series.
pd.Series(list(results['lstm_seq'][160].values()))

### Predict on eRisk data

In [None]:
# results = {}
# labels = {}
# featuresall = {}
# with session_collection['lstm_seq2'].as_default():
#     with session_collection['lstm_seq2'].graph.as_default():
        # for features, subjects, lbls in DataGenerator(user_level_data, subjects_split, 
        #                                          set_type='train', vocabulary=vocabulary_dict,
        #                                        hierarchical=hyperparams1['hierarchical'],
        #                                     seq_len=hyperparams1['maxlen'], batch_size=hyperparams1['batch_size'],
        #                                          max_posts_per_user=None,
        #                                        pad_with_duplication=False,
        #                                         posts_per_group=hyperparams1['posts_per_group'],
        #                                         post_groups_per_user=None, 
        #                                          sample_seqs=False, shuffle=False,
        #                                                return_subjects=True):

        #     predictions = loaded_model.predict_on_batch(features)
        #     print(len(features[0]), len(subjects), len(predictions), len(labels), len(results))
        #     for i,s in enumerate(subjects):
        #         if s not in results:
        #             results[s] = []
        #             featuresall[s] = []
        #         results[s].append(predictions[i].item())
        #         featuresall[s].append([features[j][i] for j in range(len(features))])
        #         labels[s] = lbls[i]


In [None]:
# np.std is used below, but the notebook's top-level numpy import is commented
# out, so import it here to keep the cell runnable on a fresh kernel.
import numpy as np

# For subjects with a falsy label whose scores vary, print the first score and
# the change from first to last, and plot a 5-point rolling mean of the scores.
# NOTE(review): this expects `results` to map subject -> list of scores and a
# `labels` dict, as built by the commented-out cell above; confirm both exist
# before running.
for subject in results:
    if not labels[subject] and np.std(results[subject]) > 0.0:
        print(subject)
        print(results[subject][0], results[subject][-1]-results[subject][0])
        pd.Series(results[subject]).rolling(window=5).mean().plot()

In [None]:
# Sum of the first feature array of every stored entry for subject 4278.
# NOTE(review): featuresall is only built in the commented-out cell above.
[featuresall[4278][i][0].sum() for i in range(len(featuresall[4278]))]

In [None]:
# Length of the 'raw' entry stored for subject4278.
len(user_level_data['subject4278']['raw'])

In [None]:
# Walk the 'test' split and print, per batch, the subjects and the row sums of
# the first feature array.
# NOTE(review): vocabulary_dict and hyperparams1 are not defined in the visible
# part of the notebook.
for x, s, y in DataGenerator(
        user_level_data,
        subjects_split,
        set_type='test',
        vocabulary=vocabulary_dict,
        hierarchical=hyperparams1['hierarchical'],
        seq_len=hyperparams1['maxlen'],
        batch_size=hyperparams1['batch_size'],
        max_posts_per_user=None,
        pad_with_duplication=False,
        posts_per_group=hyperparams1['posts_per_group'],
        post_groups_per_user=None,
        sample_seqs=False,
        shuffle=False,
        return_subjects=True,
):
    print("subject", s, "features", x[0].sum(axis=1))
    

In [None]:
# Evaluate one model of the collection on the 'test' split, inside that model's
# own session and graph.
# NOTE(review): batch_size comes from `hyperparams`, while every other setting
# comes from hyperparams_collection[model_key]; confirm this is intended.
model_key = 'lstm_seq'
with session_collection[model_key].as_default(), session_collection[model_key].graph.as_default():
    res = models_collection[model_key].evaluate_generator(
        DataGenerator(
            user_level_data,
            subjects_split,
            liwc_words_for_categories=liwc_words_for_categories,
            set_type='test',
            vocabulary=vocabulary_dict,
            hierarchical=hyperparams_collection[model_key]['hierarchical'],
            seq_len=hyperparams_collection[model_key]['maxlen'],
            batch_size=hyperparams['batch_size'],
            max_posts_per_user=None,
            pad_with_duplication=False,
            posts_per_group=hyperparams_collection[model_key]['posts_per_group'],
            post_groups_per_user=1,  # None,
            sample_seqs=False,
            shuffle=False,
            compute_liwc=False,
        )
    )
    print(res)