In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix


from matplotlib import pyplot as plt
import json
import re

In [2]:
# Set before the Keras imports that follow in this cell.
# NOTE(review): presumably read by Keras add-on packages to select tf.keras — confirm it is still needed.
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
    Bidirectional, Input, InputLayer, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute, \
    Conv1D, GlobalMaxPooling1D
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from keras.utils import np_utils

from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [3]:
from collections import Counter
import json

In [1]:
# Per-dataset vocabularies keyed by dataset name; every slot starts out as None.
vocabs = dict.fromkeys(
    ('erisk_depression', 'erisk_anorexia', 'erisk_selfharm', 'clpsych', 'symanto')
)

In [20]:
# Fill each slot of `vocabs` with the 20000 highest-count entries of that
# dataset's vocabulary JSON file (a mapping from token to count).
for dataset_name in vocabs:
    vocab_path = f'data/{dataset_name}_vocab.json'
    with open(vocab_path) as vocab_file:
        token_counts = Counter(json.load(vocab_file))
    vocabs[dataset_name] = dict(token_counts.most_common(20000))

In [21]:
# For every ordered pair of distinct datasets (a, b), store the number of shared
# vocabulary entries as a fraction of a's vocabulary size and of b's vocabulary size.
overlaps = {}
for name_a, vocab_a in vocabs.items():
    for name_b, vocab_b in vocabs.items():
        if name_a != name_b:
            shared = sum(1 for word in vocab_a if word in vocab_b)
            overlaps[(name_a, name_b)] = (shared / len(vocab_a), shared / len(vocab_b))

In [29]:
# Mean of the second fraction of each `overlaps` entry, taken over all stored pairs
# (the values are sorted before being summed).
sum(sorted([t[1] for t in overlaps.values()]))/len(overlaps)

0.6778949999999998

In [20]:
# Sub-directories, relative to each split's root folder, that are searched for tweet files.
datadirs_clpsych = dict(train=[''], test=[''])

# Root folder of the CLPsych shared-task data for each split.
datadir_root_clpsych = dict(
    train='../data/clpsych/shared_task_data/final_training_data/',
    test='../data/clpsych/shared_task_data/final_testing_data/',
)

# CSV file(s) holding the per-user information used for labelling.
labels_files_clpsych = ['../data/clpsych/anonymized_user_info_by_chunk.csv']
def read_texts_clpsych(datadir_root_clpsych,
                       datadirs_clpsych,
                       labels_files_T1_2019,
                       label_by=['depression']):
    """Load the CLPsych tweets of the test split and attach per-user labels.

    Args:
        datadir_root_clpsych: dict mapping a subset name ('train'/'test') to its
            root directory.
        datadirs_clpsych: dict mapping a subset name to the list of
            sub-directories (relative to the root) searched for ``*.tweets`` files.
        labels_files_T1_2019: iterable of CSV paths read with the explicit column
            names subject, age, num_tweets, gender, condition, chunk_index.
        label_by: conditions that get label 1; every other condition gets 0.
            The default list is only read, never modified.

    Returns:
        DataFrame with one row per tweet: the tweet's JSON fields, 'subject',
        'subset', the joined label columns, and 'date' (a copy of 'created_at').
    """
    subset_frames = []

    # Only the test split is read; add 'train' to the tuple to load it as well.
    for subset in ('test',):
        subset_rows = []
        for subp in datadirs_clpsych[subset]:
            subdir = os.path.join(datadir_root_clpsych[subset], subp)
            for subject_file in glob.glob(subdir + "/*.tweets"):
                subset_rows.extend(read_subject_data_clpsych(subject_file))
        writings_df_part = pd.DataFrame(subset_rows)
        writings_df_part['subset'] = subset
        subset_frames.append(writings_df_part)
    # One concat with a fresh index keeps row labels unique across subsets.
    writings_df = pd.concat(subset_frames, ignore_index=True)

    # Use the files passed by the caller rather than a module-level global.
    label_frames = []
    for label_file in labels_files_T1_2019:
        labels = pd.read_csv(label_file, names=['subject','age','num_tweets','gender','condition','chunk_index'])
        labels['label'] = labels['condition'].apply(lambda c: 1 if c in label_by else 0)
        label_frames.append(labels)

    if label_frames:
        # De-duplicate and index once, after all files are combined, so that a
        # second label file is concatenated column-to-column with the first.
        labels_df = pd.concat(label_frames).drop_duplicates().set_index('subject')
    else:
        labels_df = pd.DataFrame()

    # TODO: de-duplicating the writings on
    # ['id', 'subject', 'subset', 'created_at', 'text'] raised a
    # "surrogates not allowed" unicode exception, so it is not done here.

    writings_df = writings_df.join(labels_df, on='subject')
    writings_df['date'] = writings_df['created_at']

    return writings_df

def read_subject_data_clpsych(subject_file):
    """Read one user's tweet file, which holds one JSON object per line.

    Args:
        subject_file: path to the user's tweet file (opened as UTF-8 text).

    Returns:
        A list with one dict per non-blank line; each dict is the decoded JSON
        object plus a 'subject' key holding the user id, i.e. the file name up
        to its first dot.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # basename() copes with the platform's path separator, unlike split("/").
    user = os.path.basename(subject_file).split(".")[0]
    writings = []
    with open(subject_file, "rt", encoding="utf-8") as sf:
        for line in sf:
            if not line.strip():
                # A blank line carries no tweet; skip it instead of failing to parse.
                continue
            data = json.loads(line)
            data['subject'] = user
            writings.append(data)
    return writings

In [None]:
def _load_pickled_frame(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    """
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


def _load_json_frame(path):
    """Build a DataFrame from the JSON document stored at ``path``."""
    with open(path) as json_file:
        return pd.DataFrame.from_dict(json.load(json_file))


# Load `writings_df` for the dataset selected by `dataset_type`.
if dataset_type == "combined":
    writings_df_selfharm = _load_pickled_frame('data/writings_df_selfharm_all')
    writings_df_anorexia = _load_pickled_frame('data/writings_df_anorexia_liwc')
    writings_df_depression = _load_pickled_frame('data/writings_df_depression_liwc')
    # Tag every row with the dataset it came from before stacking the frames.
    writings_df_selfharm['source'] = 'selfharm'
    writings_df_anorexia['source'] = 'anorexia'
    writings_df_depression['source'] = 'depression'
    writings_df = pd.concat([writings_df_depression, writings_df_selfharm, writings_df_anorexia])
elif dataset_type == "combined_depr":
    writings_df = _load_json_frame('data/writings_df_depression_all.json')
elif dataset_type == "clpsych":
    # NOTE(review): this branch used `writings_df` without assigning it, so it only
    # worked when the frame was left over from an earlier run. The load below is
    # restored from the commented-out line that used to be here — confirm the path.
    writings_df = _load_json_frame('data/writings_df_%s_liwc_affect.json' % dataset_type)
    writings_df_test = read_texts_clpsych(datadir_root_clpsych, datadirs_clpsych, labels_files_clpsych)
    label_by = ['depression', 'ptsd']
    # Rows with condition 'depression' are removed first, so after relabelling
    # the only condition that can receive label 1 is 'ptsd'.
    writings_df = writings_df.drop(writings_df[writings_df['condition']=='depression'].index)
    writings_df['label'] = writings_df['condition'].apply(lambda c: 1 if c in label_by else 0)
elif dataset_type == "symanto":
    writings_df = _load_pickled_frame('../data/%s/writings_df_%s_liwc' % (dataset_type,dataset_type))
elif dataset_type == 'selfharm':
    writings_df = _load_pickled_frame('data/writings_df_%s_all' % dataset_type)
elif dataset_type in ["depression", "anorexia"]:
    # 'selfharm' is handled by the branch above, so it is not listed here.
    writings_df = _load_pickled_frame('data/writings_df_%s_liwc' % dataset_type)
else:
    logger.error("Unknown dataset %s" % dataset_type)

In [None]:
# Show the column names of the loaded writings frame.
writings_df.columns

In [None]:
# Preview the first rows of the loaded writings frame.
writings_df.head()

In [54]:
%%time
# Accumulate counts over the `tokenized_text` value of every row.
# NOTE(review): assumes each value is an iterable of tokens — confirm against the loader.
v = Counter()
for text in writings_df.tokenized_text.values:
    v.update(text)


CPU times: user 133 ms, sys: 3.35 ms, total: 136 ms
Wall time: 135 ms


In [55]:
# The 20 entries of `v` with the highest counts, as (entry, count) pairs.
v.most_common(20)

[('i', 15551),
 ('rt', 13897),
 ('the', 12888),
 ('to', 12363),
 ('a', 10429),
 ('you', 8716),
 ('and', 7642),
 ('my', 6223),
 ('is', 6022),
 ('of', 5945),
 ('in', 5461),
 ('for', 5089),
 ('me', 4683),
 ('it', 4583),
 ('this', 4064),
 ('that', 3688),
 ('on', 3671),
 ('so', 3453),
 ('be', 3157),
 ('with', 2903)]

In [46]:
# vocab_erisk_selfharm.most_common(20)

In [43]:
# vocab_erisk_anorexia.most_common(20)

In [38]:
# vocab_erisk_depr.most_common(20)

In [58]:
# vocab_symanto.most_common(20)