In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
from scipy.stats import pearsonr, ttest_ind

import json
import re

In [None]:
dataset_type = "anorexia"

In [None]:
# Load the per-post writings DataFrame for the configured `dataset_type`.
# SECURITY: pickle.load can execute arbitrary code stored in the file -- only
# load pickles that come from a trusted source.
logger = logging.getLogger(__name__)


def _read_pickle(path):
    """Unpickle the object stored at `path`, closing the file afterwards."""
    with open(path, 'rb') as in_f:
        return pickle.load(in_f)


def _read_json_frame(path):
    """Build a DataFrame from the JSON document stored at `path`."""
    with open(path) as in_f:
        return pd.DataFrame.from_dict(json.load(in_f))


if dataset_type == "combined":
    writings_df_selfharm = _read_pickle('data/writings_df_selfharm_all')
    writings_df_anorexia = _read_pickle('data/writings_df_anorexia_liwc')
    writings_df_depression = _read_pickle('data/writings_df_depression_liwc')
    # One concat, same row order as before: depression, self-harm, anorexia.
    writings_df = pd.concat([writings_df_depression, writings_df_selfharm, writings_df_anorexia])
elif dataset_type == "combined_depr":
    writings_df = _read_json_frame('data/writings_df_depression_all.json')
elif dataset_type == "clpsych":
    writings_df = _read_json_frame('data/writings_df_%s_liwc_affect.json' % dataset_type)
    label_by = ['depression', 'ptsd']
    # Remove every post whose condition is 'depression'.
    writings_df = writings_df.drop(writings_df[writings_df['condition']=='depression'].index)
    # NOTE(review): a stray `writings_df = read_texts_symanto()` followed here.
    # It replaced the frame loaded above and called a function that is not
    # defined in this notebook, so it has been removed.
elif dataset_type == 'selfharm':
    writings_df = _read_pickle('data/writings_df_%s_all' % dataset_type)
elif dataset_type in ["depression", "anorexia", "symanto"]:
    writings_df = _read_pickle('data/writings_df_%s_liwc' % dataset_type)
else:
    # Fail here instead of letting later cells crash on a missing writings_df.
    logger.error("Unknown dataset %s" % dataset_type)
    raise ValueError("Unknown dataset %s" % dataset_type)

In [None]:
def load_NRC(nrc_path):
    """Read an NRC word-level emotion lexicon and index its words by emotion.

    Every non-blank line must hold three whitespace-separated fields,
    ``word emotion label``, where ``label`` is an integer flag.

    Args:
        nrc_path: path of the lexicon text file.

    Returns:
        dict mapping each emotion seen in the file to the set of words whose
        flag for that emotion is non-zero. An emotion that only ever appears
        with a zero flag maps to an empty set.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or
            its label is not an integer.
    """
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            # Register the emotion even when the flag is 0, so every emotion
            # named in the file ends up as a key of the result.
            words = emotion_words.setdefault(emotion, set())
            if int(label):
                words.add(word)
    return emotion_words

# Build the emotion -> words index once and keep the emotion names around.
# NOTE(review): absolute, user-specific path -- point it at the local lexicon copy.
nrc_lexicon_path = '/home/anasab/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
# Emotion names, in the order load_NRC first met them in the file.
emotions = [emotion_name for emotion_name in nrc_lexicon]


In [None]:
from liwc_readDict import readDict

# Turn the (word, category) pairs yielded by readDict into category -> [words].
# NOTE(review): absolute, user-specific path to the LIWC dictionary.
liwc_dict = {}
for liwc_word, liwc_category in readDict('/home/anasab/resources/liwc.dic'):
    liwc_dict.setdefault(liwc_category, []).append(liwc_word)

categories = set(liwc_dict)
len(categories)

In [None]:
def _token_share(tokens, target):
    """Fraction of `tokens` equal to `target`; 0 when `tokens` is empty or missing."""
    if not tokens:
        return 0
    return sum(1 for token in tokens if token == target) / len(tokens)


positive_df = writings_df[writings_df['label']==1]
negative_df = writings_df[writings_df['label']==0]

# For every LIWC auxiliary verb, store its per-post relative frequency in each
# class under the keys "<word>_pos" and "<word>_neg".
auxwords = {}
for word in liwc_dict['auxverb']:
    for suffix, class_df in (("_pos", positive_df), ("_neg", negative_df)):
        auxwords[word + suffix] = class_df.tokenized_text.apply(_token_share, args=(word,))

In [None]:
# Print, per auxiliary verb: mean frequency in the positive class, mean
# frequency in the negative class, and the positive/negative ratio.
for word in liwc_dict['auxverb']:
    mean_pos = np.mean(auxwords[word + "_pos"])
    mean_neg = np.mean(auxwords[word + "_neg"])
    print(word, mean_pos, mean_neg, mean_pos / mean_neg)

In [None]:
# `auxwords` is a plain dict of Series, so it has no .mean(); show the mean of
# every stored frequency series instead.
pd.Series({name: frequencies.mean() for name, frequencies in auxwords.items()})

## Analysis

In [None]:
feature = 'negemo'
# Per-subject mean of the feature in each class. The column is selected before
# aggregating so the text columns are never averaged (which raises on recent
# pandas versions) and only the needed column is computed.
positive_means = writings_df[writings_df['label']==1].groupby('subject')[feature].mean()
negative_means = writings_df[writings_df['label']==0].groupby('subject')[feature].mean()

positive_means.hist(alpha=0.5, label='positive', bins=40)
negative_means.hist(alpha=0.5, label='negative', bins=20)
# Two-sample t-test between the classes; NaN subject means are left out.
ttest = ttest_ind(negative_means.values, positive_means.values, nan_policy='omit')
print('\tttest', ttest)
plt.legend()

In [None]:
writings_df.columns

In [None]:
def merge_tokens(row, has_title=True):
    """Return a new list with the row's text tokens followed by its title tokens.

    Args:
        row: object exposing `tokenized_text` and, when `has_title` is true,
            `tokenized_title`.
        has_title: when False the title tokens are never looked at.

    Returns:
        list of tokens; empty or missing token lists contribute nothing.
    """
    merged = []
    text_tokens = row.tokenized_text
    if text_tokens:
        merged.extend(text_tokens)
    if has_title and row.tokenized_title:
        merged.extend(row.tokenized_title)
    return merged
# merge_tokens reads `row.tokenized_title`, so the flag has to test for that
# column (the previous check looked for a 'title' column instead).
has_title_tokens = 'tokenized_title' in writings_df.columns
writings_df['all_tokens'] = writings_df.apply(
    lambda row: merge_tokens(row, has_title=has_title_tokens), axis=1)

def extract_emotions(tokens, emotion, relative=True, lexicon=None):
    """Count how many tokens belong to one emotion's word set.

    Args:
        tokens: sequence of tokens; may be empty or None.
        emotion: key into the lexicon.
        relative: when True return the share of matching tokens, otherwise
            the raw count.
        lexicon: mapping emotion -> collection of words. Defaults to the
            notebook-level `nrc_lexicon`.

    Returns:
        None when `tokens` is empty or None; otherwise a float share
        (`relative=True`) or an int count.
    """
    if not tokens:
        return None
    if lexicon is None:
        lexicon = nrc_lexicon
    emotion_vocabulary = lexicon[emotion]
    hits = sum(1 for token in tokens if token in emotion_vocabulary)
    # `tokens` is known to be non-empty here, so the division is safe.
    return hits / len(tokens) if relative else hits

from functools import partial

# Add one column per NRC emotion holding the relative frequency of that
# emotion's words among the post's merged tokens.
for emotion in emotions:
    score_tokens = partial(extract_emotions, emotion=emotion, relative=True)
    writings_df[emotion] = writings_df['all_tokens'].apply(score_tokens)

In [None]:
def mentions(tokenized_text, terms=['diagnosis', 'diagnosed', 'diagnose']):
    """Tell whether any of `terms` occurs in `tokenized_text`.

    Args:
        tokenized_text: collection of tokens; empty or None yields False.
        terms: terms to look for (tested with the `in` operator).

    Returns:
        True if at least one term is contained in `tokenized_text`.
    """
    if not tokenized_text:
        return False
    return any(term in tokenized_text for term in terms)

# Flag posts that mention a diagnosis term, and posts that mention depression.
post_tokens = writings_df['tokenized_text']
writings_df['diagnosis'] = post_tokens.apply(mentions)
writings_df['depression_mention'] = post_tokens.apply(mentions, terms=['depressed', 'depression'])

In [None]:
# Per-subject totals for the positive class. Only the three displayed columns
# are summed, so the text columns are never aggregated.
writings_df[writings_df['label']==1].groupby('subject')[
    ['label', 'diagnosis', 'depression_mention']].sum().head()

## Statistical tests between classes

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
# Correlation of each LIWC category with the label, on per-subject means.
# Columns are selected before averaging so non-numeric columns are never
# passed to mean().
label_and_categories = ['label'] + list(categories)
writings_df.groupby('subject')[label_and_categories].mean().corr()['label'].sort_values().head(100)

In [None]:
# Correlation of each NRC emotion with the label, on per-subject means.
# Columns are selected before averaging so non-numeric columns are never
# passed to mean().
label_and_emotions = ['label'] + emotions
writings_df.groupby('subject')[label_and_emotions].mean().corr()['label'].sort_values().head(100)

## Evolution

In [None]:
import datetime
def normalize_date(date):
    """Trim `date` and collapse every run of whitespace into a single space."""
    parts = date.split()
    return " ".join(parts)

writings_df['date'] = writings_df['date'].apply(normalize_date)

In [None]:
def _parse_post_datetime(raw_date):
    """Parse a '%Y-%m-%d %H:%M:%S' date string after whitespace normalisation."""
    return datetime.datetime.strptime(normalize_date(raw_date), '%Y-%m-%d %H:%M:%S')


writings_df['datetime'] = writings_df.date.apply(_parse_post_datetime)
# Alternative format kept from the original cell: '%a %b %d %H:%M:%S +%f %Y'

# Calendar day of each post.
writings_df['date_day'] = writings_df.datetime.apply(lambda stamp: stamp.date())


In [None]:
# Earliest post timestamp per subject, from a single grouped pass over the
# 'datetime' column (previously the whole frame was filtered once per subject
# and the minimum taken over every column).
earliest_per_subject = writings_df.groupby('subject')['datetime'].min()
first_days = dict(zip(earliest_per_subject.index, earliest_per_subject.values))

print(first_days)

In [None]:
# Latest post timestamp per subject, from a single grouped pass over the
# 'datetime' column (previously the whole frame was filtered once per subject
# and the maximum taken over every column).
latest_per_subject = writings_df.groupby('subject')['datetime'].max()
last_days = dict(zip(latest_per_subject.index, latest_per_subject.values))

print(last_days)

In [None]:
import datetime
def days_difference(date1, subject, reversed=False, unit='days', date_format='%Y-%m-%d %H:%M:%S'):
    """Time elapsed between `date1` and a subject's reference post.

    Args:
        date1: timestamp of the post.
        subject: key into `first_days` / `last_days`.
        reversed: measure against `last_days[subject]` when True, otherwise
            against `first_days[subject]`.
        unit: 'days', 'weeks' (days // 7) or 'months' (days // 30).
        date_format: accepted but not used by this function.

    Returns:
        The integer difference in the requested unit. None if the lookup or
        the subtraction fails (the error is printed) or `unit` is not one of
        the three supported values.
    """
    try:
        reference = last_days[subject] if reversed else first_days[subject]
        delta = date1 - reference
    except Exception as e:
        print(e)
        return None
    days_per_unit = {'days': 1, 'weeks': 7, 'months': 30}.get(unit)
    if days_per_unit is None:
        return None
    return delta.days // days_per_unit

In [None]:
def _days_since_first_post(post):
    """Whole days between this post and its subject's first post."""
    return days_difference(post['datetime'], post['subject'], reversed=False, unit='days')


writings_df['writing_days'] = writings_df.apply(_days_since_first_post, axis=1)
#                                                                           date_format='%a %b %d %H:%M:%S +%f %Y'), 


In [None]:
def _days_relative_to_last_post(post):
    """Whole days between this post and its subject's last post."""
    return days_difference(post['datetime'], post['subject'], reversed=True, unit='days')


writings_df['writing_days_reverse'] = writings_df.apply(_days_relative_to_last_post, axis=1)

In [None]:
# Attach each subject's first and last post timestamps to every one of its posts.
for column_name, day_lookup in (('first_day', first_days), ('last_day', last_days)):
    writings_df[column_name] = writings_df['subject'].apply(day_lookup.__getitem__)

In [None]:
writings_df = writings_df.loc[:,~writings_df.columns.duplicated()]


In [None]:
# Split the posts by class label; the negative class is not down-sampled.
post_labels = writings_df['label']
positive_df = writings_df[post_labels == 1]
negative_df = writings_df[post_labels == 0]

def plot_evolution(df, emotion, writing_day_cutoff, rolling_window, label='', date_field='writing_days'):
    """Plot the rolling mean of one feature's per-period average.

    Only the plotted column is aggregated. Averaging a selection of every
    feature column also averaged text columns (which raises on recent pandas
    versions) and listed a label twice when it was both an NRC emotion and a
    LIWC category.

    Args:
        df: posts with a 'writing_days' column, `date_field` and `emotion`.
        emotion: name of the feature column to plot.
        writing_day_cutoff: keep posts with 'writing_days' <= this value.
        rolling_window: window size of the rolling mean over the periods.
        label: legend label of the plotted line.
        date_field: column whose values define the periods (x axis).
    """
    within_cutoff = df[df['writing_days'] <= writing_day_cutoff]
    per_period_mean = within_cutoff.groupby(date_field)[emotion].mean()
    per_period_mean.rolling(rolling_window).mean().plot(label=label)

def plot_subjects(df, emotion, writing_day_cutoff, rolling_window, label=''):
    """Plot, per calendar day, the number of posts with a non-null subject.

    The count is taken on the 'subject' column directly, so the function no
    longer requires the unrelated text and feature columns to exist.

    Args:
        df: posts with 'writing_days', 'date_day' and 'subject' columns.
        emotion: accepted for signature parity with the other plot helpers; unused.
        writing_day_cutoff: keep posts with |writing_days| <= this value.
        rolling_window: accepted for signature parity; unused.
        label: legend label of the plotted line.
    """
    within_cutoff = df[abs(df['writing_days']) <= writing_day_cutoff]
    within_cutoff.groupby('date_day')['subject'].count().plot(label=label)

def plot_stationary(df, emotion, writing_day_cutoff, label='', date_field='date_day'):
    """Plot the first difference of one feature's per-period average.

    Only the plotted column is aggregated, so text columns are never averaged
    and a label present in both lexicons is not selected twice.

    Args:
        df: posts containing `date_field` and `emotion`.
        emotion: name of the feature column to plot.
        writing_day_cutoff: keep posts whose `date_field` value is >= this.
        label: legend label of the plotted line.
        date_field: column used both for filtering and for grouping.
    """
    from_cutoff = df[df[date_field] >= writing_day_cutoff]
    from_cutoff.groupby(date_field)[emotion].mean().diff().plot(label=label)
    

emo = 'posemo'
days = 365  # passed to plot_evolution as writing_day_cutoff

disorder = "positive"
# One line per class, smoothed with a 7-period rolling mean and grouped by
# calendar day ('date_day').
class_frames = (
    (positive_df, disorder + ' diagnosis'),
    (negative_df, 'no ' + disorder + ' diagnosis'),
)
for class_df, class_label in class_frames:
    plot_evolution(class_df, emo, days, 7, class_label, 'date_day')

# plot_stationary / plot_subjects can be called on the same frames (and on
# writings_df for all posts) to inspect differenced scores or post counts.

# NOTE(review): the x axis holds 'date_day' values while the label below says
# "Days from first post" -- confirm which of the two is intended.
plt.xlabel("Days from first post")
plt.ylabel(emo + " scores")
plt.legend()
plt.show()

### Evolution correlations

In [None]:
len(set(writings_df[writings_df['label']==1].subject))
# len(set(writings_df[writings_df['subset']=='test'].subject))
# writings_df = writings_df[writings_df['subset']=='train']

In [None]:
def get_evolution_series(df, emotion, writing_day_cutoff, rolling_window, date_field='writing_days'):
    """Rolling mean of one feature's per-period average.

    Only the requested column is aggregated. Averaging a selection of every
    feature column also averaged text columns (which raises on recent pandas
    versions) and returned a two-column DataFrame instead of a Series when
    `emotion` was both an NRC emotion and a LIWC category.

    Args:
        df: posts with a 'writing_days' column, `date_field` and `emotion`.
        emotion: name of the feature column.
        writing_day_cutoff: keep posts with 'writing_days' <= this value.
        rolling_window: window size of the rolling mean over the periods.
        date_field: column whose values define the periods.

    Returns:
        Series indexed by `date_field`; the first `rolling_window - 1`
        entries are NaN because the window is not yet full.
    """
    within_cutoff = df[df['writing_days'] <= writing_day_cutoff]
    per_period_mean = within_cutoff.groupby(date_field)[emotion].mean()
    return per_period_mean.rolling(rolling_window).mean()


# emo = 'positive'
# days = datetime.datetime.strptime('2000','%Y').date()
days = 5000
evolution_rolling_window = 100


def _build_evolution_df(class_df, features, writing_day_cutoff, rolling_window):
    """Collect one rolling-mean evolution series per feature into a DataFrame.

    A feature whose column is missing is reported and skipped. When
    get_evolution_series hands back a DataFrame (a feature label listed twice
    in its column selection), the first of the duplicated columns is used.
    """
    series_by_feature = {}
    for feature in features:
        try:
            evolution = get_evolution_series(class_df, feature, writing_day_cutoff,
                                             rolling_window, 'writing_days')
        except KeyError as e:
            print(feature, e)
            continue
        if isinstance(evolution, pd.DataFrame):
            evolution = evolution.iloc[:, 0]
        series_by_feature[feature] = evolution
    # Building the frame in one go avoids inserting columns one at a time.
    return pd.DataFrame(series_by_feature)


# 'positive' and 'negative' first, then every NRC emotion (including the first
# one, which used to be an all-NaN placeholder), then the LIWC categories;
# names present in both lexicons are kept once.
evolution_features = list(dict.fromkeys(['positive', 'negative'] + list(emotions) + list(categories)))

evolution_df_pos = _build_evolution_df(writings_df[writings_df['label']==1],
                                       evolution_features, days, evolution_rolling_window)
evolution_df_neg = _build_evolution_df(writings_df[writings_df['label']==0],
                                       evolution_features, days, evolution_rolling_window)


In [None]:
# Overlay three evolution series of the positive class, smoothed once more
# with a 5-period rolling mean.
for feature in ('negative', 'cause', 'positive'):
    evolution_df_pos[feature].rolling(5).mean().plot(label=feature)
plt.legend()

In [None]:
from scipy.stats import pearsonr, spearmanr

def calculate_pvalues(df, method='pearson'):
    """P-values of the pairwise correlations between the numeric columns of `df`.

    Rows containing any NaN are dropped first, then only numeric columns are
    kept. Each p-value is rounded to 4 decimals.

    Args:
        df: DataFrame of observations (rows) by variables (columns).
        method: 'pearson' or 'spearman'.

    Returns:
        Square float DataFrame indexed and labelled by the numeric columns.

    Raises:
        ValueError: if `method` is neither 'pearson' nor 'spearman'
            (previously this silently produced an all-NaN table).
    """
    correlation_tests = {'pearson': pearsonr, 'spearman': spearmanr}
    if method not in correlation_tests:
        raise ValueError("Unknown correlation method %s" % method)
    correlation_test = correlation_tests[method]

    df = df.dropna()._get_numeric_data()
    pvalues = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    for r in df.columns:
        for c in df.columns:
            # .loc writes into the frame itself; the chained form
            # pvalues[r][c] = ... may write to a temporary copy.
            pvalues.loc[c, r] = round(correlation_test(df[r], df[c])[1], 4)

    return pvalues

In [None]:
evolution_df_pos.corr('spearman')[['positive', 'negative'] + [e for e in emotions if e not in ['positive', 'negative', 'anger']]
                       ]

In [None]:
# Pearson correlations between the evolution series, restricted to the emotion
# columns ('positive' and 'negative' first) and suffixed per class.
# NOTE(review): the suffixes say "ptsd" whatever `dataset_type` is.
ordered_emotions = ['positive', 'negative'] + [e for e in emotions if e not in ['positive', 'negative']]
evolution_pos_correlations = evolution_df_pos.corr('pearson')[ordered_emotions].rename(
    columns=lambda c: c + "_ptsd")
evolution_neg_correlations = evolution_df_neg.corr('pearson')[ordered_emotions].rename(
    columns=lambda c: c + "_notptsd")

In [None]:
# P-values matching the correlation tables: same emotion columns, class suffix
# plus "_pval".
ordered_emotions = ['positive', 'negative'] + [e for e in emotions if e not in ['positive', 'negative']]
evolution_pos_pvalues = calculate_pvalues(evolution_df_pos, 'pearson')[ordered_emotions].rename(
    columns=lambda c: c + "_ptsd_pval")
evolution_neg_pvalues = calculate_pvalues(evolution_df_neg, 'pearson')[ordered_emotions].rename(
    columns=lambda c: c + "_notptsd_pval")

In [None]:
evolution_df = pd.concat([evolution_pos_correlations, evolution_neg_correlations], axis=1)

In [None]:
evolution_df_pvals = pd.concat([evolution_pos_pvalues, evolution_neg_pvalues], axis=1)

In [None]:
# evolution_df['diff'] = evolution_df.apply(lambda x: abs(x['negative_depr'] - x['negative_notdepr']), axis=1)
# evolution_df.sort_values('diff', ascending=False)
evolution_df = pd.concat([evolution_df, evolution_df_pvals], axis=1)

In [None]:
# Remove the rows labelled with an emotion name from both tables.
emotion_rows = list(emotions)
evolution_df = evolution_df.drop(index=emotion_rows)
evolution_df_pvals = evolution_df_pvals.drop(index=emotion_rows)
# evolution_df.drop(columns=['diff'], inplace=True)

In [None]:
# evolution_df=evolution_df.combine(evolution_df_pvals, func=lambda x, y: x.astype(str)+","+y.astype(str))
emotion_columns = ['positive', 'negative'] + [e for e in emotions if e not in ['positive', 'negative']]
# Column order: every (correlation, p-value) pair of the first class, then
# every pair of the second class.
ptsd_columns = [e + suffix for e in emotion_columns for suffix in ("_ptsd", "_ptsd_pval")]
notptsd_columns = [e + suffix for e in emotion_columns for suffix in ("_notptsd", "_notptsd_pval")]
evolution_df = evolution_df[ptsd_columns + notptsd_columns]
evolution_df

In [None]:
evolution_df.columns

In [None]:
# Let pandas open, write and close the file itself. This avoids building the
# whole CSV as one string and the newline translation of a text-mode handle.
evolution_df.to_csv("evolution_correlations_selfharm_vsnot_erisk_rolling100_pearson_withpvalues.csv")

### Compare correlations

In [None]:
from CorrelationStats.corrstats import independent_corr

In [None]:
# Sample sizes handed to the correlation-difference test: row counts of the
# two evolution tables.
# NOTE(review): rows that are NaN because of the rolling window are counted
# too -- confirm this is the intended n.
pos_sample_size, neg_sample_size = (len(frame.index) for frame in (evolution_df_pos, evolution_df_neg))
print(pos_sample_size, neg_sample_size)

In [None]:
# Test, per emotion and per row, whether the two classes' correlations differ.
# The correlation columns carry the "_ptsd" / "_notptsd" suffixes given to
# them when the tables were built; the previous "_selfharm" / "_notselfharm"
# names do not exist in evolution_df.
for col in emotions:
    pos_column = '%s_ptsd' % col
    neg_column = '%s_notptsd' % col
    # Run each comparison once and reuse it for both output columns.
    comparisons = [
        independent_corr(r_pos, r_neg, pos_sample_size, neg_sample_size)
        for r_pos, r_neg in zip(evolution_df[pos_column], evolution_df[neg_column])
    ]
    evolution_df['%s_diff_zval' % col] = [round(result[0], 4) for result in comparisons]
    evolution_df['%s_diff_pval' % col] = [round(result[1], 4) for result in comparisons]

In [None]:
# Column names follow the "_ptsd" / "_notptsd" suffixes used when the
# correlation tables were built.
evolution_df[['negative_ptsd', 'negative_notptsd', 'negative_diff_zval', 'negative_diff_pval']]

In [None]:
# Sort columns: (correlation, p-value) pairs of the first class, then for the
# second class correlation, p-value, difference z-value and difference
# p-value. Names use the "_ptsd" / "_notptsd" suffixes that exist in
# evolution_df (the "_selfharm" names raised KeyError).
emotion_columns = ['positive', 'negative'] + [e for e in emotions if e not in ['positive', 'negative']]
ptsd_columns = [e + suffix for e in emotion_columns for suffix in ("_ptsd", "_ptsd_pval")]
notptsd_columns = [e + suffix for e in emotion_columns
                   for suffix in ("_notptsd", "_notptsd_pval", "_diff_zval", "_diff_pval")]
evolution_df = evolution_df[ptsd_columns + notptsd_columns]
evolution_df.columns

In [None]:
# Let pandas open, write and close the file itself. This avoids building the
# whole CSV as one string and the newline translation of a text-mode handle.
evolution_df.to_csv("evolution_correlations_selfharm_vsnot_erisk_rolling100_pearson_with_diffsignificance.csv")

In [None]:
# For every emotion, list the row labels whose correlation difference is /
# is not significant at the threshold below. Rows with a NaN p-value end up
# in neither list.
significance_threshold = 0.005
topics_emotions_significance = {}
for emotion in emotions:
    diff_pvalues = evolution_df[emotion + "_diff_pval"]
    significant_rows = evolution_df.index[(diff_pvalues < significance_threshold).values]
    other_rows = evolution_df.index[(diff_pvalues >= significance_threshold).values]
    topics_emotions_significance[emotion] = {
        # str() keeps the join working when a row label is not a string.
        'significant': ", ".join(map(str, significant_rows)),
        'not significant': ", ".join(map(str, other_rows)),
    }

pd.DataFrame.from_dict(topics_emotions_significance).to_csv(
    "selfharm_notselfharm_significant_differences.csv")


## Co-occurrence / prevalence analysis

In [None]:
# Assumption: the probabilities that two categories occur in a text are independent,
# even though some categories share words.

In [None]:
categories1 = ['cause', 'certain']
categories2 = ['negative', 'positive']

In [None]:
# categories1 = categories
# categories2 = ['negative', 'positive']

### At text level

In [None]:
writings_df.columns

In [None]:
def compute_pmis(df, categories1, categories2):
    """PMI-style co-occurrence score for every (cat1, cat2) pair of columns.

    The score of a pair is ``log(mean(cat1 * cat2) / (mean(cat1) * mean(cat2)))``
    over the rows of `df`. Products are computed column-wise and `df` is left
    untouched (no "<cat1>_<cat2>" helper columns are added to it any more).

    Args:
        df: DataFrame holding one numeric column per category.
        categories1: iterable of column names (first element of each pair).
        categories2: iterable of column names (second element of each pair).

    Returns:
        dict mapping (cat1, cat2) to the score. NaN or infinite values occur
        when a mean is zero or missing.
    """
    categories1 = list(categories1)
    categories2 = list(categories2)
    # Each marginal mean is needed for many pairs; compute it once.
    column_means = {cat: df[cat].mean() for cat in dict.fromkeys(categories1 + categories2)}

    pmis = {}
    for cat1 in categories1:
        for cat2 in categories2:
            joint_mean = (df[cat1] * df[cat2]).mean()
            pmis[(cat1, cat2)] = np.log(joint_mean / (column_means[cat1] * column_means[cat2]))

    return pmis

In [None]:
writings_df_training = writings_df[writings_df['subset']=='train']

In [None]:
writings_df_training

In [None]:
%%time
pmis_positive = compute_pmis(writings_df_training[writings_df_training['label']==1], categories1=categories, categories2=emotions)
pmis_negative = compute_pmis(writings_df_training[writings_df_training['label']==0], categories1=categories, categories2=emotions)

In [None]:
# with open("pmis_depressed_training.pkl", "wb+") as f:
#     pickle.dump(pmis_positive, f)

In [None]:
[i for i in pmis_positive.items() if i[1] > 0]

In [None]:
pmis_negative

In [None]:
# with open("pmis_notdepressed_training.pkl", "wb+") as f:
#     pickle.dump(pmis_negative, f)

In [None]:
[i for i in pmis_negative.items() if i[1] > 1]

In [None]:
# Text-level PMIs for every subject that has positively labelled posts.
# Posts are split per subject in one grouped pass instead of filtering the
# whole frame once per subject. The corpus-level `pmis_positive` result is no
# longer overwritten by the loop.
positive_subjects = set(writings_df[writings_df['label']==1].subject)
pmis_per_subject_positive = {
    subject: compute_pmis(subject_df, categories, emotions)
    for subject, subject_df in writings_df.groupby('subject')
    if subject in positive_subjects
}

In [None]:
pd.Series([t[('certain', 'negative')] 
           for (s, t) in pmis_per_subject_positive.items()
           if t[('certain', 'negative')]>0 
          ]).describe()

In [None]:
# with open("pmis_depressed_persubject.part.pkl", "wb+") as f:
#     pickle.dump(pmis_per_subject_positive, f)

In [None]:
# Text-level PMIs for every subject that has negatively labelled posts.
# Posts are split per subject in one grouped pass instead of filtering the
# whole frame once per subject. The corpus-level `pmis_negative` result is no
# longer overwritten by the loop.
negative_subjects = set(writings_df[writings_df['label']==0].subject)
pmis_per_subject_negative = {
    subject: compute_pmis(subject_df, categories, emotions)
    for subject, subject_df in writings_df.groupby('subject')
    if subject in negative_subjects
}

In [None]:
# with open("pmis_notdepressed_persubject.pkl", "wb+") as f:
#     pickle.dump(pmis_per_subject_negative, f)

### At day level (same day same user)

In [None]:
# Mean 'negative' / 'positive' score per (writing day, subject). The two
# columns are selected before averaging so text columns are never aggregated.
writings_df.groupby(['writing_days', 'subject'])[['negative', 'positive']].mean()

In [None]:
# Day-level PMIs for the positive class: scores are first averaged per
# (calendar day, subject). numeric_only=True keeps the text columns out of
# the mean, which raises on recent pandas versions otherwise.
positive_daily_means = writings_df_training[writings_df_training['label']==1].groupby(
    ['date_day', 'subject']).mean(numeric_only=True)
pmis_positive_days = compute_pmis(positive_daily_means, categories, emotions)

In [None]:
# Day-level PMIs for the negative class: scores are first averaged per
# (calendar day, subject). numeric_only=True keeps the text columns out of
# the mean, which raises on recent pandas versions otherwise.
negative_daily_means = writings_df_training[writings_df_training['label']==0].groupby(
    ['date_day', 'subject']).mean(numeric_only=True)
pmis_negative_days = compute_pmis(negative_daily_means, categories, emotions)

In [None]:
# Display the day-level PMIs of the positive class (the stray "[" made this
# cell a syntax error).
pmis_positive_days

In [None]:
pmis_negative_days

In [None]:
# with open("pmis_depressed_perday_training.pkl", "wb+") as f:
#     pickle.dump(pmis_positive_days, f)

In [None]:
# with open("pmis_notdepressed_perday_training.pkl", "wb+") as f:
#     pickle.dump(pmis_negative_days, f)

In [None]:
# Day-level PMIs for every subject that has positively labelled posts. This
# replaces the text-level `pmis_per_subject_positive`.
# Posts are split per subject in one grouped pass; numeric_only=True keeps
# text columns out of the daily mean. The class-wide `pmis_positive_days`
# result is no longer overwritten by the loop.
positive_subjects = set(writings_df[writings_df['label']==1].subject)
pmis_per_subject_positive = {
    subject: compute_pmis(
        subject_df.groupby(['date_day', 'subject']).mean(numeric_only=True),
        categories, emotions)
    for subject, subject_df in writings_df.groupby('subject')
    if subject in positive_subjects
}

In [None]:
# with open("pmis_depressed_persubject_perday.pkl", "wb+") as f:
#     pickle.dump(pmis_per_subject_positive, f)

In [None]:
# Day-level PMIs for every subject that has negatively labelled posts. This
# replaces the text-level `pmis_per_subject_negative`.
# Posts are split per subject in one grouped pass; numeric_only=True keeps
# text columns out of the daily mean. The class-wide `pmis_negative_days`
# result is no longer overwritten by the loop.
negative_subjects = set(writings_df[writings_df['label']==0].subject)
pmis_per_subject_negative = {
    subject: compute_pmis(
        subject_df.groupby(['date_day', 'subject']).mean(numeric_only=True),
        categories, emotions)
    for subject, subject_df in writings_df.groupby('subject')
    if subject in negative_subjects
}

In [None]:
# with open("pmis_notdepressed_persubject_perday.pkl", "wb+") as f:
#     pickle.dump(pmis_per_subject_negative, f)

In [None]:
# Summarise, over the negative-class subjects, the PMI of 'cause' and of
# 'certain' with the 'negative' emotion. Only PMIs > 0 count; a subject
# without one contributes 0 to the per-subject lists.
positive_pmis_cause = 0
positive_pmis_certain = 0
avg_pmi_cause = 0
avg_pmi_certain = 0
all_pmis_negative_cause = []
all_pmis_negative_certain = []
for subject, pmis in pmis_per_subject_negative.items():
    print(subject, [p for p in pmis.items() if p[1]>0 and p[0][1]=='negative'
                    and p[0][0] in ['cause', 'certain']])
    # Each pair occurs at most once in the dict, so a direct lookup replaces
    # the repeated scans and the bare except used to detect "no positive PMI".
    cause_pmi = pmis.get(('cause', 'negative'))
    if cause_pmi is not None and cause_pmi > 0:
        positive_pmis_cause += 1
        avg_pmi_cause += cause_pmi
        all_pmis_negative_cause.append(cause_pmi)
    else:
        all_pmis_negative_cause.append(0)

    certain_pmi = pmis.get(('certain', 'negative'))
    if certain_pmi is not None and certain_pmi > 0:
        positive_pmis_certain += 1
        avg_pmi_certain += certain_pmi
        all_pmis_negative_certain.append(certain_pmi)
    else:
        all_pmis_negative_certain.append(0)

# Share of subjects with a positive PMI, then the summed positive PMIs divided
# by the number of subjects.
subject_count = len(pmis_per_subject_negative)
print(positive_pmis_cause/subject_count)
print(positive_pmis_certain/subject_count)
print(avg_pmi_cause/subject_count)
print(avg_pmi_certain/subject_count)

In [None]:
# Summarise, over the positive-class subjects, the PMI of 'cause' and of
# 'certain' with the 'negative' emotion. Only PMIs > 0 count; a subject
# without one contributes 0 to the per-subject lists.
positive_pmis_cause = 0
positive_pmis_certain = 0
all_pmis_positive_cause = []
all_pmis_positive_certain = []
avg_pmi_cause = 0
avg_pmi_certain = 0
for subject, pmis in pmis_per_subject_positive.items():
    print(subject, [p for p in pmis.items() if p[1]>0 and p[0][1]=='negative'
                    and p[0][0] in ['cause', 'certain']])
    # Each pair occurs at most once in the dict, so a direct lookup replaces
    # the repeated scans and the bare except used to detect "no positive PMI".
    cause_pmi = pmis.get(('cause', 'negative'))
    if cause_pmi is not None and cause_pmi > 0:
        positive_pmis_cause += 1
        avg_pmi_cause += cause_pmi
        all_pmis_positive_cause.append(cause_pmi)
    else:
        all_pmis_positive_cause.append(0)

    certain_pmi = pmis.get(('certain', 'negative'))
    if certain_pmi is not None and certain_pmi > 0:
        positive_pmis_certain += 1
        avg_pmi_certain += certain_pmi
        all_pmis_positive_certain.append(certain_pmi)
    else:
        all_pmis_positive_certain.append(0)

# Share of subjects with a positive PMI, then the summed positive PMIs divided
# by the number of subjects.
subject_count = len(pmis_per_subject_positive)
print(positive_pmis_cause/subject_count)
print(positive_pmis_certain/subject_count)
print(avg_pmi_cause/subject_count)
print(avg_pmi_certain/subject_count)

In [None]:
# Overlaid histograms of the per-subject cause-negative scores of both classes.
for pmi_values, class_label, n_bins in ((all_pmis_positive_cause, 'depressed', 30),
                                        (all_pmis_negative_cause, 'not depressed', 40)):
    pd.Series(pmi_values).hist(alpha=0.5, label=class_label, bins=n_bins)
plt.legend()

In [None]:
# pd.Series(all_pmis_positive_cause).describe()
pd.Series(all_pmis_negative_cause).describe()


In [None]:
# Overlaid log-scale histograms of the per-subject certain-negative scores.
for pmi_values, class_label in ((all_pmis_positive_certain, 'depressed'),
                                (all_pmis_negative_certain, 'not depressed')):
    pd.Series(pmi_values).hist(alpha=0.5, label=class_label, bins=30, log=True)
plt.xlabel("Co-occurrence scores certain-negative")
plt.legend()

In [None]:
# Overlaid log-scale histograms of the per-subject cause-negative scores.
for pmi_values, class_label in ((all_pmis_positive_cause, 'depressed'),
                                (all_pmis_negative_cause, 'not depressed')):
    pd.Series(pmi_values).hist(alpha=0.5, label=class_label, bins=30, log=True)
plt.xlabel("Co-occurrence scores cause-negative")

plt.legend()

In [None]:
pd.Series(all_pmis_negative_certain + all_pmis_positive_certain).hist(bins=30)
# pd.Series(all_pmis_negative_certain).hist(bins=30)
# pd.Series(all_pmis_positive_certain).hist(bins=30)


In [None]:
# pd.Series(all_pmis_negative_cause + all_pmis_positive_cause).hist(bins=30)
pd.Series(all_pmis_positive_cause).hist(bins=30)
# pd.Series(all_pmis_negative_cause).hist(bins=30)
