In [2]:
import os
import re
import json
import liwc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import word_tokenize
from collections import defaultdict, Counter

In [3]:
# Load the LIWC 2015 English dictionary. `LIWC_parse` maps a token to its LIWC
# categories; `category_names` supplies the keys of the count tables built below.
LIWC_parse, category_names = liwc.load_token_parser('../LIWC2015_English.dic')

In [5]:
# Sanity check: materialise the categories the parser yields for one token.
list(LIWC_parse("accept"))

['verb (Verbs)',
 'affect (Affect)',
 'posemo (Positive Emotions)',
 'cogproc (Cognitive Processes)',
 'insight (Insight)']

In [6]:
# Read the two screened-post files as lists of lines (readlines() keeps each
# line's trailing "\n"). Presumably one post per line -- confirm against the
# code that wrote these files.
# The context managers close both handles right away; the bare open(...).readlines()
# form left them open until garbage collection.
with open("./processed/combined_maxsim16/depress_sel_posts2.txt") as sel_file:
    depress_sel_posts = sel_file.readlines()
with open("./processed/combined_maxsim16/depress_non_sel_posts2.txt") as non_sel_file:
    depress_non_sel_posts = non_sel_file.readlines()
len(depress_sel_posts), len(depress_non_sel_posts)

(425, 423)

In [8]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("processed/miniLM_L6_embs.pkl", "rb") as f:
    data = pickle.load(f)

# Pull the four splits out of the loaded object in one go; the "*_labels"
# entries are bound to the *_tags names.
train_posts, train_tags, test_posts, test_tags = (
    data[key]
    for key in ("train_posts", "train_labels", "test_posts", "test_labels")
)

In [18]:
# Each entry is iterated as a collection of indices into test_posts and is
# paired positionally with an entry of test_tags when the post lists are built.
test_mappings = data["test_mappings"]

In [19]:
# Screened set = selected + non-selected posts read from disk.
depress_screen_posts = depress_sel_posts + depress_non_sel_posts

# Every test post reachable through a mapping whose tag is 1
# (treated in this notebook as the depression group).
depress_all_posts = [
    test_posts[post_idx]
    for post_indices, label in zip(test_mappings, test_tags)
    if label == 1
    for post_idx in post_indices
]
len(depress_screen_posts), len(depress_all_posts)

(848, 18729)

In [28]:
# Keep the tag-1 posts that are NOT part of the screened set.
#
# Screened posts come from readlines(), so they normally end in "\n" -- but the
# final line of a file has no terminator when the file does not end with a
# newline. The previous lookup (`flattened + "\n" in set_of_lines`) could never
# match such a line, leaving that post in both the screened and the non-screened
# group. Comparing newline-stripped text handles both cases.
screened_post_texts = {post.rstrip("\n") for post in depress_screen_posts}

# Flatten embedded newlines once per post (the screened files hold single-line
# text), then filter on the flattened form.
depress_non_screen_posts = [
    flattened
    for flattened in (post.replace("\n", " ") for post in depress_all_posts)
    if flattened not in screened_post_texts
]
len(depress_non_screen_posts)

17883

In [10]:
# Tokenise each screened group as a single lower-cased, newline-joined document.
depress_sel_post_words, depress_non_sel_post_words = (
    word_tokenize("\n".join(group_posts).lower())
    for group_posts in (depress_sel_posts, depress_non_sel_posts)
)

# The full screened token list is the concatenation of both groups.
depress_screen_post_words = [*depress_sel_post_words, *depress_non_sel_post_words]
len(depress_screen_post_words)

61475

In [38]:
# Token totals of the selected / non-selected groups; these are the `nobs`
# passed to the sel-vs-non_sel proportion z-test.
len(depress_sel_post_words), len(depress_non_sel_post_words)

(35231, 26244)

In [29]:
# Tokenise all non-screened posts as one lower-cased, newline-joined document
# (leading/trailing whitespace of the joined text is stripped first).
depress_non_screen_post_words = word_tokenize("\n".join(depress_non_screen_posts).lower().strip())

In [30]:
# Token total of the non-screened group (used as `nobs` in the z-tests).
len(depress_non_screen_post_words)

802144

In [31]:
# One zero-initialised tally per post group, keyed by LIWC category name.
depress_sel_post_liwc = dict.fromkeys(category_names, 0)
depress_non_sel_post_liwc = dict.fromkeys(category_names, 0)
depress_screen_post_liwc = dict.fromkeys(category_names, 0)
depress_non_screen_post_liwc = dict.fromkeys(category_names, 0)
def fill_liwc(words, liwc_counts, parse=None):
    """Add the LIWC category hits of ``words`` to ``liwc_counts`` in place.

    Args:
        words: iterable of hashable tokens (strings in this notebook).
        liwc_counts: mutable mapping ``category -> running count``. Every
            category the parser can yield must already be a key; with a plain
            dict a missing key raises ``KeyError``.
        parse: callable mapping one token to an iterable of categories.
            Defaults to the notebook-level ``LIWC_parse`` (looked up at call
            time), so existing two-argument calls behave as before.

    Returns:
        None. ``liwc_counts`` is updated in place, so a cell that ends with
        this call still displays no output.
    """
    if parse is None:
        parse = LIWC_parse
    # Parse each distinct token once and weight its categories by the token's
    # frequency. Totals are identical to parsing every occurrence, but the
    # parser runs once per vocabulary entry instead of once per token.
    for token, freq in Counter(words).items():
        for cat in parse(token):
            liwc_counts[cat] += freq
# Accumulate category hits for each group into its own tally.
for group_tokens, group_tally in (
    (depress_sel_post_words, depress_sel_post_liwc),
    (depress_non_sel_post_words, depress_non_sel_post_liwc),
    (depress_screen_post_words, depress_screen_post_liwc),
    (depress_non_screen_post_words, depress_non_screen_post_liwc),
):
    fill_liwc(group_tokens, group_tally)

In [33]:
# Rebind each tally to a pandas Series (index = LIWC category) for tabulation.
(
    depress_sel_post_liwc,
    depress_non_sel_post_liwc,
    depress_screen_post_liwc,
    depress_non_screen_post_liwc,
) = map(
    pd.Series,
    (
        depress_sel_post_liwc,
        depress_non_sel_post_liwc,
        depress_screen_post_liwc,
        depress_non_screen_post_liwc,
    ),
)

In [34]:
# Raw LIWC hit counts: one row per category, one column per post group.
liwc_count_columns = dict(
    sel=depress_sel_post_liwc,
    non_sel=depress_non_sel_post_liwc,
    screen=depress_screen_post_liwc,
    non_screen=depress_non_screen_post_liwc,
)
all_df = pd.DataFrame(liwc_count_columns)
all_df

Unnamed: 0,sel,non_sel,screen,non_screen
function (Function Words),18421,13297,31718,382526
pronoun (Pronouns),6400,4412,10812,120096
ppron (Personal Pronouns),4177,2883,7060,75826
i (I),2868,1918,4786,39330
we (We),79,74,153,3317
...,...,...,...,...
swear (Swear),100,63,163,2683
netspeak (Netspeak),45,49,94,2665
assent (Assent),59,60,119,2211
nonflu (Nonfluencies),53,43,96,1599


In [35]:
# Express every count as a share of its column total. The divisor is the sum of
# category hits in that column, not the number of tokens in the group.
column_totals = all_df.sum(axis=0)
all_df_norm = all_df.div(column_totals, axis="columns")
all_df_norm

Unnamed: 0,sel,non_sel,screen,non_screen
function (Function Words),0.186702,0.185975,0.186397,0.186108
pronoun (Pronouns),0.064866,0.061707,0.063539,0.058430
ppron (Personal Pronouns),0.042335,0.040322,0.041489,0.036891
i (I),0.029068,0.026826,0.028126,0.019135
we (We),0.000801,0.001035,0.000899,0.001614
...,...,...,...,...
swear (Swear),0.001014,0.000881,0.000958,0.001305
netspeak (Netspeak),0.000456,0.000685,0.000552,0.001297
assent (Assent),0.000598,0.000839,0.000699,0.001076
nonflu (Nonfluencies),0.000537,0.000601,0.000564,0.000778


In [36]:
# Save the normalised LIWC table to CSV.
# NOTE: a later cell writes to this same path and overwrites this file.
all_df_norm.to_csv("./LIWC_normalized.csv")

In [37]:
from statsmodels.stats.proportion import proportions_ztest

In [48]:
# Proportion z-test per LIWC category: selected vs. non-selected screened posts.
# `count` holds the category hits of each group, `nobs` the groups' token totals.
sel_pair_nobs = [len(depress_sel_post_words), len(depress_non_sel_post_words)]
for dim in ["i (I)", "posemo (Positive Emotions)", "negemo (Negative Emotions)", "sad (Sad)", "health (Health)", "anx (Anx)"]:
    sel_pair_hits = [all_df.loc[dim, group] for group in ("sel", "non_sel")]
    z_stat, pval = proportions_ztest(count=sel_pair_hits, nobs=sel_pair_nobs)
    print(dim, pval)

i (I) 0.00013943763033740197
posemo (Positive Emotions) 0.5834230566940983
negemo (Negative Emotions) 0.08898760629719134
sad (Sad) 0.010833097620077574
health (Health) 1.1624039242552796e-09
anx (Anx) 0.09349313153587031


In [49]:
# Proportion z-test per LIWC category: screened vs. non-screened posts.
# Group sizes are token totals and do not change across categories.
screen_group_sizes = [len(depress_screen_post_words), len(depress_non_screen_post_words)]
screen_test_dims = ["i (I)", "posemo (Positive Emotions)", "negemo (Negative Emotions)", "sad (Sad)", "health (Health)", "anx (Anx)"]
for dim in screen_test_dims:
    screen_hits = all_df.loc[dim, "screen"]
    non_screen_hits = all_df.loc[dim, "non_screen"]
    test_result = proportions_ztest(
        count=[screen_hits, non_screen_hits],
        nobs=screen_group_sizes,
    )
    # proportions_ztest returns (statistic, p-value); only the p-value is reported.
    pval = test_result[1]
    print(dim, pval)

i (I) 8.423241160228661e-215
posemo (Positive Emotions) 0.015820488819570006
negemo (Negative Emotions) 3.771591691360736e-106
sad (Sad) 6.039499579032894e-172
health (Health) 1.5604994344610583e-55
anx (Anx) 4.984018015264887e-21


In [50]:
# Every test post reachable through a mapping whose tag is 0
# (treated in this notebook as the non-depression group).
non_depress_all_posts = [
    test_posts[post_idx]
    for post_indices, label in zip(test_mappings, test_tags)
    if label == 0
    for post_idx in post_indices
]

In [51]:
# Tokenise all tag-0 posts as one lower-cased, newline-joined document
# (leading/trailing whitespace of the joined text is stripped first).
non_depress_post_words = word_tokenize("\n".join(non_depress_all_posts).lower().strip())

In [52]:
# Zero-initialised tally for the tag-0 posts, then accumulate their category hits.
non_depress_post_liwc = dict.fromkeys(category_names, 0)
fill_liwc(non_depress_post_words, non_depress_post_liwc)

In [54]:
# Rebuild the raw count table with an extra column for the tag-0 posts.
# This rebinds `all_df`; `non_depress_post_liwc` is passed as built by its own cell.
liwc_counts_by_group = dict(
    sel=depress_sel_post_liwc,
    non_sel=depress_non_sel_post_liwc,
    screen=depress_screen_post_liwc,
    non_screen=depress_non_screen_post_liwc,
    non_depress=non_depress_post_liwc,
)
all_df = pd.DataFrame(liwc_counts_by_group)
all_df

Unnamed: 0,sel,non_sel,screen,non_screen,non_depress
function (Function Words),18421,13297,31718,382526,3339969
pronoun (Pronouns),6400,4412,10812,120096,903445
ppron (Personal Pronouns),4177,2883,7060,75826,524049
i (I),2868,1918,4786,39330,240230
we (We),79,74,153,3317,35660
...,...,...,...,...,...
swear (Swear),100,63,163,2683,25967
netspeak (Netspeak),45,49,94,2665,25771
assent (Assent),59,60,119,2211,19483
nonflu (Nonfluencies),53,43,96,1599,13873


In [55]:
# Column-wise shares for the five-column table. Each column is divided by its
# own sum of category hits (not by the group's token count).
hits_per_group = all_df.sum(axis=0)
all_df_norm = all_df.div(hits_per_group, axis="columns")
all_df_norm

Unnamed: 0,sel,non_sel,screen,non_screen,non_depress
function (Function Words),0.186702,0.185975,0.186397,0.186108,0.186198
pronoun (Pronouns),0.064866,0.061707,0.063539,0.058430,0.050366
ppron (Personal Pronouns),0.042335,0.040322,0.041489,0.036891,0.029215
i (I),0.029068,0.026826,0.028126,0.019135,0.013392
we (We),0.000801,0.001035,0.000899,0.001614,0.001988
...,...,...,...,...,...
swear (Swear),0.001014,0.000881,0.000958,0.001305,0.001448
netspeak (Netspeak),0.000456,0.000685,0.000552,0.001297,0.001437
assent (Assent),0.000598,0.000839,0.000699,0.001076,0.001086
nonflu (Nonfluencies),0.000537,0.000601,0.000564,0.000778,0.000773


In [56]:
# Save the normalised table again, now including the non_depress column.
# This overwrites the CSV written earlier to the same path.
all_df_norm.to_csv("./LIWC_normalized.csv")

In [57]:
# Proportion z-test per LIWC category: tag-0 posts vs. non-screened tag-1 posts.
# `nobs` are the token totals of the two groups, in the same order as `count`.
control_pair_nobs = [len(non_depress_post_words), len(depress_non_screen_post_words)]
for dim in ["i (I)", "posemo (Positive Emotions)", "negemo (Negative Emotions)", "sad (Sad)", "health (Health)", "anx (Anx)"]:
    non_depress_hits = all_df.loc[dim, "non_depress"]
    non_screen_hits = all_df.loc[dim, "non_screen"]
    z_stat, pval = proportions_ztest(
        nobs=control_pair_nobs,
        count=[non_depress_hits, non_screen_hits],
    )
    print(dim, pval)

i (I) 0.0
posemo (Positive Emotions) 2.3699360445640995e-84
negemo (Negative Emotions) 7.62803312940832e-09
sad (Sad) 6.348395427679877e-34
health (Health) 3.49825813041458e-27
anx (Anx) 1.4605760325009146e-42
