In [None]:
import csv
import numpy as np
import pandas as pd
import json
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from mittens import Mittens
import pickle

# Loading GloVe Model

In [None]:
# Dimensionality of the pre-trained GloVe vectors; it also selects which file is read below.
embeddings_dim = 100
# NOTE(review): absolute, user-specific path — the notebook only runs where this layout exists.
glove_filepath = '/home/anasab/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % embeddings_dim


In [None]:
def glove2dict(glove_filename):
    """Read a space-delimited embeddings text file into a ``{token: vector}`` dict.

    Every row is split on single spaces with CSV quoting disabled. The first
    field becomes the key and the remaining fields are parsed as floats.

    Args:
        glove_filename: Path to a UTF-8 encoded embeddings file.

    Returns:
        dict mapping each row's first field to a 1-D numpy array built from
        the rest of the row. If a key appears several times, the last row wins.
    """
    vectors = {}
    with open(glove_filename, encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in rows:
            components = [float(field) for field in row[1:]]
            vectors[row[0]] = np.array(components)
    return vectors

# Load the whole pre-trained GloVe table into memory as a {token: vector} dict.
# NOTE(review): in this notebook `pre_glove` is only referenced from commented-out code.
pre_glove = glove2dict(glove_filepath)

# Loading & preprocessing text data

In [None]:
# Load the four corpora. Each file is opened in a context manager so the handle
# is closed as soon as it has been read, instead of being left to the garbage collector.
with open('data/writings_df_clpsych_all.json') as f:
    writings_clpsych = pd.DataFrame.from_dict(json.load(f))
# NOTE(review): pickle.load can execute arbitrary code — only load pickles from a trusted source.
with open('data/writings_df_selfharm_liwc_subsets', 'rb') as f:
    writings_erisk_selfharm = pickle.load(f)
with open('data/writings_df_anorexia_liwc', 'rb') as f:
    writings_erisk_anorexia = pickle.load(f)
with open('data/writings_df_depression_liwc', 'rb') as f:
    writings_erisk_depression = pickle.load(f)

#### Select just depressed / non-depressed users?

In [None]:
# Keep only CLPsych rows whose `condition` is neither 'depression' nor 'ptsd'.
# The frames are rebound in place, so the unfiltered data must be reloaded to get it back.
writings_clpsych = writings_clpsych[~writings_clpsych['condition'].isin(['depression', 'ptsd'])]
# Keep only eRisk rows with label == 0 in each of the three datasets.
writings_erisk_selfharm = writings_erisk_selfharm[writings_erisk_selfharm['label']==0]
writings_erisk_anorexia = writings_erisk_anorexia[writings_erisk_anorexia['label']==0]
writings_erisk_depression = writings_erisk_depression[writings_erisk_depression['label']==0]

In [None]:
# One row per subject: every other column's values are joined with a single space.
# NOTE(review): " ".join requires all aggregated values to be strings — confirm the column dtypes.
writings_clpsych_grouped = writings_clpsych.groupby('subject').aggregate(lambda l: " ".join(l))

In [None]:
# Show which columns survived the per-subject aggregation.
writings_clpsych_grouped.columns

In [None]:
# Build one newline-separated string per corpus.
texts_clpsych = "\n".join(writings_clpsych.text.dropna().values)

# For the eRisk sets each row contributes text + title (element-wise string concatenation).
# Both columns must come from the SAME filtered rows: dropping NaNs from each column
# independently yields arrays of different lengths (ValueError) or silently pairs a text
# with another row's title.
_selfharm_rows = writings_erisk_selfharm.dropna()
texts_erisk_selfharm = "\n".join(_selfharm_rows.text.values + _selfharm_rows.title.values)
_anorexia_rows = writings_erisk_anorexia[['text', 'title']].dropna()
texts_erisk_anorexia = "\n".join(_anorexia_rows.text.values + _anorexia_rows.title.values)
_depression_rows = writings_erisk_depression[['text', 'title']].dropna()
texts_erisk_depression = "\n".join(_depression_rows.text.values + _depression_rows.title.values)
# NOTE(review): the self-harm frame drops rows with a NaN in ANY column, the other two only
# look at text/title — confirm which is intended. Text and title are also joined without a
# separator, so the last word of the text fuses with the first word of the title.

In [None]:
# Shared tokenizer instance; used as the default `tokenizer` argument of tokenize_tweets.
tt = TweetTokenizer()
# English stopword list from NLTK; read by tokenize_tweets when stopwords are filtered out.
sw = stopwords.words("english")
def tokenize_tweets(t, tokenizer=tt, stop=False):
    """Lower-case and tokenize ``t``, keeping only tokens made of the letters a-z.

    Args:
        t: Text to tokenize.
        tokenizer: Object exposing ``tokenize(str) -> list``; defaults to the
            module-level ``tt``.
        stop: Despite the name, a true value KEEPS stopwords. With the default
            (false), tokens found in the module-level ``sw`` list are removed.

    Returns:
        list of str: the surviving tokens in their original order.
    """
    # Compile once per call instead of going through the regex cache for every token.
    alphabetic = re.compile("^[a-z]*$")
    tokens_clean = [token for token in tokenizer.tokenize(t.lower())
                    if alphabetic.match(token)]
    if not stop:
        # Set membership is O(1); scanning the stopword list per token is needlessly slow
        # on corpus-sized inputs.
        stopword_set = frozenset(sw)
        tokens_clean = [token for token in tokens_clean
                        if token not in stopword_set]
    return tokens_clean

In [None]:
# Tokenize each corpus into one flat token list.
# stop=True means stopwords are KEPT (tokenize_tweets only filters them when stop is false).
clpsych_texts_tokenized = tokenize_tweets(texts_clpsych, stop=True)
erisk_depression_texts_tokenized = tokenize_tweets(texts_erisk_depression, stop=True)
erisk_anorexia_texts_tokenized = tokenize_tweets(texts_erisk_anorexia, stop=True)
erisk_selfharm_texts_tokenized = tokenize_tweets(texts_erisk_selfharm, stop=True)

In [None]:
# all_texts_tokenized_clean = [token.lower() for token in all_texts_tokenized if (token.lower() not in sw)
#                             and re.match("^[a-z]*$", token.lower())]

In [None]:
# oov = [token for token in all_texts_tokenized_clean if token not in pre_glove.keys()]

In [None]:
def get_rareoov(xdict, val):
    """Return the distinct items whose count is at most ``val``.

    ``xdict`` is handed to ``collections.Counter``, so it may be an iterable of
    items (counted by occurrence) or a mapping of item -> count.

    Args:
        xdict: Items to count, or a mapping of precomputed counts.
        val: Inclusive upper bound on the count of a "rare" item.

    Returns:
        list of the rare items, in the Counter's iteration order.
    """
    frequencies = Counter(xdict)
    rare_items = []
    for item, count in frequencies.items():
        if count <= val:
            rare_items.append(item)
    return rare_items
# NOTE(review): `oov` is only assigned in a commented-out cell above, so this cell raises
# NameError on a fresh kernel.
# Items of `oov` that occur at most once.
oov_rare = get_rareoov(oov, 1)
# corp_vocab = list(pre_glove.keys()) + 
# Distinct `oov` items that occur more than once.
corp_vocab = list(set(oov) - set(oov_rare))

In [None]:
# (token, count) pairs for the CLPsych corpus, most frequent first.
clpsych_vocab = Counter(clpsych_texts_tokenized).most_common()
# Displays the entire list; slice it (e.g. [:50]) to keep the output short.
clpsych_vocab

In [None]:
# (token, count) pairs for the eRisk depression corpus, most frequent first.
erisk_depression_vocab = Counter(erisk_depression_texts_tokenized).most_common()
# Displays the entire list; slice it (e.g. [:50]) to keep the output short.
erisk_depression_vocab

In [None]:
# (token, count) pairs for the eRisk anorexia corpus, most frequent first.
erisk_anorexia_vocab = Counter(erisk_anorexia_texts_tokenized).most_common()
# Displays the entire list; slice it (e.g. [:50]) to keep the output short.
erisk_anorexia_vocab

In [None]:
# (token, count) pairs for the eRisk self-harm corpus, most frequent first.
erisk_selfharm_vocab = Counter(erisk_selfharm_texts_tokenized).most_common()
# Displays the entire list; slice it (e.g. [:50]) to keep the output short.
erisk_selfharm_vocab

In [None]:
# Total number of tokens in each corpus (length of each flat token list).
print("Texts size: \nclpsych %d, \nerisk depression %d, \nerisk anorexia %d, \nerisk selfharm %d\n" % (
    len(clpsych_texts_tokenized), len(erisk_depression_texts_tokenized), len(erisk_anorexia_texts_tokenized), len(erisk_selfharm_texts_tokenized)))

In [None]:
# Number of distinct tokens in each corpus (length of each most_common() list).
print("Vocab size: \nclpsych %d, \nerisk depression %d, \nerisk anorexia %d, \nerisk selfharm %d\n" % (
    len(clpsych_vocab), len(erisk_depression_vocab), len(erisk_anorexia_vocab), 
    len(erisk_selfharm_vocab)))

In [None]:
# Concatenate the four corpora into one string. Each corpus is itself newline-separated, so
# the corpora are joined with a newline too; plain `+` would fuse the last line of one
# corpus with the first line of the next.
all_texts = "\n".join([texts_clpsych, texts_erisk_depression, texts_erisk_anorexia, texts_erisk_selfharm])

In [None]:
# Token frequencies over all four tokenized corpora combined.
all_vocab = Counter(clpsych_texts_tokenized + erisk_depression_texts_tokenized + 
                    erisk_anorexia_texts_tokenized + erisk_selfharm_texts_tokenized)
# Number of distinct tokens across all corpora.
len(all_vocab)

In [None]:
# Persist the concatenated raw text. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open("all_texts_clpsych_erisk.pkl", "wb+") as f:
    pickle.dump(all_texts, f)

In [None]:
# Persist the self-harm corpus text. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open("texts_erisk_selfharm.pkl", "wb+") as f:
    pickle.dump(texts_erisk_selfharm, f)

In [None]:
# Persist the combined token Counter. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open("all_vocab_clpsyck_erisk_stop.pkl", "wb+") as f:
    pickle.dump(all_vocab, f)

In [None]:
# The 40000 most frequent tokens over all corpora, as (token, count) pairs.
all_vocab_40000 = Counter(clpsych_texts_tokenized + erisk_depression_texts_tokenized +
                    erisk_anorexia_texts_tokenized + erisk_selfharm_texts_tokenized).most_common(40000)
# Counter keys are already unique, so no set() de-duplication is needed; dropping it keeps
# the saved list in frequency order and makes the pickle reproducible between runs
# (set iteration order for strings changes with hash randomization).
with open("all_vocab_clpsych_erisk_stop_40000.pkl", "wb+") as f:
    pickle.dump([k for k, v in all_vocab_40000], f)

In [None]:
# NOTE(review): `all_vocab_10000` is not assigned anywhere in the visible notebook (only
# `all_vocab_40000` is), so this cell raises NameError on a fresh kernel.
[k for k,v in all_vocab_10000]

In [None]:
# NOTE(review): `clpsych_vocab_negative` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel.
clpsych_vocab_negative[30:100]

# Fine-tuning with Mittens

In [None]:
from mittens import Mittens

In [None]:
# Load the pickled GloVe embeddings; the context manager closes the file right after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("original_glove_clpsych_erisk_stop_20000.pkl", "rb") as f:
    glove_embeddings = pickle.load(f)

In [None]:
# mittens_embeddings = pickle.load(open("finetuned_glove_clpsych_erisk_20000.pkl", "rb"))
# Load the pickled fine-tuned embeddings; the context manager closes the file right after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("embeddings/finetuned_glove_clpsych_erisk_stop_20000_2.pkl", "rb") as f:
    mittens_embeddings = pickle.load(f)

In [None]:
# Spot-check: show the fine-tuned vector stored under the key 'me'.
mittens_embeddings['me']

In [None]:
def embeddings_to_csv(embeddings_dict, outfile_path="mittens_embeddings.tsv",
                      metadata_path="mittens_embeddings_meta.tsv", separator="\t"):
    """Write embeddings to a delimited text file, optionally with a separate word list.

    When ``metadata_path`` is truthy, the vectors file holds only the vector
    components (one vector per line) and the keys are written, in the same
    order, one per line to ``metadata_path``. When it is falsy, each line of the
    vectors file starts with the key followed by the components.

    Both files are written as UTF-8, matching how ``glove2dict`` reads its
    input, so non-ASCII keys do not depend on the platform's default encoding.

    Args:
        embeddings_dict: Mapping of word -> iterable of numbers.
        outfile_path: Destination of the vectors file (overwritten).
        metadata_path: Destination of the word list, or a falsy value to embed
            the word in the vectors file instead.
        separator: Field separator used inside the vectors file.

    Returns:
        None.
    """
    with_metadata = bool(metadata_path)
    words = []
    with open(outfile_path, "w", encoding="utf-8") as vectors_file:
        for key, vector in embeddings_dict.items():
            fields = [str(n) for n in vector]
            if with_metadata:
                # Keys go to the metadata file; remember them in row order.
                words.append(key)
            else:
                fields.insert(0, key)
            vectors_file.write(separator.join(fields) + "\n")
    if with_metadata:
        with open(metadata_path, "w", encoding="utf-8") as metadata_file:
            metadata_file.writelines(word + "\n" for word in words)
        

In [None]:
# Export the GloVe embeddings as a vectors TSV plus a one-word-per-line metadata TSV.
embeddings_to_csv(glove_embeddings, "glove_clpsych_erisk_stop_embeddings_20000.tsv", 
                  "glove_clpsych_erisk_stop_meta_20000.tsv")

In [None]:
# Export the fine-tuned embeddings as a vectors TSV plus a one-word-per-line metadata TSV.
# NOTE(review): the output names say "positive" while `mittens_embeddings` was loaded from
# a file without that marker — confirm the names match the data.
embeddings_to_csv(mittens_embeddings, "mittens_clpsych_erisk_stop_positive_embeddings_20000.tsv", 
                  "mittens_clpsych_erisk_stop_positive_meta_20000.tsv")

In [None]:
# Load a previously saved vocabulary; the context manager closes the file right after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("vocab_clpsych_10000.pkl", "rb") as f:
    vocab = pickle.load(f)

In [None]:
# Display the whole vocabulary in sorted order.
sorted(vocab)

## Analyzing the embedding space

In [None]:
def get_embeddings_matrix(embeddings_dict):
    """Stack the dictionary's values into one numpy array.

    Args:
        embeddings_dict: Mapping whose values are equal-length vectors.

    Returns:
        numpy array with one row per entry, in the dictionary's iteration order.
    """
    rows = [vector for vector in embeddings_dict.values()]
    return np.array(rows)
def get_embeddings_dict(embeddings_matrix, embeddings_keys):
    """Pair each key with the matrix row at the same position.

    Args:
        embeddings_matrix: Indexable collection of vectors (row ``i`` belongs
            to the ``i``-th key).
        embeddings_keys: Iterable of keys, in row order.

    Returns:
        dict mapping every key to ``embeddings_matrix[i]``.
    """
    return {key: embeddings_matrix[position]
            for position, key in enumerate(embeddings_keys)}

In [None]:
# Round-trip check: dict -> matrix -> dict using the original keys; displays the result.
get_embeddings_dict(get_embeddings_matrix(glove_embeddings), glove_embeddings.keys())

In [None]:
# Display the loaded GloVe embeddings object in full.
glove_embeddings

In [None]:
def normalize_embeddings(embedding_matrix):
    """Standardize each column of the matrix to zero mean and unit variance.

    The mean and standard deviation are taken over axis 0, i.e. separately for
    every embedding dimension. A dimension with zero standard deviation (all
    rows equal) is mapped to 0 instead of dividing by zero and producing NaN.

    Args:
        embedding_matrix: 2-D array-like with one vector per row.

    Returns:
        numpy array of the same shape holding the standardized values.
    """
    emb_mean = np.mean(embedding_matrix, axis=0)
    emb_std = np.std(embedding_matrix, axis=0)
    # The numerator is exactly 0 for a constant column, so dividing by 1 yields 0.
    safe_std = np.where(emb_std == 0, 1.0, emb_std)
    return (embedding_matrix - emb_mean) / safe_std

In [None]:
# Overlay two histograms of the per-word mean component (mean over axis 1):
# first for the raw GloVe vectors, then for the column-standardized ones.
pd.Series(np.array(list(glove_embeddings.values())).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(normalize_embeddings(np.array(list(glove_embeddings.values()))).mean(axis=1)).hist(alpha=0.5, bins=50)

In [None]:
# Same overlay for the fine-tuned embeddings: raw vs. column-standardized per-word means.
pd.Series(np.array(list(mittens_embeddings.values())).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(normalize_embeddings(np.array(list(mittens_embeddings.values()))).mean(axis=1)).hist(alpha=0.5, bins=50)

In [None]:
# Same overlay for a second fine-tuned embedding set.
# NOTE(review): `mittens_embeddings2` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel.
pd.Series(np.array(list(mittens_embeddings2.values())).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(normalize_embeddings(np.array(list(mittens_embeddings2.values()))).mean(axis=1)).hist(alpha=0.5, bins=50)

In [None]:
# Column-standardize the GloVe vectors and re-attach the original keys in the same order.
glove_embeddings_normalized = get_embeddings_dict(normalize_embeddings(
    get_embeddings_matrix(glove_embeddings)), glove_embeddings.keys())
# Displays the whole normalized dictionary.
glove_embeddings_normalized

In [None]:
# Column-standardize the fine-tuned vectors and re-attach the original keys in the same order.
mittens_embeddings_normalized = get_embeddings_dict(normalize_embeddings(
    get_embeddings_matrix(mittens_embeddings)), mittens_embeddings.keys())
# Displays the whole normalized dictionary.
mittens_embeddings_normalized

In [None]:
# Column-standardize the second fine-tuned embedding set. The matrix and the keys must come
# from the SAME dictionary; pairing `mittens_embeddings` rows with `mittens_embeddings2`
# keys attaches vectors to the wrong words (or fails if the sizes differ).
# NOTE(review): `mittens_embeddings2` is not assigned anywhere in the visible notebook.
mittens_embeddings2_normalized = get_embeddings_dict(normalize_embeddings(
    get_embeddings_matrix(mittens_embeddings2)), mittens_embeddings2.keys())
mittens_embeddings2_normalized

In [None]:
# Overlay histograms of per-word mean components: fine-tuned vectors before and after
# normalization (this time read from the stored normalized dictionary).
pd.Series(np.array(list(mittens_embeddings.values())).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(np.array(list(mittens_embeddings_normalized.values())).mean(axis=1)).hist(alpha=0.5, bins=50)


In [None]:
# Overlay histograms of per-word mean components: GloVe vectors before and after
# normalization (read from the stored normalized dictionary).
pd.Series(np.array(list(glove_embeddings.values())).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(np.array(list(glove_embeddings_normalized.values())).mean(axis=1)).hist(alpha=0.5, bins=50)

In [None]:
# pickle.dump(glove_embeddings_normalized, open("original_glove_clpsych_erisk_normalized_20000.pkl", "wb+"))
# Persist the normalized fine-tuned embeddings. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("finetuned_glove_clpsych_erisk_stop_normalized_20000.pkl", "wb+") as f:
    pickle.dump(mittens_embeddings_normalized, f)
# pickle.dump(mittens_embeddings2_normalized, open("finetuned_glove_clpsych_erisk_normalized_2_20000.pkl", "wb+"))

In [None]:
# Export each normalized embedding set as a vectors TSV plus a word-list (metadata) TSV.
embeddings_to_csv(glove_embeddings_normalized, "glove_clpsych_erisk_normalized_embeddings_20000.tsv",
                  "glove_clpsych_erisk_normalized_meta_20000.tsv")
embeddings_to_csv(mittens_embeddings_normalized, "mittens_clpsych_erisk_normalized_embeddings_20000.tsv",
                  "mittens_clpsych_erisk_normalized_meta_20000.tsv")
# The metadata name previously contained a stray "2]" ("mittens2]_clpsych_..."); it now
# follows the same pattern as its vectors file.
embeddings_to_csv(mittens_embeddings2_normalized, "mittens_clpsych_erisk2_normalized_embeddings_20000.tsv",
                  "mittens_clpsych_erisk2_normalized_meta_20000.tsv")


In [None]:
# List the embedding keys that are English stopwords. The stopword list is loaded once into
# a set; calling stopwords.words("english") inside the comprehension re-reads the list and
# scans it linearly for every single key.
# NOTE(review): `mittens_embeddings2` is not assigned anywhere in the visible notebook.
_english_stopwords = set(stopwords.words("english"))
[w for w in mittens_embeddings2.keys() if w in _english_stopwords]

In [None]:
# Sum of the per-word mean components of the normalized fine-tuned vectors.
# Sorting first only changes the floating-point summation order, not which values are summed.
sum(sorted(np.array(list(mittens_embeddings_normalized.values())).mean(axis=1).tolist(), reverse=True))

## Embeddings comparison - feature selection

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Load the "positive" fine-tuned embeddings; the context manager closes the file after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("finetuned_glove_clpsych_erisk_positive_stop_20000.pkl", "rb") as f:
    embeddings_positive = pickle.load(f)

In [None]:
# Load the "negative" fine-tuned embeddings; the context manager closes the file after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("finetuned_glove_clpsych_erisk_negative_stop_20000.pkl", "rb") as f:
    embeddings_negative = pickle.load(f)

In [None]:
def write_word2vec_format(embeddings, filepath, voc=None, size=None, sep=" "):
    """Write embeddings in word2vec text format.

    The file starts with a ``<voc><sep><size>`` header line, followed by one
    ``<key><sep><component>...`` line per entry. No trailing newline is written.

    The header is derived from ``embeddings`` unless given explicitly. A
    hard-coded count that disagrees with the number of lines written makes the
    file fail to load, or load truncated, in readers that trust the header.

    Args:
        embeddings: Mapping of word -> sequence of numbers.
        filepath: Destination path (overwritten), written as UTF-8.
        voc: Vocabulary size for the header; defaults to ``len(embeddings)``.
        size: Vector dimensionality for the header; defaults to the length of
            the first vector (0 for an empty mapping).
        sep: Separator used in the header and between fields.

    Returns:
        None.
    """
    if voc is None:
        voc = len(embeddings)
    if size is None:
        size = len(next(iter(embeddings.values()))) if embeddings else 0
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(sep.join([str(voc), str(size)]))
        for key, vector in embeddings.items():
            # The newline goes BEFORE each row so the file ends without a blank line.
            f.write("\n")
            f.write(key + sep + sep.join(str(n) for n in vector))
            

In [None]:
# Write both embedding sets as word2vec-format text files so they can be loaded below.
write_word2vec_format(embeddings_positive, "finetuned_clpsych_erisk_positive_stop_20000.wv")
write_word2vec_format(embeddings_negative, "finetuned_clpsych_erisk_negative_stop_20000.wv")

In [None]:
# Load the two text-format files written above as gensim KeyedVectors.
model_positive = KeyedVectors.load_word2vec_format("finetuned_clpsych_erisk_positive_stop_20000.wv", binary=False)
model_negative = KeyedVectors.load_word2vec_format("finetuned_clpsych_erisk_negative_stop_20000.wv", binary=False)

In [None]:
# Nearest neighbours of "rituals" in the positive model (library-default number of results).
model_positive.most_similar("rituals")

In [None]:
# Ten nearest neighbours of "rituals" in the negative model, for comparison.
model_negative.most_similar("rituals", topn=10)

In [None]:
# For every word present in both embedding sets, count how many of its `nr_neighbors`
# nearest neighbours are shared between the positive and the negative model.
nr_neighbors = 100
# Thresholds are fractions of `nr_neighbors`: below the lower one a word counts as
# "uneven", above the upper one as "even".
overlap_thresh = 0.1
overlap_thresh_up = 0.75
head_words_uneven = set()
head_words_even = set()
neighbors_uneven = []
overlaps = []

for word in set(embeddings_positive.keys()).intersection(embeddings_negative.keys()):
    neighbors_positive = model_positive.most_similar(word, topn=nr_neighbors)
    neighbors_negative = model_negative.most_similar(word, topn=nr_neighbors)
    # Compare neighbour words only; the similarity scores are ignored.
    words_positive = {neighbor for neighbor, _ in neighbors_positive}
    words_negative = {neighbor for neighbor, _ in neighbors_negative}
    overlap = len(words_positive & words_negative)
    overlaps.append(overlap)
    if overlap < overlap_thresh*nr_neighbors:
        head_words_uneven.add(word)
    if overlap > overlap_thresh_up*nr_neighbors:
        head_words_even.add(word)

In [None]:
# Summary statistics of the neighbour-overlap counts.
pd.Series(overlaps).describe()

In [None]:
# Number of words whose neighbour overlap exceeded the upper threshold.
len(head_words_even)

In [None]:
# Save the "even" head words that are also common words.
# NOTE(review): `common_words` is first assigned in a cell further down, so on a fresh
# kernel this cell raises NameError unless that cell is run first.
common_words_even = set(head_words_even).intersection(common_words)
# Context manager so the file is flushed and closed even if pickling fails.
with open("common_words_even_neighbors_overlap.pkl", "wb+") as f:
    pickle.dump(common_words_even, f)

In [None]:
# Save the "uneven" head words that are also common words.
# NOTE(review): `common_words` is first assigned in a cell further down, so on a fresh
# kernel this cell raises NameError unless that cell is run first.
common_words_uneven = set(head_words_uneven).intersection(common_words)
# Context manager so the file is flushed and closed even if pickling fails.
with open("common_words_uneven_neighbors_overlap.pkl", "wb+") as f:
    pickle.dump(common_words_uneven, f)

In [None]:
# Display the common words whose neighbourhoods barely overlap between the two models.
# NOTE(review): `common_words` is first assigned in a cell further down.
set(head_words_uneven).intersection(common_words)


In [None]:
from matplotlib import pyplot as plt
from scipy.stats import pearsonr, spearmanr

def plot_scatter(word, topn):
    """Scatter-plot a word's neighbour distances in the negative vs. positive model.

    Takes the ``topn`` most similar entries to ``word`` in ``model_negative``,
    keeps those also present in ``model_positive``, and plots each neighbour's
    distance to ``word`` in the negative model (x) against the positive model (y).

    Args:
        word: Query word.
        topn: Number of neighbours requested from ``model_negative``.
    """
    shared_neighbors = [
        neighbor
        for neighbor, _similarity in model_negative.most_similar(word, topn=topn)
        if neighbor in model_positive
    ]
    distances_negative = [model_negative.distance(word, neighbor) for neighbor in shared_neighbors]
    distances_positive = [model_positive.distance(word, neighbor) for neighbor in shared_neighbors]
    plt.scatter(distances_negative, distances_positive)
def get_correlation(word, topn, func=pearsonr):
    """Correlate a word's neighbour distances between the two models.

    Uses the ``topn`` most similar entries to ``word`` in ``model_negative``
    that also exist in ``model_positive`` and compares their distances to
    ``word`` in both models.

    Args:
        word: Query word.
        topn: Number of neighbours requested from ``model_negative``.
        func: Callable taking two sequences; element 0 of its result is returned.

    Returns:
        The first element of ``func``'s result, or None when ``word`` is
        missing from either model.
    """
    if not (word in model_positive and word in model_negative):
        return None
    shared_neighbors = [
        neighbor
        for neighbor, _similarity in model_negative.most_similar(word, topn=topn)
        if neighbor in model_positive
    ]
    distances_negative = [model_negative.distance(word, neighbor) for neighbor in shared_neighbors]
    distances_positive = [model_positive.distance(word, neighbor) for neighbor in shared_neighbors]
    result = func(distances_negative, distances_positive)
    return result[0]
# Example: scatter the distances for "rituals" over 1000 neighbours, then show the
# Pearson statistic for the same neighbour set.
plot_scatter("rituals", 1000)
get_correlation("rituals", 1000, pearsonr)

In [None]:
# Sweep the neighbourhood size and print the summed correlation over a few seed words.
seed_vocab = {"sad", "happy", "depressed", "me", "you", "i", "health"}
for topn in range(10, 1500, 50):
    corrs = []
    for w in seed_vocab:
        corr = get_correlation(w, topn)
        # get_correlation returns None for a word missing from either model; a None in
        # the list would make sum() raise TypeError, so such words are skipped.
        if corr is not None:
            corrs.append(corr)
    print(topn, sum(corrs))

In [None]:
# Correlation over the 100 nearest neighbours for every word of the negative model;
# the value is None for words get_correlation cannot score.
# NOTE(review): `.vocab` looks like the gensim 3.x attribute; newer gensim releases expose
# `key_to_index` instead — confirm against the installed version.
correlations = {}
for word in model_negative.vocab:
    correlations[word] = get_correlation(word, 100)

In [None]:
# Common words outside the "uneven" set, sorted by ascending correlation.
# A missing (None) or zero correlation sorts as 0.
# NOTE(review): `common_words` is first assigned in a cell further down.
sorted_anticorrelated = sorted([(w,c) for w,c in
    correlations.items()
       if w in common_words
       and w not in head_words_uneven],
        key=lambda t: t[1] if t[1] else 0)

In [None]:
# All common words sorted by descending correlation.
# A missing (None) or zero correlation sorts as 0.
# NOTE(review): `common_words` is first assigned in a cell further down.
sorted_correlated = sorted([(w,c) for w,c in
    correlations.items()
       if w in common_words],
        key=lambda t: t[1] if t[1] else 0, reverse=True)

In [None]:
# Display the (word, correlation) pairs, lowest correlation first.
sorted_anticorrelated

In [None]:
# Display the (word, correlation) pairs, highest correlation first.
sorted_correlated

In [None]:
# Save the common words ordered from most to least correlated (words only, no scores).
# Context manager so the file is flushed and closed even if pickling fails.
with open("common_words_uneven_neighbors_correlated2.pkl", "wb+") as f:
    pickle.dump([k for k, v in sorted_correlated], f)

In [None]:
from liwc_readDict import readDict

# Read the LIWC dictionary as (word, category) pairs.
# NOTE(review): absolute, user-specific path — the notebook only runs where this file exists.
liwc = readDict('/home/anasab/resources/liwc.dic')
categories = [c for (w,c) in liwc]
# Group the words by category, keeping the order in which readDict returned them.
# (A bare `set(categories)` in the middle of the cell was never displayed and had no
# effect, so it was dropped.)
liwc_dict = {}
for (w, c) in liwc:
    liwc_dict.setdefault(c, []).append(w)

In [None]:
# Reload the combined token Counter and keep its 1000 most frequent tokens.
# NOTE(review): the notebook saves this Counter as "all_vocab_clpsyck_erisk_stop.pkl" but
# loads "all_vocab_clpsyck_erisk.pkl" here — confirm which file is intended.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("all_vocab_clpsyck_erisk.pkl", "rb") as f:
    all_vocab = pickle.load(f)
common_words = {k for k, v in all_vocab.most_common(1000)}

In [None]:
# Display the set of the 1000 most frequent tokens.
common_words