In [None]:
import csv
import numpy as np
import pandas as pd
import json
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from mittens import Mittens
import pickle

# Loading GloVe Model

In [None]:
# Dimensionality of the GloVe Twitter vectors to load; it selects the file below.
embeddings_dim = 100
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
glove_filepath = (
    '/home/anasab/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt'
    % (embeddings_dim,)
)


In [None]:
def glove2dict(glove_filename):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Each non-empty line is split on single spaces: the first field is the
    token and the remaining fields are parsed as floats.

    Parameters
    ----------
    glove_filename : str
        Path to a UTF-8 encoded GloVe text file.

    Returns
    -------
    dict
        Maps each token to a 1-D ``numpy.ndarray`` of floats.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    with open(glove_filename, encoding='utf-8') as f:
        # QUOTE_NONE keeps quote characters inside tokens from being
        # interpreted as CSV quoting.
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        # csv.reader yields [] for blank lines; skip them instead of failing
        # on line[0] with an IndexError.
        embed = {line[0]: np.array([float(value) for value in line[1:]])
                 for line in reader if line}
    return embed

# Parse the whole GloVe file into a dict mapping each token to its vector.
pre_glove = glove2dict(glove_filepath)

# Loading & preprocessing text data

In [None]:
# Load the four corpora. Every file is opened in a `with` block so its handle
# is closed as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('writings_df_clpsych_all.json') as data_file:
    writings_clpsych = pd.DataFrame.from_dict(json.load(data_file))
with open('writings_df_selfharm_liwc_subsets', 'rb') as data_file:
    writings_erisk_selfharm = pickle.load(data_file)
with open('writings_df_anorexia_liwc', 'rb') as data_file:
    writings_erisk_anorexia = pickle.load(data_file)
with open('writings_df_depression_liwc', 'rb') as data_file:
    writings_erisk_depression = pickle.load(data_file)

#### Keep only the negative class: CLPsych rows without a depression/PTSD condition, eRisk rows with label 0

In [None]:
# Restrict every corpus to its negative rows: CLPsych rows whose condition is
# neither 'depression' nor 'ptsd', and eRisk rows whose label equals 0.
writings_clpsych = writings_clpsych.loc[
    ~writings_clpsych['condition'].isin(['depression', 'ptsd'])
]
writings_erisk_selfharm = writings_erisk_selfharm.loc[writings_erisk_selfharm['label'].eq(0)]
writings_erisk_anorexia = writings_erisk_anorexia.loc[writings_erisk_anorexia['label'].eq(0)]
writings_erisk_depression = writings_erisk_depression.loc[writings_erisk_depression['label'].eq(0)]

In [None]:
def _join_text_and_title(writings):
    """Concatenate `text` + `title` row by row and join the rows with newlines.

    Rows where either column is missing are dropped *together*. Dropping NaNs
    from each column independently would misalign the two arrays, or make
    their lengths differ so that the element-wise `+` fails.
    """
    pairs = writings[['text', 'title']].dropna()
    return "\n".join(pairs['text'].values + pairs['title'].values)


# CLPsych rows only contribute their text.
texts_clpsych = "\n".join(writings_clpsych.text.dropna().values)
# The self-harm frame keeps its original stricter filter: rows with a missing
# value in any column are removed before joining.
texts_erisk_selfharm = _join_text_and_title(writings_erisk_selfharm.dropna())
texts_erisk_anorexia = _join_text_and_title(writings_erisk_anorexia)
texts_erisk_depression = _join_text_and_title(writings_erisk_depression)

In [None]:
tt = TweetTokenizer()
sw = stopwords.words("english")
def tokenize_tweets(t, tokenizer=tt, stop=False):
    """Lower-case and tokenize `t`, keeping only purely alphabetic tokens.

    Parameters
    ----------
    t : str
        Text to tokenize.
    tokenizer : object with a ``tokenize(str)`` method
        Defaults to the module-level ``TweetTokenizer`` instance.
    stop : bool
        When True, stop words are KEPT in the output; when False (the
        default) tokens found in the English stop-word list are removed.

    Returns
    -------
    list of str
        Tokens matching ``^[a-z]*$``, in their original order.
    """
    tokens = tokenizer.tokenize(t.lower())
    tokens_clean = [token for token in tokens if 
                            re.match("^[a-z]*$", token)]
    if not stop:
        # Membership tests on a set are O(1); scanning the stop-word list for
        # every token is needlessly slow on large corpora.
        stopword_set = set(sw)
        tokens_clean = [token for token in tokens_clean 
                        if token not in stopword_set]
    return tokens_clean

In [None]:
# Tokenize each corpus with stop=True, i.e. stop words are kept.
(clpsych_texts_tokenized,
 erisk_depression_texts_tokenized,
 erisk_anorexia_texts_tokenized,
 erisk_selfharm_texts_tokenized) = (
    tokenize_tweets(corpus_text, stop=True)
    for corpus_text in (texts_clpsych, texts_erisk_depression,
                        texts_erisk_anorexia, texts_erisk_selfharm)
)

In [None]:
# all_texts_tokenized_clean = [token.lower() for token in all_texts_tokenized if (token.lower() not in sw)
#                             and re.match("^[a-z]*$", token.lower())]

In [None]:
# oov = [token for token in all_texts_tokenized_clean if token not in pre_glove.keys()]

In [None]:
def get_rareoov(xdict, val):
    """Return the distinct items of `xdict` counted at most `val` times.

    `xdict` is passed to ``collections.Counter``, so it may be any iterable of
    hashable items or a mapping of item -> count. The result follows the
    Counter's iteration order.
    """
    rare_items = []
    for item, occurrences in Counter(xdict).items():
        if occurrences <= val:
            rare_items.append(item)
    return rare_items
# NOTE(review): `oov` is not assigned by any active cell visible here; its only
# definition is in the commented-out cell above — confirm it exists before running.
# Items of `oov` that occur at most once.
oov_rare = get_rareoov(oov, 1)
# corp_vocab = list(pre_glove.keys()) + 
# Distinct `oov` items that occur more than once.
corp_vocab = list(set(oov) - set(oov_rare))

In [None]:
# (token, count) pairs for the CLPsych corpus, most frequent first.
clpsych_vocab_negative = Counter(clpsych_texts_tokenized).most_common()
# Show only the head of the ranking instead of dumping the whole vocabulary.
clpsych_vocab_negative[:50]

In [None]:
# (token, count) pairs for the eRisk depression corpus, most frequent first.
erisk_depression_vocab_negative = Counter(erisk_depression_texts_tokenized).most_common()
# Show only the head of the ranking instead of dumping the whole vocabulary.
erisk_depression_vocab_negative[:50]

In [None]:
# (token, count) pairs for the eRisk anorexia corpus, most frequent first.
erisk_anorexia_vocab_negative = Counter(erisk_anorexia_texts_tokenized).most_common()
# Show only the head of the ranking instead of dumping the whole vocabulary.
erisk_anorexia_vocab_negative[:50]

In [None]:
# (token, count) pairs for the eRisk self-harm corpus, most frequent first.
erisk_selfharm_vocab_negative = Counter(erisk_selfharm_texts_tokenized).most_common()
# Show only the head of the ranking instead of dumping the whole vocabulary.
erisk_selfharm_vocab_negative[:50]

In [None]:
# Report the number of tokens in each tokenized corpus.
print("Texts size: \nclpsych %d, \nerisk depression %d, \nerisk anorexia %d, \nerisk selfharm %d\n" % tuple(map(len, (
    clpsych_texts_tokenized,
    erisk_depression_texts_tokenized,
    erisk_anorexia_texts_tokenized,
    erisk_selfharm_texts_tokenized,
))))

In [None]:
# Report the number of distinct tokens in each corpus.
print("Vocab size: \nclpsych %d, \nerisk depression %d, \nerisk anorexia %d, \nerisk selfharm %d\n" % tuple(map(len, (
    clpsych_vocab_negative,
    erisk_depression_vocab_negative,
    erisk_anorexia_vocab_negative,
    erisk_selfharm_vocab_negative,
))))

In [None]:
# Join the corpora with a newline between them. Plain `+` glues the last post
# of one corpus to the first post of the next, merging the boundary words.
all_texts = "\n".join([texts_clpsych, texts_erisk_depression,
                       texts_erisk_anorexia, texts_erisk_selfharm])

In [None]:
# Token frequencies over the four tokenized corpora combined.
all_vocab = Counter()
all_vocab.update(clpsych_texts_tokenized)
all_vocab.update(erisk_depression_texts_tokenized)
all_vocab.update(erisk_anorexia_texts_tokenized)
all_vocab.update(erisk_selfharm_texts_tokenized)
len(all_vocab)

In [None]:
# Persist the combined raw text; the `with` block closes (and flushes) the file.
with open("all_texts_clpsych_erisk_negative.pkl", "wb+") as out_file:
    pickle.dump(all_texts, out_file)

In [None]:
# Persist the self-harm text; the `with` block closes (and flushes) the file.
with open("texts_erisk_selfharm.pkl", "wb+") as out_file:
    pickle.dump(texts_erisk_selfharm, out_file)

In [None]:
# Persist the combined token counts; the `with` block closes (and flushes) the file.
# NOTE(review): the filename spells "clpsyck"; left unchanged in case other
# code reads this exact path.
with open("all_vocab_clpsyck_erisk_negative_stop.pkl", "wb+") as out_file:
    pickle.dump(all_vocab, out_file)

In [None]:
# The 20000 most frequent (token, count) pairs over all four corpora.
all_vocab_20000 = Counter(clpsych_texts_tokenized + erisk_depression_texts_tokenized + 
                    erisk_anorexia_texts_tokenized + erisk_selfharm_texts_tokenized).most_common(20000)
# Counter keys are already unique, so no set() is needed. Going through a set
# also made the saved order depend on string hashing, i.e. differ between
# runs; the list is now stored in frequency order.
with open("all_vocab_clpsych_erisk_negative_stop_20000.pkl", "wb+") as out_file:
    pickle.dump([k for k, v in all_vocab_20000], out_file)

In [None]:
# Tokens of the top-20000 ranking. The previous name `all_vocab_10000` is not
# defined by any visible cell; `all_vocab_20000` is the ranking built above.
[k for k,v in all_vocab_20000]

In [None]:
# Inspect the CLPsych (token, count) pairs ranked 31st to 100th by frequency.
clpsych_vocab_negative[30:100]

# Fine-tuning with Mittens

In [None]:
from mittens import Mittens

In [None]:
# Load the saved GloVe embeddings; the `with` block closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open("original_glove_clpsych_erisk_stop_20000.pkl", "rb") as embeddings_file:
    glove_embeddings = pickle.load(embeddings_file)

In [None]:
# Both fine-tuned sets are loaded: later cells read `mittens_embeddings` as
# well as `mittens_embeddings2`, so leaving the first load commented out makes
# them fail with a NameError on a fresh kernel.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open("finetuned_glove_clpsych_erisk_20000.pkl", "rb") as embeddings_file:
    mittens_embeddings = pickle.load(embeddings_file)
with open("finetuned_glove_clpsych_erisk_stop_20000_2.pkl", "rb") as embeddings_file:
    mittens_embeddings2 = pickle.load(embeddings_file)

In [None]:
# Show the keys of `mittens_embeddings` (the name must already exist in the kernel).
mittens_embeddings.keys()

In [None]:
def embeddings_to_csv(embeddings_dict, outfile_path="mittens_embeddings.tsv",
                      metadata_path="mittens_embeddings_meta.tsv", separator="\t"):
    """Write an embedding dictionary to delimited text file(s).

    Parameters
    ----------
    embeddings_dict : mapping of str -> iterable of numbers
        One entry per key; each value is written as one row.
    outfile_path : str
        File receiving one row of `separator`-joined vector components per key.
    metadata_path : str or None
        If truthy, the keys are written one per line to this file and the rows
        in `outfile_path` contain only the vector. If falsy, each row in
        `outfile_path` starts with the key, followed by the vector.
    separator : str
        Field delimiter, tab by default.

    Both files are written as UTF-8 so that non-ASCII keys do not depend on
    the platform's default encoding.
    """
    words = []
    with open(outfile_path, "w+", encoding="utf-8") as f:
        for key, vector in embeddings_dict.items():
            fields = [str(n) for n in vector]
            if metadata_path:
                # The key goes to the metadata file, in the same row order.
                words.append(key)
            else:
                fields.insert(0, key)
            f.write(separator.join(fields) + "\n")
    if metadata_path:
        with open(metadata_path, "w+", encoding="utf-8") as f:
            f.writelines(word + "\n" for word in words)
        

In [None]:
# Export the GloVe vectors and their keys as a pair of TSV files.
embeddings_to_csv(
    glove_embeddings,
    outfile_path="glove_clpsych_erisk_stop_embeddings_20000.tsv",
    metadata_path="glove_clpsych_erisk_stop_meta_20000.tsv",
)

In [None]:
# Export the second fine-tuned vector set and its keys as a pair of TSV files.
embeddings_to_csv(
    mittens_embeddings2,
    outfile_path="mittens_clpsych_erisk_stop_embeddings_20000.tsv",
    metadata_path="mittens_clpsych_erisk_stop_meta_20000.tsv",
)

In [None]:
# Load a previously saved vocabulary; the `with` block closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open("vocab_clpsych_10000.pkl", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

In [None]:
# Display the loaded vocabulary in sorted order.
sorted(vocab)

## Analyzing the embedding space

In [None]:
def get_embeddings_matrix(embeddings_dict):
    """Stack the values of `embeddings_dict` into one array, one row per entry.

    Rows follow the dictionary's iteration order.
    """
    vectors = [*embeddings_dict.values()]
    return np.array(vectors)
def get_embeddings_dict(embeddings_matrix, embeddings_keys):
    """Pair each key with the matrix row at the same position.

    The i-th key of `embeddings_keys` is mapped to `embeddings_matrix[i]`.
    """
    return {
        key: embeddings_matrix[row_index]
        for row_index, key in enumerate(embeddings_keys)
    }

In [None]:
# Round-trip the GloVe dict through the matrix/dict helpers and display the result.
get_embeddings_dict(get_embeddings_matrix(glove_embeddings), glove_embeddings.keys())

In [None]:
# Display the loaded GloVe embeddings object.
glove_embeddings

In [None]:
def normalize_embeddings(embedding_matrix):
    """Standardize every column of `embedding_matrix` to zero mean, unit std.

    Parameters
    ----------
    embedding_matrix : array-like
        Rows are entries, columns are embedding dimensions.

    Returns
    -------
    numpy.ndarray
        ``(embedding_matrix - column_mean) / column_std``. A column with zero
        standard deviation is only centred, so it becomes all zeros instead of
        NaN from a division by zero.
    """
    emb_mean = np.mean(embedding_matrix, axis=0)
    emb_std = np.std(embedding_matrix, axis=0)
    # Dividing by 1 leaves constant columns at exactly 0 after centring.
    safe_std = np.where(emb_std == 0, 1.0, emb_std)
    return (embedding_matrix - emb_mean) / safe_std

In [None]:
# Histograms of the per-row mean of the GloVe vectors: raw, then column-normalized.
pd.Series(get_embeddings_matrix(glove_embeddings).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(
    normalize_embeddings(get_embeddings_matrix(glove_embeddings)).mean(axis=1)
).hist(alpha=0.5, bins=50)

In [None]:
# Histograms of the per-row mean of the first fine-tuned set: raw, then column-normalized.
pd.Series(get_embeddings_matrix(mittens_embeddings).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(
    normalize_embeddings(get_embeddings_matrix(mittens_embeddings)).mean(axis=1)
).hist(alpha=0.5, bins=50)

In [None]:
# Histograms of the per-row mean of the second fine-tuned set: raw, then column-normalized.
pd.Series(get_embeddings_matrix(mittens_embeddings2).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(
    normalize_embeddings(get_embeddings_matrix(mittens_embeddings2)).mean(axis=1)
).hist(alpha=0.5, bins=50)

In [None]:
# Column-normalize the GloVe vectors and re-attach them to their original keys.
glove_embeddings_normalized = get_embeddings_dict(
    embeddings_matrix=normalize_embeddings(get_embeddings_matrix(glove_embeddings)),
    embeddings_keys=glove_embeddings.keys(),
)
glove_embeddings_normalized

In [None]:
# Column-normalize the first fine-tuned set and re-attach it to its original keys.
mittens_embeddings_normalized = get_embeddings_dict(
    embeddings_matrix=normalize_embeddings(get_embeddings_matrix(mittens_embeddings)),
    embeddings_keys=mittens_embeddings.keys(),
)
mittens_embeddings_normalized

In [None]:
# Column-normalize the second fine-tuned set. The matrix must come from
# `mittens_embeddings2` itself: building it from `mittens_embeddings` attached
# the first set's vectors to the second set's keys.
mittens_embeddings2_normalized = get_embeddings_dict(normalize_embeddings(
    get_embeddings_matrix(mittens_embeddings2)), mittens_embeddings2.keys())
mittens_embeddings2_normalized

In [None]:
# Per-row mean of the first fine-tuned set, before and after normalization.
pd.Series(get_embeddings_matrix(mittens_embeddings).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(
    get_embeddings_matrix(mittens_embeddings_normalized).mean(axis=1)
).hist(alpha=0.5, bins=50)


In [None]:
# Per-row mean of the GloVe vectors, before and after normalization.
pd.Series(get_embeddings_matrix(glove_embeddings).mean(axis=1)).hist(alpha=0.5, bins=50)
pd.Series(
    get_embeddings_matrix(glove_embeddings_normalized).mean(axis=1)
).hist(alpha=0.5, bins=50)

In [None]:
# Persist the three normalized embedding dicts; each `with` block closes (and
# flushes) its file instead of leaving the handle open.
with open("original_glove_clpsych_erisk_normalized_20000.pkl", "wb+") as out_file:
    pickle.dump(glove_embeddings_normalized, out_file)
with open("finetuned_glove_clpsych_erisk_normalized_20000.pkl", "wb+") as out_file:
    pickle.dump(mittens_embeddings_normalized, out_file)
with open("finetuned_glove_clpsych_erisk_normalized_2_20000.pkl", "wb+") as out_file:
    pickle.dump(mittens_embeddings2_normalized, out_file)

In [None]:
# Export each normalized embedding set as a vectors file plus a keys (metadata) file.
embeddings_to_csv(glove_embeddings_normalized, "glove_clpsych_erisk_normalized_embeddings_20000.tsv", 
                  "glove_clpsych_erisk_normalized_meta_20000.tsv")
embeddings_to_csv(mittens_embeddings_normalized, "mittens_clpsych_erisk_normalized_embeddings_20000.tsv", 
                  "mittens_clpsych_erisk_normalized_meta_20000.tsv")
# The metadata name previously contained a stray "2]" ("mittens2]_..."); it now
# mirrors the name of the vectors file it belongs to.
embeddings_to_csv(mittens_embeddings2_normalized, "mittens_clpsych_erisk2_normalized_embeddings_20000.tsv", 
                  "mittens_clpsych_erisk2_normalized_meta_20000.tsv")


In [None]:
['i' in mittens_embeddings2.keys()