In [None]:
import csv
import json
import os
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd
from mittens import Mittens
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

# Loading GloVe Model

In [None]:
# Dimensionality of the pre-trained GloVe vectors; it selects which file is read.
embeddings_dim = 100
# Directory holding the GloVe Twitter files. Set the GLOVE_DIR environment
# variable to override the machine-specific default below.
glove_dir = os.environ.get('GLOVE_DIR', '/home/anasab/resources/glove.twitter.27B')
glove_filepath = os.path.join(glove_dir, 'glove.twitter.27B.%dd.txt' % embeddings_dim)


In [None]:
def glove2dict(glove_filename):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every row is split on single spaces with quote handling disabled. The
    first field becomes the key; the remaining fields are converted to floats
    and stored as a NumPy array. If a word appears twice, the later row wins.
    """
    vectors = {}
    with open(glove_filename, encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in rows:
            word, values = row[0], row[1:]
            vectors[word] = np.array([float(value) for value in values])
    return vectors

# Load the pre-trained GloVe vectors from glove_filepath into a word -> vector dict.
pre_glove = glove2dict(glove_filepath)

# Loading & preprocessing text data

In [None]:
# Load the four corpora (presumably DataFrames with a `text` column; the eRisk
# ones are also read through a `title` column further down).
# Context managers close every file handle as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you trust.
with open('writings_df_clpsych_all.json') as f:
    writings_clpsych = pd.DataFrame.from_dict(json.load(f))
with open('writings_df_selfharm_liwc_subsets', 'rb') as f:
    writings_erisk_selfharm = pickle.load(f)
with open('writings_df_anorexia_liwc', 'rb') as f:
    writings_erisk_anorexia = pickle.load(f)
with open('writings_df_depression_liwc', 'rb') as f:
    writings_erisk_depression = pickle.load(f)

In [None]:
# writings_clpsych.text = writings_clpsych.text.dropna()
# writings_erisk_depression.text = writings_erisk_depression.text.dropna()
# writings_erisk_anorexia.text = writings_erisk_anorexia.text.dropna()
# writings_erisk_selfharm.text = writings_erisk_selfharm.text.dropna()

In [None]:
def join_writings(writings, columns=("text", "title")):
    """Merge the given columns of every row and join the rows with newlines.

    Missing values count as empty strings, the columns of one row are joined
    with a single space, and rows that end up empty are skipped.
    """
    merged = None
    for column in columns:
        values = writings[column].fillna("").astype(str)
        merged = values if merged is None else merged + " " + values
    merged = merged.str.strip()
    return "\n".join(merged[merged != ""])

texts_clpsych = "\n".join(writings_clpsych.text.dropna().values)
# Text and title are combined row by row. Dropping NaNs from each column
# separately and then adding the arrays (as before) either fails on a length
# mismatch or pairs a text with the wrong title; dropping NaNs on the whole
# frame discards posts for a missing value in any unrelated column.
texts_erisk_selfharm = join_writings(writings_erisk_selfharm)
texts_erisk_anorexia = join_writings(writings_erisk_anorexia)
texts_erisk_depression = join_writings(writings_erisk_depression)

In [None]:
tt = TweetTokenizer()
# A set gives constant-time stop-word checks; with a list every token of the
# corpus triggered a linear scan over all stop words.
sw = set(stopwords.words("english"))
# Matches tokens made up solely of the ASCII letters a-z.
_alpha_token = re.compile("^[a-z]*$")

def tokenize_tweets(t, tokenizer=tt):
    """Lower-case and tokenize ``t``, keeping only alphabetic non-stop-word tokens.

    Parameters:
        t: the text to tokenize (a single string).
        tokenizer: object with a ``tokenize(str)`` method; defaults to the
            module-level TweetTokenizer.

    Returns:
        A list of tokens that are not in ``sw`` and consist only of a-z.
    """
    tokens = tokenizer.tokenize(t.lower())
    tokens_clean = [token for token in tokens
                    if token not in sw and _alpha_token.match(token)]
    return tokens_clean

In [None]:
# Tokenize each corpus string as a whole; every result is a flat list of the
# tokens kept by tokenize_tweets.
clpsych_texts_tokenized = tokenize_tweets(texts_clpsych)
erisk_depression_texts_tokenized = tokenize_tweets(texts_erisk_depression)
erisk_anorexia_texts_tokenized = tokenize_tweets(texts_erisk_anorexia)
erisk_selfharm_texts_tokenized = tokenize_tweets(texts_erisk_selfharm)

In [None]:
# all_texts_tokenized_clean = [token.lower() for token in all_texts_tokenized if (token.lower() not in sw)
#                             and re.match("^[a-z]*$", token.lower())]

In [None]:
# oov = [token for token in all_texts_tokenized_clean if token not in pre_glove.keys()]

In [None]:
def get_rareoov(xdict, val):
    """Return the distinct items of ``xdict`` counted at most ``val`` times.

    ``xdict`` is handed to ``Counter``, so it may be an iterable of items
    (e.g. a token list) or a mapping of item to count. Items come back in the
    order ``Counter`` first recorded them.
    """
    counts = Counter(xdict)
    rare = []
    for item, count in counts.items():
        if count <= val:
            rare.append(item)
    return rare
# Tokens from all four corpora that have no pre-trained GloVe vector.
# `oov` used to be defined only in a commented-out cell, so this cell raised
# NameError on a fresh kernel.
oov = [token
       for token in (clpsych_texts_tokenized + erisk_depression_texts_tokenized +
                     erisk_anorexia_texts_tokenized + erisk_selfharm_texts_tokenized)
       if token not in pre_glove]
oov_rare = get_rareoov(oov, 1)
# corp_vocab = list(pre_glove.keys()) + 
# Vocabulary to train on: OOV tokens seen more than once. Sorting makes the
# word order (and so the row order of anything built from it) reproducible.
corp_vocab = sorted(set(oov) - set(oov_rare))

In [None]:
# (token, count) pairs for the CLPsych corpus, most frequent first.
clpsych_vocab = Counter(clpsych_texts_tokenized).most_common()
# Display only the head; the full list has one entry per distinct token.
clpsych_vocab[:20]

In [None]:
# (token, count) pairs for the eRisk depression corpus, most frequent first.
erisk_depression_vocab = Counter(erisk_depression_texts_tokenized).most_common()
# Display only the head; the full list has one entry per distinct token.
erisk_depression_vocab[:20]

In [None]:
# (token, count) pairs for the eRisk anorexia corpus, most frequent first.
erisk_anorexia_vocab = Counter(erisk_anorexia_texts_tokenized).most_common()
# Display only the head; the full list has one entry per distinct token.
erisk_anorexia_vocab[:20]

In [None]:
# (token, count) pairs for the eRisk self-harm corpus, most frequent first.
erisk_selfharm_vocab = Counter(erisk_selfharm_texts_tokenized).most_common()
# Display only the head; the full list has one entry per distinct token.
erisk_selfharm_vocab[:20]

In [None]:
# Number of kept tokens in each corpus, in the order used by the message below.
text_sizes = tuple(len(tokens) for tokens in (
    clpsych_texts_tokenized, erisk_depression_texts_tokenized,
    erisk_anorexia_texts_tokenized, erisk_selfharm_texts_tokenized))
print("Texts size: \nclpsych %d, \nerisk depression %d, \nerisk anorexia %d, \nerisk selfharm %d\n" % text_sizes)

In [None]:
# Number of distinct tokens in each corpus, in the order used by the message below.
vocab_sizes = tuple(len(vocab_counts) for vocab_counts in (
    clpsych_vocab, erisk_depression_vocab, erisk_anorexia_vocab, erisk_selfharm_vocab))
print("Vocab size: \nclpsych %d, \nerisk depression %d, \nerisk anorexia %d, \nerisk selfharm %d\n" % vocab_sizes)

In [None]:
# Join the corpora with the same newline separator used inside each corpus.
# Plain `+` glued the last line of one corpus onto the first line of the next.
all_texts = "\n".join([texts_clpsych, texts_erisk_depression,
                       texts_erisk_anorexia, texts_erisk_selfharm])

In [None]:
# Token frequencies over all four corpora combined.
all_vocab = Counter()
for corpus_tokens in (clpsych_texts_tokenized, erisk_depression_texts_tokenized,
                      erisk_anorexia_texts_tokenized, erisk_selfharm_texts_tokenized):
    all_vocab.update(corpus_tokens)
len(all_vocab)

In [None]:
# Persist the combined corpus string; the context manager flushes and closes the file.
with open("all_texts_clpsych_erisk.pkl", "wb+") as f:
    pickle.dump(all_texts, f)

In [None]:
# Persist the self-harm corpus string; the context manager flushes and closes the file.
with open("texts_erisk_selfharm.pkl", "wb+") as f:
    pickle.dump(texts_erisk_selfharm, f)

In [None]:
# Persist the combined token counts; the context manager flushes and closes the file.
# NOTE(review): the file name spells "clpsyck"; kept unchanged in case other
# code loads the file under this name.
with open("all_vocab_clpsyck_erisk.pkl", "wb+") as f:
    pickle.dump(all_vocab, f)

In [None]:
# The 10,000 most frequent tokens over all corpora, as (token, count) pairs.
all_vocab_10000 = Counter(clpsych_texts_tokenized + erisk_depression_texts_tokenized + 
                    erisk_anorexia_texts_tokenized + erisk_selfharm_texts_tokenized).most_common(10000)
# most_common() already yields each token once. Saving the tokens in frequency
# order, instead of routing them through set() (whose order changes between
# runs), keeps the pickled vocabulary reproducible.
with open("all_vocab_clpsych_erisk_10000.pkl", "wb+") as f:
    pickle.dump([k for k, v in all_vocab_10000], f)

In [None]:
# Peek at the most frequent tokens rather than printing all 10,000.
[k for k, v in all_vocab_10000[:20]]

# Fine-tuning with Mittens

In [None]:
# Count vocabulary words per document, one document per corpus line.
# Passing the whole corpus as a single document made X a 1 x V matrix, so
# X.T * X was just the outer product of word counts and held no
# co-occurrence information.
# NOTE(review): a line is assumed to be roughly one post; posts containing
# newlines are split into several documents.
documents = all_texts.split("\n")
cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
X = cv.fit_transform(documents)
# Document-level co-occurrence counts; `@` is a matrix product for both
# sparse matrices and sparse arrays.
Xc = (X.T @ X)
# A word's co-occurrence with itself is zeroed out.
Xc.setdiag(0)
# Dense V x V array, as passed to Mittens.fit later in the notebook.
coocc_ar = Xc.toarray()

In [None]:
# n is the size of the learned vectors. It has to equal the dimensionality of
# the vectors in pre_glove (embeddings_dim) so that warm-started rows fit and
# the new vectors are compatible with the pre-trained ones; it was hard-coded
# to 50 while 100-d GloVe vectors are loaded.
# NOTE(review): corp_vocab only holds words missing from pre_glove, so the
# initial embeddings may not seed any row -- confirm this is intended.
mittens_model = Mittens(n=embeddings_dim, max_iter=1000)
new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=corp_vocab,
    initial_embedding_dict= pre_glove)

In [None]:
Mittens?

In [None]:
from mittens import Mittens

In [None]:
# Load previously saved fine-tuned embeddings; the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you trust.
with open("finetuned_glove_clpsych.pkl", "rb") as f:
    mittens_embeddings = pickle.load(f)

In [None]:
# Inspect the keys of the loaded embeddings object.
mittens_embeddings.keys()

In [None]:
def embeddings_to_csv(embeddings_dict, outfile_path="mittens_embeddings.tsv",
                      metadata_path="mittens_embeddings_meta.tsv", separator="\t"):
    """Write embeddings to a delimited text file, one vector per line.

    Parameters:
        embeddings_dict: mapping of word (str) to an iterable of numbers.
        outfile_path: file that receives the vector components.
        metadata_path: file that receives the words, one per line and in the
            same order as the vectors. If falsy, no metadata file is written
            and each word is instead placed first on its vector line.
        separator: string placed between the fields of a vector line.

    Both files are overwritten if they already exist.
    """
    words = []
    # UTF-8 is set explicitly so non-ASCII words are written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(outfile_path, "w", encoding="utf-8") as vectors_file:
        for key, vector in embeddings_dict.items():
            fields = [str(n) for n in vector]
            if metadata_path:
                words.append(key)
            else:
                fields.insert(0, key)
            vectors_file.write(separator.join(fields) + "\n")
    if metadata_path:
        with open(metadata_path, "w", encoding="utf-8") as metadata_file:
            metadata_file.writelines(word + "\n" for word in words)
        

In [None]:
# Export the loaded embeddings: vectors to one TSV file, the matching words to another.
embeddings_to_csv(mittens_embeddings, "mittens_clpsych_embeddings.tsv", "mittens_clpsych_meta.tsv")

In [None]:
# Load a previously saved vocabulary; the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you trust.
with open("vocab_clpsych_10000.pkl", "rb") as f:
    vocab = pickle.load(f)

In [None]:
# Display the loaded vocabulary in sorted order.
sorted(vocab)

In [None]:
Mittens?