In [None]:
import numpy as np 
import pandas as pd

In [None]:
import gensim

# Location of the pre-trained GoogleNews Word2Vec vectors (binary word2vec format).
word2vec_path = 'GoogleNews-vectors-negative300.bin'

# Load Google's pre-trained Word2Vec model as gensim KeyedVectors.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
import csv
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources used below: WordNet (for the lemmatizer) and the
# stopword lists.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

# Module-level preprocessing helpers shared by parse_reddit_csv.
lemmatizer = WordNetLemmatizer()  # quick check: lemmatizer.lemmatize("cats")
stop_words = set(stopwords.words('english'))
# Tokens are runs of word characters, so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')

def _preprocess_text(text):
    """Turn one raw text field into a list of cleaned, lemmatized tokens.

    Steps: strip digits, tokenize on runs of word characters (which drops
    punctuation), remove English stopwords, then lemmatize every remaining
    token first as a noun and then as a verb (cats->cat, liked->like).
    """
    # csv.DictReader yields None for fields missing from a short row.
    text = re.sub(r'\d+', '', text or '')
    # Compare in lowercase so capitalized stopwords ("The", "I") are removed
    # too; the token itself keeps its original case.
    tokens = [w for w in tokenizer.tokenize(text) if w.lower() not in stop_words]
    return [lemmatizer.lemmatize(lemmatizer.lemmatize(w, 'n'), 'v') for w in tokens]


def parse_reddit_csv(filename, encoding='utf-8'):
    """Read a CSV of Reddit posts and tokenize each post's text and title.

    The CSV must have a header row containing the columns 'author',
    'selftext' and 'title'.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A tuple ``(csv_cols, frequencies)``:
        csv_cols is a list with one dict per row, holding 'author' (the raw
        value), 'selftext' (token list) and 'title' (token list).
        frequencies is a dict mapping each token to the number of times it
        occurs across all selftext and title token lists; the corpus total
        is ``sum(frequencies.values())``.

    Raises:
        KeyError: If a required column is absent from the header.
    """
    print("Reading from", filename)
    csv_cols = []
    frequencies = {}
    # newline='' lets the csv module handle newlines embedded in quoted fields.
    with open(filename, newline='', encoding=encoding) as csvfile:
        for row in csv.DictReader(csvfile):
            post_tokens = _preprocess_text(row['selftext'])
            title_tokens = _preprocess_text(row['title'])
            csv_cols.append({'author': row['author'],
                             'selftext': post_tokens,
                             'title': title_tokens})
            # Accumulate corpus-wide word counts over both text fields.
            for w in post_tokens + title_tokens:
                frequencies[w] = frequencies.get(w, 0) + 1
    return csv_cols, frequencies

In [None]:
# Parse the sample CSV; `parsed` is the (csv_cols, frequencies) tuple
# returned by parse_reddit_csv.
sample_csv_path = 'data/final_proj_data_preprocessed_1000sample.csv'
parsed = parse_reddit_csv(sample_csv_path)

In [None]:
from collections import Counter

# Compute word2vec post embeddings (using both selftext and title)
# TODO: implement the two sentence embeddings described below.
# The first (hereafter called W2VWeighted) is calculated by weighting the
# contribution of each word embedding to the final sentence embedding by the
# inverse of the word's relative frequency.
# In doing so, the contributions of the most common words are minimized.
# The second (hereafter called W2V-SIF) is calculated by first taking the
# weighted sentence embedding and then removing the first principal component
# from it.
# Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017.
# A simple but tough-to-beat baseline for sentence embeddings. In ICLR.

# NOTE(review): only the selftext tokens of the first parsed post are counted
# here, not the whole corpus or the title -- confirm this is intended.
first_post_tokens = parsed[0][0]['selftext']
counts = Counter(first_post_tokens).items()
# Column 0 holds the word and column 1 its count. Naming the columns
# explicitly keeps both present when the post has no tokens; without it the
# frame has no columns and freq[1] below raises KeyError.
freq = pd.DataFrame(list(counts), columns=[0, 1])
print(freq)
# Inverse relative frequency: total token count divided by each word's count.
freq['inv_rfreq'] = freq[1].sum() / freq[1]
print(freq)

In [None]:
# Look up the word2vec vector of every word in `freq`.
# `emb` maps word -> embedding vector; words missing from the model's
# vocabulary raise KeyError on lookup and are collected in `oov_words`.
emb = {}
oov_words = []
for w in freq[0].to_list():
    try:
        emb[w] = model[w]
    except KeyError:
        oov_words.append(w)

In [None]:
# Display column 0 of `freq`: the words (keys of the Counter the frame was built from).
freq[0]

In [None]:
# Compute LDA post embeddings (using both selftext and title)
# TODO: implement the procedure described below.
# A Bag of Words (BoW) corpus is obtained first, and a term frequency-inverse
# document frequency (TF-IDF) corpus is derived from it. Topic modeling is
# then performed on both the BoW corpus (hereafter LDA-BoW) and the
# TF-IDF corpus (hereafter LDA-TFIDF) with the number of topics set to 30,
# in line with the number of clusters used. The document-topic mapping of
# each post is then used for computing cosine similarities with all other posts.
# Note: gensim could be used for this (still to be decided).