In [1]:
import gensim
import numpy as np 
import pandas as pd

In [2]:
from collections import Counter

In [3]:
from nltk.tokenize import RegexpTokenizer

In [4]:
import pickle 

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)  

## Parsing raw reddit posts

In [6]:
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# print(lemmatizer.lemmatize("cats"))

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

def parse_reddit_csv(filename):
    print("Reading from", filename)
    csv_cols = []
    frequencies = {}
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Remove numbers, punctuation
            row['selftext'] = re.sub(r'\d+', '', row['selftext'])
            row['title'] = re.sub(r'\d+', '', row['title'])
            # Tokenize the post text (selftext) and post title
            post_tokens = tokenizer.tokenize(row['selftext'])
            title_tokens = tokenizer.tokenize(row['title'])
            # Filter out stopwords
            post_tokens = [w for w in post_tokens if not w in stop_words]
            title_tokens = [w for w in title_tokens if not w in stop_words]
            # Lemmatize the post text (reduce words to word stems i.e. cats->cat, liked->like)
            post_tokens = [lemmatizer.lemmatize(w, 'n') for w in post_tokens]
            post_tokens = [lemmatizer.lemmatize(w, 'v') for w in post_tokens]
            title_tokens = [lemmatizer.lemmatize(w, 'n') for w in title_tokens]
            title_tokens = [lemmatizer.lemmatize(w, 'v') for w in title_tokens]
            csv_cols.append({'author': row['author'],
                             'selftext': post_tokens,
                             'title': title_tokens,
                             'post_id': row['id']})
    return csv_cols, frequencies

[nltk_data] Downloading package wordnet to /home/cephcyn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cephcyn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
parsed, corpus_freq = parse_reddit_csv('data/final_proj_data_preprocessed_1000sample.csv')

Reading from data/final_proj_data_preprocessed_1000sample.csv


## Calculate post embeddings (Word2Vec)
(using selftext only)

The first (thereafter called W2VWeighted) is calculated by weighing the contribution of each word embedding by the inverse of its relative frequency to the final sentence embedding.

In doing so, the contributions of the most common words are minimized.

The second (thereafter called W2V-SIF) is calculated by first taking the weighed sentence embedding before removing the first principal component from it.

Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017.\
A simple but tough-to-beat baseline for sentence embeddings. In ICLR.

In [None]:
sen_emb = {}
for i in range(len(parsed[0])):
    counts = Counter(parsed[0][i]['selftext']).items()
    freq = pd.DataFrame(counts)
    freq = freq.rename(columns={0: "word", 1: 'freq'})
    # Weight by inverse relative frequency
    freq['inv_rfreq'] = freq['freq'].sum()/freq['freq']
    unknowns = []
    emb_dict = {}
    for w in freq['word'].to_list():
        try:
            emb = model[w]
            emb_dict.update({w:emb})
        except:
            unknowns.append(w)
    emb_value = pd.DataFrame(emb_dict).transpose().reset_index()
    emb_value = emb_value.rename(columns={'index': "word"})
    emb_value_list = emb_value.iloc[:, 1:301].mul(freq['inv_rfreq'], axis = 0).sum().to_list()
    sen_emb.update({parsed[0][i]['post_id']:emb_value_list})       

In [None]:
with open('sample1000_emb.pickle', 'wb') as handle:
    pickle.dump(sen_emb, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Calculate post topics (LDA)
(using both selftext and title)

A Bag of Words (BoW) corpus was obtained before a term frequency-inverse document frequency (TF-IDF) corpus was derived from it.

Topic modeling was then performed on both the BoW corpus (thereafter LDA-BoW) and
TF-IDF corpus (thereafter LDA-TFIDF) with the number of topics set to 30, in line with the number of clusters used.

The document-topic mapping of each post is then used for computing cosine similarities with all other posts

In [8]:
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary([parsed[i]['selftext'] for i in range(len(parsed))])
print(dictionary)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(parsed[i]['selftext']) for i in range(len(parsed))]
# for doc in corpus:
#     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

# TF-IDF (term freq, inverse document freq) representation
from gensim import models
tfidf = models.TfidfModel(corpus)
# for doc in tfidf[corpus]:
#     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

Dictionary(8311 unique tokens: ['Best', 'But', 'Hello', 'I', 'Unfortunately']...)


In [9]:
from gensim.models import LdaModel

def get_topics(corpus):
    # Train LDA model, get model & topic vectors
    # Set training parameters.
    num_topics = 30
    chunksize = 100
    passes = 20
    iterations = 400
    eval_every = 100  # None = Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    
    # Get basic evaluation
    top_topics = model.top_topics(corpus) #, num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    
    # Get topic vectors
    all_topics = model.get_document_topics(corpus, per_word_topics=True)
    all_topics = [(doc_topics, word_topics, word_phis) for doc_topics, word_topics, word_phis in all_topics]
    sen_top = {}
    for i in range(len(parsed)):
        # These are in the same order as the documents themselves.
        doc_topics, word_topics, phi_values = all_topics[i]
        sen_top.update({parsed[i]['post_id']:doc_topics})
    
    return model, sen_top

In [10]:
# Get bow data
print("Generating topics for BOW...")
model_bow, sen_top_bow = get_topics(corpus)

# Get tfidf data
print("Generating topics for TFIDF...")
model_tfidf, sen_top_tfidf = get_topics(tfidf[corpus])

Average topic coherence: -12.6919.
Average topic coherence: -12.2730.


In [11]:
print(sen_top_bow)

{'d74yjc': [(0, 0.10227645), (1, 0.14403439), (4, 0.6148539), (6, 0.0103208935), (8, 0.023121765), (19, 0.072179526), (20, 0.015908185)], 'btxa7s': [(0, 0.22447896), (4, 0.47272408), (6, 0.16768716), (8, 0.037385974), (19, 0.036154717), (20, 0.04584832)], 'ddjvhn': [(0, 0.20941664), (4, 0.5003727), (8, 0.020207144), (19, 0.19698721), (20, 0.03328536)], '8yh2sx': [(0, 0.13864264), (1, 0.010821545), (4, 0.6867952), (6, 0.039477337), (8, 0.011679684), (19, 0.07323903), (20, 0.029519396)], 'bj3esi': [(0, 0.26680657), (4, 0.47930372), (6, 0.017604247), (8, 0.035589293), (19, 0.15427804), (20, 0.02322877)], '83sgij': [(0, 0.11207843), (4, 0.6707311), (16, 0.018513435), (19, 0.042934626), (20, 0.12925589)], '8eu8qa': [(0, 0.28705812), (4, 0.60394424), (8, 0.014233643), (19, 0.043239135), (20, 0.023538968)], 'd4gwp1': [(0, 0.16433555), (1, 0.010984656), (4, 0.6196641), (6, 0.032022152), (8, 0.025854226), (19, 0.07461426), (20, 0.041383952)], 'dnu7ak': [(0, 0.46367756), (4, 0.39282247), (19, 0.

In [None]:
# Save bow data
with open('sample1000_top_bow_model.pickle', 'wb') as handle:
    pickle.dump(model_bow, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('sample1000_top_bow.pickle', 'wb') as handle:
    pickle.dump(sen_top_bow, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save tfidf data
with open('sample1000_top_tfidf_model.pickle', 'wb') as handle:
    pickle.dump(model_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('sample1000_top_tfidf.pickle', 'wb') as handle:
    pickle.dump(sen_top_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Calculate Pairwise Cosine Similarity 

In [None]:
sen_emb_arr = np.array(list(sen_emb.values()))

In [None]:
sim_mat = cosine_similarity(sen_emb_arr,sen_emb_arr)

In [None]:
sim_mat