In [2]:
import gensim
import numpy as np 
import pandas as pd
import pickle
import csv

## Parsing raw reddit posts

In [None]:
import csv
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Make sure the NLTK corpora needed for preprocessing are available locally.
nltk.download('wordnet')
nltk.download('stopwords')

# Shared preprocessing helpers used by the parsing code in this cell.
tokenizer = RegexpTokenizer(r'\w+')  # keeps runs of word characters, drops punctuation
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def _clean_tokens(text):
    """Turn one raw text field into a list of cleaned, lemmatized tokens."""
    # Strip digit runs; punctuation is discarded by the \w+ tokenizer below.
    # A short CSV row yields None for missing fields, so fall back to ''.
    text = re.sub(r'\d+', '', text or '')
    tokens = tokenizer.tokenize(text)
    # The stopword list is lowercase, so compare case-insensitively;
    # otherwise capitalised stopwords such as "I" or "The" slip through.
    tokens = [w for w in tokens if w.lower() not in stop_words]
    # Lemmatize as a noun first, then as a verb (cats -> cat, liked -> like).
    return [lemmatizer.lemmatize(lemmatizer.lemmatize(w, 'n'), 'v') for w in tokens]


def parse_reddit_csv(filename):
    """Read a CSV of reddit posts and tokenize each post's body and title.

    The CSV must have the columns 'author', 'selftext', 'title' and 'id'.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    list of dict
        One dict per CSV row with the keys 'author', 'selftext' (token list),
        'title' (token list) and 'post_id' (the row's 'id').
    """
    print("Reading from", filename)
    csv_cols = []
    # newline='' lets the csv module handle newlines embedded in quoted fields.
    with open(filename, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            csv_cols.append({'author': row['author'],
                             'selftext': _clean_tokens(row['selftext']),
                             'title': _clean_tokens(row['title']),
                             'post_id': row['id']})
    return csv_cols

In [None]:
# Preprocess the 1000-post sample, then cache the token lists on disk.
parsed = parse_reddit_csv('data/final_proj_data_preprocessed_1000sample.csv')

with open('sample1000_parse.pickle', 'wb') as out_file:
    pickle.dump(parsed, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Calculate post embeddings (Word2Vec)
(using selftext only)

Two sentence embeddings are considered. The first (hereafter called W2VWeighted) is calculated by weighting each word embedding's contribution to the final sentence embedding by the inverse of the word's relative frequency.

In doing so, the contributions of the most common words are minimized.

The second (hereafter called W2V-SIF) is calculated by first taking the weighted sentence embedding and then removing the first principal component from it.

Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017.\
A simple but tough-to-beat baseline for sentence embeddings. In ICLR.

In [None]:
# Load Google's pre-trained Word2Vec vectors from the local binary file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
from collections import Counter

# Build one weighted Word2Vec sentence embedding per post:
# the sum of each distinct word's vector times (total tokens / word count).
sen_emb = {}
unknowns = set()  # words that have no vector in the Word2Vec model
for post in parsed:
    counts = Counter(post['selftext'])
    total = sum(counts.values())
    # Start from zeros so that empty posts, or posts made only of unknown words,
    # still get a vector of the right length instead of crashing or being empty.
    post_vec = np.zeros(model.vector_size, dtype=np.float64)
    for word, count in counts.items():
        try:
            word_vec = model[word]
        except KeyError:
            unknowns.add(word)
            continue
        # The weight is looked up for this exact word, so out-of-vocabulary
        # words cannot shift the pairing between weights and vectors.
        post_vec += (total / count) * np.asarray(word_vec, dtype=np.float64)
    sen_emb[post['post_id']] = post_vec.tolist()

In [None]:
# Cache the post-id -> sentence-embedding mapping on disk.
with open('sample1000_embed_w2v.pickle', 'wb') as out_file:
    pickle.dump(sen_emb, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Calculate post topics (LDA)
(using both selftext and title)

A bag-of-words (BoW) corpus was built first, and a term frequency-inverse document frequency (TF-IDF) corpus was then derived from it.

Topic modeling was then performed on both the BoW corpus (hereafter LDA-BoW) and
the TF-IDF corpus (hereafter LDA-TFIDF) with the number of topics set to 30, in line with the number of clusters used.

The document-topic mapping of each post is then used to compute cosine similarities with all other posts.

In [None]:
from gensim.corpora import Dictionary
from gensim import models

# Token-to-id dictionary built from the body text of every post.
# NOTE(review): the section text says both selftext and title are used,
# but only 'selftext' is read here -- confirm which is intended.
dictionary = Dictionary([post['selftext'] for post in parsed])
print(dictionary)

# Bag-of-words corpus: one list of (token_id, count) pairs per post,
# in the same order as `parsed`.
corpus = [dictionary.doc2bow(post['selftext']) for post in parsed]

# TF-IDF model fitted on the BoW corpus; `tfidf[corpus]` gives the re-weighted corpus.
# To inspect either corpus, iterate it and map ids back through `dictionary[id]`.
tfidf = models.TfidfModel(corpus)

In [None]:
from gensim.models import LdaModel

def get_topics(corpus, num_topics=30, random_state=None):
    """Train an LDA model on `corpus` and build a topic vector for every post.

    Relies on the module-level `dictionary` (token-id mapping) and `parsed`
    (list of post dicts with a 'post_id' key). `corpus` must contain one
    document per entry of `parsed`, in the same order.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        For example the BoW corpus or its TF-IDF transformation.
    num_topics : int, optional
        Number of LDA topics, and the length of each topic vector. Defaults to 30.
    random_state : int or numpy.random.RandomState, optional
        Passed to `LdaModel` so that a run can be reproduced. With the default
        None, results differ between runs.

    Returns
    -------
    (model, sen_top)
        The trained `LdaModel`, and a dict mapping post id to a list of
        `num_topics` topic weights (0 for topics gensim did not report).

    Raises
    ------
    ValueError
        If `corpus` and `parsed` do not have the same number of documents.
    """
    # Training parameters.
    chunksize = 100
    passes = 20
    iterations = 400
    eval_every = 100  # set to None to skip perplexity evaluation (it is slow)

    # Accessing any item forces gensim to populate dictionary.id2token.
    _ = dictionary[0]
    id2word = dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
        random_state=random_state,
    )

    # Basic evaluation: each entry of top_topics is (topic, coherence).
    top_topics = model.top_topics(corpus)

    # Average topic coherence is the sum of all topic coherences divided by the number of topics.
    avg_topic_coherence = sum(coherence for _, coherence in top_topics) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    # Per-document topic distributions, in corpus order. Only the document-level
    # part of each (doc_topics, word_topics, word_phis) triple is needed.
    all_topics = model.get_document_topics(corpus, per_word_topics=True)
    doc_topic_lists = [doc_topics for doc_topics, _word_topics, _word_phis in all_topics]
    if len(doc_topic_lists) != len(parsed):
        raise ValueError(
            'corpus has %d documents but parsed has %d posts'
            % (len(doc_topic_lists), len(parsed))
        )

    sen_top = {}
    for post, doc_topics in zip(parsed, doc_topic_lists):
        # Expand the sparse (topic_id, weight) list into a dense vector.
        doc_topic_vector = [0] * num_topics
        for topic_id, weight in doc_topics:
            doc_topic_vector[topic_id] = weight
        sen_top[post['post_id']] = doc_topic_vector

    return model, sen_top

In [None]:
# Get bow data
# LDA on the raw bag-of-words counts.
print("Generating topics for BOW...")
model_bow, sen_top_bow = get_topics(corpus)

# LDA on the TF-IDF re-weighted corpus.
print("Generating topics for TFIDF...")
model_tfidf, sen_top_tfidf = get_topics(tfidf[corpus])

In [None]:
# Save bow data
# Persist each trained LDA model and its per-post topic vectors (BoW first, then TF-IDF).
for out_path, payload in (
    ('sample1000_model_top_bow.pickle', model_bow),
    ('sample1000_embed_top_bow.pickle', sen_top_bow),
    ('sample1000_model_top_tfidf.pickle', model_tfidf),
    ('sample1000_embed_top_tfidf.pickle', sen_top_tfidf),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Calculate Pairwise Cosine Similarity & Cluster

In [7]:
import pickle

# Load the per-post BoW topic vectors (post id -> topic vector).
# The name matches the file written by the "Save bow data" cell;
# 'sample1000_top_bow.pickle' is not produced anywhere in this notebook.
with open('sample1000_embed_top_bow.pickle', 'rb') as handle:
    sen_emb = pickle.load(handle)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# One row per post (indexed by post id), one column per vector component.
d = pd.DataFrame(sen_emb).T
# Cosine similarity between every pair of rows of `d`.
sim_mat = cosine_similarity(d)

In [9]:
# Map every post id to its row of the similarity matrix
# (row order and column order both follow d.index).
post = d.index.tolist()
post_emb = {post_id: sim_row for post_id, sim_row in zip(post, sim_mat)}

In [13]:
import random
import numpy as np
import pandas as pd

def Similarity_clustering(similarity_dict, m, n, seed=None):
    """Greedily partition posts into clusters of nearly equal size.

    Repeatedly picks a random unassigned post and groups it with the
    unassigned posts most similar to it, until every post is assigned.

    Parameters
    ----------
    similarity_dict : dict
        Maps post id -> sequence of similarities to every post, where position
        j of each sequence refers to the j-th key of the dict.
    m : int
        Desired number of clusters (must be positive).
    n : int
        Number of posts to distribute; normally ``len(similarity_dict)``.
    seed : int, optional
        Seed for a private random generator, for reproducible clusterings.
        With the default None the global ``random`` module state is used.

    Returns
    -------
    dict
        Maps each randomly chosen seed post id to a list of
        ``(post_id, similarity)`` tuples: the seed post itself first, then the
        other members by decreasing similarity. ``similarity_dict`` is not modified.

    Raises
    ------
    ValueError
        If ``m`` is not positive.
    """
    if m <= 0:
        raise ValueError("m must be a positive integer")

    rng = random if seed is None else random.Random(seed)

    post_keys = list(similarity_dict.keys())
    column_of = {key: idx for idx, key in enumerate(post_keys)}
    remaining = list(post_keys)

    # The first (n % m) clusters get one extra post, so sizes sum to n over
    # exactly m clusters instead of leaving a small leftover cluster.
    base_size, n_larger = divmod(n, m)

    clusters = {}
    round_no = 0
    while remaining:
        target = base_size + (1 if round_no < n_larger else 0)
        # Always take at least one post per round, otherwise the loop never ends
        # when n < m.
        target = max(target, 1)
        round_no += 1

        selected_post = rng.choice(remaining)
        row = similarity_dict[selected_post]

        # Rank only the other unassigned posts. The selected post is always kept
        # in its own cluster so it can never be picked again and overwrite it.
        others = [(key, row[column_of[key]]) for key in remaining if key != selected_post]
        others.sort(key=lambda item: item[1])
        others.reverse()

        members = [(selected_post, row[column_of[selected_post]])] + others[:target - 1]
        clusters[selected_post] = members

        taken = {key for key, _ in members}
        remaining = [key for key in remaining if key not in taken]
    return clusters

In [16]:
# Split all posts into 6 similarity clusters; n is taken from the data
# rather than hard-coded so it always matches the number of posts.
cluster = Similarity_clustering(post_emb, 6, len(post_emb))
print(cluster)

# Save the cluster assignment itself (not the LDA model) under the cluster file name.
# NOTE(review): the file name says "tfidf", but `post_emb` is built from the
# BoW topic vectors loaded earlier in this section -- confirm the intended name.
with open('sample1000_cluster_top_tfidf.pickle', 'wb') as handle:
    pickle.dump(cluster, handle, protocol=pickle.HIGHEST_PROTOCOL)

{'98tvq3': [('98tvq3', 1.0), ('856038', 0.9997761178299003), ('8tamec', 0.9995424143441137), ('db3xfc', 0.9994839314450785), ('der5kb', 0.9994837385648353), ('d0jlhr', 0.9994832266618143), ('9j9f3q', 0.9994736343619143), ('8lj0cg', 0.9994505154817892), ('cw1usc', 0.9994148815045575), ('9hhlg4', 0.9993815607997073), ('dospz8', 0.9993736024194667), ('9fve7a', 0.9993646145258137), ('di7lew', 0.9993636953538194), ('7wrm09', 0.9993378610014053), ('c9r1mg', 0.9993324163061773), ('7t8gqk', 0.9993310902604041), ('9bhwgo', 0.9993225152914309), ('cdawhl', 0.9993062995324047), ('chujg0', 0.9993041561668747), ('8sig1h', 0.9992963841101707), ('7ny1x1', 0.9992883213263696), ('7q7re0', 0.9992848753580981), ('bh5zis', 0.9992812803759523), ('bb47a4', 0.9992786634536147), ('8kn01a', 0.9992765616536782), ('7mgc0p', 0.9992675183705061), ('7k4vi5', 0.9992669694813989), ('ar4ddv', 0.9992639043297264), ('7vd3yj', 0.9992469539068674), ('crd82k', 0.9992388927712731), ('8ovy3r', 0.9992371217693898), ('cjdzem', 