In [7]:
import pickle
import csv
import time
import re
import random
import itertools
import json
from collections import Counter
import numpy as np 
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim import models
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

test_mode = True

In [None]:
# import argparse

# parser = argparse.ArgumentParser()
# parser.add_argument("csv_file_name", required=True, help="Filename of the Reddit-scrape-data CSV we are running ref models on")
# parser.add_argument("setname", required=True, help="Title of this experiment")
# parser.add_argument("num_clusters", type=int, default=6, required=False, help="Number of clusters (paper used 30)")
# args = parser.parse_args()
# print(args.csv_file_name)
# print(args.setname)

# setname = args.setname
# src_file = args.csv_file_name
# num_clusters = num_clusters

## Parsing raw reddit posts

In [None]:
import csv
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# print(lemmatizer.lemmatize("cats"))

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

def parse_reddit_csv(filename, setname, stop_words=None, lemmatizer=None, tokenizer=None):
    if lemmatizer == None:
        nltk.download('wordnet')
        lemmatizer = WordNetLemmatizer()
    if stop_words == None or tokenizer == None:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r'\w+')
    
    print(f'parse_reddit_csv({filename}, {setname})')
    print('START:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    t0 = time.process_time()
    print("Reading from", filename)
    csv_cols = []
    authors = {}
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Remove numbers, punctuation
            row['selftext'] = re.sub(r'\d+', '', row['selftext'])
            row['title'] = re.sub(r'\d+', '', row['title'])
            # Tokenize the post text (selftext) and post title
            post_tokens = tokenizer.tokenize(row['selftext'])
            title_tokens = tokenizer.tokenize(row['title'])
            # Filter out stopwords
            post_tokens = [w for w in post_tokens if not w in stop_words]
            title_tokens = [w for w in title_tokens if not w in stop_words]
            # Lemmatize the post text (reduce words to word stems i.e. cats->cat, liked->like)
            post_tokens = [lemmatizer.lemmatize(w, 'n') for w in post_tokens]
            post_tokens = [lemmatizer.lemmatize(w, 'v') for w in post_tokens]
            title_tokens = [lemmatizer.lemmatize(w, 'n') for w in title_tokens]
            title_tokens = [lemmatizer.lemmatize(w, 'v') for w in title_tokens]
            csv_cols.append({'author': row['author'],
                             'selftext': post_tokens,
                             'title': title_tokens,
                             'post_id': row['id']})
            # Add author mapping
            if row['author'] not in authors:
                authors[row['author']] = []
            authors[row['author']].append(row['id'])
    print('PROCESS TIME ELAPSED (s)', time.process_time() - t0)
    with open(f'partials/{setname}_parse.pickle', 'wb') as handle:
        pickle.dump(csv_cols, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f'partials/{setname}_parse_authors.pickle', 'wb') as handle:
        pickle.dump(authors, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('parse_reddit_csv ENDED:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    return

In [None]:
if test_mode:
    parse_reddit_csv('data/final_proj_data_preprocessed_1000sample.csv', 'sample1000')

## Calculate post embeddings (Word2Vec)
(using selftext only)

The first (thereafter called W2VWeighted) is calculated by weighing the contribution of each word embedding by the inverse of its relative frequency to the final sentence embedding.

In doing so, the contributions of the most common words are minimized.

The second (thereafter called W2V-SIF) is calculated by first taking the weighed sentence embedding before removing the first principal component from it.

Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017.\
A simple but tough-to-beat baseline for sentence embeddings. In ICLR.

In [None]:
from collections import Counter
import pickle
import numpy as np
from sklearn.decomposition import PCA

def embed_w2v(setname, model=None):
    # Load the parse
    with open(f'partials/{setname}_parse.pickle', 'rb') as handle:
        parsed = pickle.load(handle)
    
    # Load Google's pre-trained Word2Vec model.
    if model == None:
        model = gensim.models.KeyedVectors.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)
    
    print(f'embed_w2v({setname})')
    print('START:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    t0 = time.process_time()
    # Build weighted embeddings
    weighted_emb = {}
    for i in range(len(parsed)):
        counts = Counter(parsed[i]['selftext'])
        freq = pd.DataFrame.from_dict(counts, orient='index').reset_index()
        freq = freq.rename(columns={'index': 'word', 0: 'freq'})
        # Weight by inverse relative frequency
        freq['inv_rfreq'] = freq['freq'].sum()/freq['freq']
        unknowns = []
        emb_dict = {}
        for w in freq['word']:
            try:
                emb = model[w]
                emb_dict.update({w:emb})
            except:
                unknowns.append(w)
        emb_value = pd.DataFrame(emb_dict).transpose().reset_index()
        emb_value = emb_value.rename(columns={'index': 'word'})
        emb_value_list = list(emb_value.iloc[:, 1:301].mul(freq['inv_rfreq'], axis = 0).sum())
        weighted_emb.update({parsed[i]['post_id']:emb_value_list})
    # Build SIF (remove first principal component)
    pca = PCA()
    ids = [key for (key, val) in list(weighted_emb.items())]
    weighted_matrix = np.array([val for (key, val) in list(weighted_emb.items())])
    # calculate PCA projections
    pca_matrix = pca.fit_transform(weighted_matrix)
    # calculate p-component that we need to subtract
    pca_adjust = [[emb[0] * c for c in pca.components_[0]] for emb in pca_matrix.tolist()]
    # drop p-component
    sif_matrix = [[i - j for i, j in zip(emb, pc)] for emb, pc in zip(weighted_matrix.tolist(), pca_adjust)]
    # convert back to dict format
    sif_emb = dict(zip(ids, sif_matrix))
    print('PROCESS TIME ELAPSED (s)', time.process_time() - t0)
    with open(f'partials/{setname}_embed_w2v_weighted.pickle', 'wb') as handle:
        pickle.dump(weighted_emb, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f'partials/{setname}_embed_w2v_sif.pickle', 'wb') as handle:
        pickle.dump(sif_emb, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('embed_w2v ENDED:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    return

In [None]:
if test_mode:
    model = gensim.models.KeyedVectors.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)
    embed_w2v('sample1000', model=model)

## Calculate post topics (LDA)
(using both selftext and title)

A Bag of Words (BoW) corpus was obtained before a term frequency-inverse document frequency (TF-IDF) corpus was derived from it.

Topic modeling was then performed on both the BoW corpus (thereafter LDA-BoW) and
TF-IDF corpus (thereafter LDA-TFIDF) with the number of topics set to 30, in line with the number of clusters used.

The document-topic mapping of each post is then used for computing cosine similarities with all other posts

In [None]:
from gensim.models import LdaModel

def get_topics(dictionary, corpus, parsed):
    # Train LDA model, get model & topic vectors
    # Set training parameters.
    num_topics = 30
    chunksize = 100
    passes = 20
    iterations = 400
    eval_every = 100  # None = Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    
    # Get basic evaluation
    top_topics = model.top_topics(corpus) #, num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    
    # Get topic vectors
    all_topics = model.get_document_topics(corpus, per_word_topics=True)
    all_topics = [(doc_topics, word_topics, word_phis) for doc_topics, word_topics, word_phis in all_topics]
    sen_top = {}
    for i in range(len(parsed)):
        # These are in the same order as the documents themselves.
        doc_topics, word_topics, phi_values = all_topics[i]
        # Generate the topic VECTOR not just list of topics
        doc_topic_vector = [0] * num_topics
        for topic in doc_topics:
            doc_topic_vector[topic[0]] = topic[1]
        sen_top.update({parsed[i]['post_id']:doc_topic_vector})
    
    return model, sen_top

In [None]:
from gensim.corpora import Dictionary
from gensim import models
import pickle

def embed_lda(setname):
    # Load the parse
    with open(f'partials/{setname}_parse.pickle', 'rb') as handle:
        parsed = pickle.load(handle)

    print(f'embed_lda({setname})')
    print('START:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    t0 = time.process_time()
    # Create a dictionary representation of the documents.
    dictionary = Dictionary([parsed[i]['selftext'] for i in range(len(parsed))])
    # print(dictionary)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(parsed[i]['selftext']) for i in range(len(parsed))]
    # for doc in corpus:
    #     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

    # TF-IDF (term freq, inverse document freq) representation
    tfidf = models.TfidfModel(corpus)
    # for doc in tfidf[corpus]:
    #     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

    # Get bow data
    print("Generating topics for BOW...")
    model_bow, sen_top_bow = get_topics(dictionary, corpus, parsed)

    # Get tfidf data
    print("Generating topics for TFIDF...")
    model_tfidf, sen_top_tfidf = get_topics(dictionary, tfidf[corpus], parsed)

    print('PROCESS TIME ELAPSED (s)', time.process_time() - t0)
    
    # Save bow data
    with open(f'partials/{setname}_model_top_bow.pickle', 'wb') as handle:
        pickle.dump(model_bow, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f'partials/{setname}_embed_top_bow.pickle', 'wb') as handle:
        pickle.dump(sen_top_bow, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Save tfidf data
    with open(f'partials/{setname}_model_top_tfidf.pickle', 'wb') as handle:
        pickle.dump(model_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f'partials/{setname}_embed_top_tfidf.pickle', 'wb') as handle:
        pickle.dump(sen_top_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('embed_lda ENDED:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    return

In [None]:
if test_mode:
    embed_lda('sample1000')

## Calculate Pairwise Cosine Similarity & Cluster

In [None]:
import random
import numpy as np
import pandas as pd

def similarity_clustering(similarity_dict, m, n):
    clusters = {};
    unselected_posts = similarity_dict.copy()
    post_keys = list(unselected_posts.keys())
    unselected_keys = list(unselected_posts.keys())
    cluster_size = int(np.ceil(n / m))
    # print(cluster_size)
    while len(unselected_posts) != 0:
        selected_post = random.choice(unselected_keys)
        # labeling the selected row
        emb_dict = dict(zip(post_keys, unselected_posts[selected_post]))
        # only sort the unselected columns
        sim = {k: emb_dict[k] for k in unselected_keys}
        sim_sort = [k for k in sorted(sim.items(), key=lambda item: item[1])][::-1]
        cluster_size = int(np.ceil(n / m))
        try:
            sim_most = sim_sort[0:cluster_size]
        except:
            sim_most = sim_sort[0:end]
        clusters[selected_post] = sim_most
        # deleted the selected rows from the unselected
        for p in sim_most:
            del unselected_posts[p[0]]
        unselected_keys = list(unselected_posts.keys())
        # print(cluster_size)
    return clusters

In [None]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity

def clust_any_ref(setname, embedname, numClusters):
    # Load the parse
    with open(f'partials/{setname}_parse.pickle', 'rb') as handle:
        parsed = pickle.load(handle)
    # Load the embed / topic matrix
    with open(f'partials/{setname}_embed_{embedname}.pickle', 'rb') as handle:
        sen_emb = pickle.load(handle)

    print(f'clust_any_ref({setname}, {embedname}, {numClusters})')
    print('START:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    print('clustering dataset:', setname, '; embeds:', embedname)
    t0 = time.process_time()
    numTotalPosts = len(parsed)
    d = pd.DataFrame(sen_emb).transpose()
    sim_mat = cosine_similarity(d)

    post = list(d.index)
    post_emb = dict(zip(post, sim_mat))

    cluster = similarity_clustering(post_emb, numClusters, numTotalPosts)
    # print(cluster)

    with open(f'partials/{setname}_clust_{embedname}.pickle', 'wb') as handle:
        pickle.dump(cluster, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Transform clusters into a post_id:cluster_id dict
    transformed_cluster = {}
    clust_num = 0
    for key in cluster.keys():
        for post in cluster[key]:
            transformed_cluster[post[0]] = clust_num
        clust_num += 1
    # print(transformed_cluster)

    print('PROCESS TIME ELAPSED (s)', time.process_time() - t0)
    
    with open(f'partials/{setname}_clustdict_{embedname}.pickle', 'wb') as handle:
        pickle.dump(transformed_cluster, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('clust_any_ref ENDED:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    return

In [None]:
if test_mode:
    clust_any_ref('sample1000', 'top_tfidf', 6)

## Score Clusters: Calculate Same-Author-Score

In [39]:
import pickle
import itertools

def score_sas(setname, embedname):
    # Read clusters
    with open(f'partials/{setname}_clust_{embedname}.pickle', 'rb') as handle:
        clusters = pickle.load(handle)
    with open(f'partials/{setname}_clustdict_{embedname}.pickle', 'rb') as handle:
        clustdict = pickle.load(handle)
    # Read author list
    with open(f'partials/{setname}_parse_authors.pickle', 'rb') as handle:
        authors = pickle.load(handle)

    print(f'score_sas({setname}, {embedname})')
    print('START:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    print('scoring dataset:', setname, '; embeds:', embedname)
    t0 = time.process_time()
    
    num_clust_pair = 0
    num_total_pair = 0

    for auth in authors:
        # authors[auth] is a list of post IDs made by author 'auth'
        if len(authors[auth]) < 2:
            continue
        for pair in itertools.product(authors[auth],authors[auth]):
            if pair[0] == pair[1]:
                continue
            num_total_pair += 1
            if clustdict[pair[0]] == clustdict[pair[1]]:
                num_clust_pair += 1

    score_sas = (num_clust_pair / num_total_pair) - (1/len(clusters))
    print('PROCESS TIME ELAPSED (s)', time.process_time() - t0)
    print('score_sas ENDED:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    return score_sas

In [None]:
if test_mode:
    a = score_sas('sample1000', 'top_tfidf')
    # TODO: this seems to be 10x larger than what the paper reports for SAS score ...
    # Is this because of the dataset size?
    print(a)

## Score Clusters: Calculate Jaccard Score

In [40]:
import pickle
import json
import itertools

def score_jaccard(setname, embedname):
    # Read post data
    with open(f'partials/{setname}_parse.pickle', 'rb') as handle:
        parsed = pickle.load(handle)
    # Read clusters
    with open(f'partials/{setname}_clust_{embedname}.pickle', 'rb') as handle:
        clusters = pickle.load(handle)
    with open(f'partials/{setname}_clustdict_{embedname}.pickle', 'rb') as handle:
        clustdict = pickle.load(handle)
    # Read author list
    with open(f'partials/{setname}_parse_authors.pickle', 'rb') as handle:
        authors = pickle.load(handle)
    # Read author subreddits
    with open(f'data/authorsubs.json', 'r') as fp:
        sub_mappings = json.load(fp)

    print(f'score_jaccard({setname}, {embedname})')
    print('START:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    print('scoring dataset:', setname, '; embeds:', embedname)
    t0 = time.process_time()
    
    # Transform parsed into something more usable for Jaccard
    metadata = {}
    for p in parsed:
        metadata[p['post_id']] = p

    # Set up constants
    target_sub = 'Advice'
    default_subs = {
        'comment': [target_sub],
        'submission': [target_sub]
    }

    intersect_sum = 0
    for clustkey in clusters.keys():
        ids_in_clust = [i[0] for i in clusters[clustkey]]
        for pair in itertools.product(ids_in_clust,ids_in_clust):
            a0 = metadata[pair[0]]['author']
            a1 = metadata[pair[1]]['author']
            a0_subs = sub_mappings[a0] if (a0 in sub_mappings) else default_subs
            a1_subs = sub_mappings[a1] if (a1 in sub_mappings) else default_subs
            # Check for "throwaways"
            a0_subs_total = set(a0_subs['comment'] + a0_subs['submission'])
            a1_subs_total = set(a1_subs['comment'] + a1_subs['submission'])
            if len(a0_subs_total) == 1:
                continue
            if len(a1_subs_total) == 1:
                continue
            # New formulation: use set of subreddits an author has ever interacted with
            intersect_sum += (
                len(a0_subs_total.intersection(a1_subs_total))
                /
                len(a0_subs_total.union(a1_subs_total))
            )
            # Original paper formulation: this fails if neither author has ever commented on a sub
#             comment_subscore = (
#                 len(set(a0_subs['comment']).intersection(set(a1_subs['comment'])))
#                 /
#                 len(set(a0_subs['comment']).union(set(a1_subs['comment'])))
#             )
#             submits_subscore = (
#                 len(set(a0_subs['submission']).intersection(set(a1_subs['submission'])))
#                 /
#                 len(set(a0_subs['submission']).union(set(a1_subs['submission'])))
#             )
#             intersect_sum += 0.5 * (comment_subscore + submits_subscore)
    score_jaccard = intersect_sum * len(clusters) / (len(clustdict) ** 2)
    print('PROCESS TIME ELAPSED (s)', time.process_time() - t0)
    print('score_jaccard ENDED:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    return score_jaccard

In [None]:
if test_mode:
    a = score_jaccard('sample1000', 'top_bow')
    print(a)

## Now run the entire reference pipeline!

In [None]:
import time

# Load constants / large loading items
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

model = gensim.models.KeyedVectors.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)

# Parameters
setname = 'sample1000'
src_file = 'data/final_proj_data_preprocessed_1000sample.csv'
num_clusters = 6

# Load basic data
parse_reddit_csv(src_file, setname, 
                stop_words=stop_words, lemmatizer=lemmatizer, tokenizer=tokenizer)

# Generate embeddings for reference models
embed_w2v(setname, model=model)
embed_lda(setname)

# Cluster and score in loops
embed_types = ['top_tfidf', 'top_bow', 'w2v_weighted', 'w2v_sif']
scores = {
    'sas': {
        'top_tfidf': [],
        'top_bow': [],
        'w2v_weighted': [],
        'w2v_sif': [],
    },
    'jaccard': {
        'top_tfidf': [],
        'top_bow': [],
        'w2v_weighted': [],
        'w2v_sif': [],
    }
}
for i in range(100):
    for embed_name in embed_types:
        clust_any_ref(setname, embed_name, num_clusters)
    for embed_name in embed_types:
        scores['sas'][embed_name].append(score_sas(setname, embed_name))
        scores['jaccard'][embed_name].append(score_jaccard(setname, embed_name))

# Save scores
time_string = time.strftime("%Y%m%d-%H%M%S", time.localtime())
with open(f'outputs/{setname}_scores_{time_string}.pickle', 'wb') as handle:
    pickle.dump(scores, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Read in BERT similarity table, cluster and generate scores

In [49]:
import pickle

with open(f'bert_similarity_table_1000.pickle', 'rb') as handle:
    bert_similarity = pickle.load(handle)

print(bert_similarity.keys())
# print(bert_similarity['7k3dho'])

dict_keys(['7k3dho', '8c5fwa', 'd0s4yn', 'cbqy09', 'bdsb98', 'c28qrb', 'del70k', 'cmnu2b', '9b6hss', '7v8n4f', 'ck84w1', 'c0uumi', 'cjysbh', '8s6lqa', 'bb7onp', 'c9awxn', '8bcalz', '90pp5v', 'c4sf1y', '7lq9wv', 'ddm7jp', '9hcflb', '8np7kq', 'ds8vvp', 'cox1o0', 'ckan5e', 'ek4ttg', 'bb47a4', 'buevzw', '8awc7v', 'bka12a', 'cg9umf', 'd28zc8', '835onz', 'c8j77n', 'd1ns76', 'chooph', '94dsy6', '7lx887', 'c0zg62', '9e0t8x', 'ejm2ih', 'bk942e', '7q484y', 'd90m5n', 'c7katx', 'btc8oa', 'bv0769', 'dospz8', '7tic5b', '7o040y', 'c3ur6a', '9cs6bu', 'b4f32u', 'd6e90g', '7jplwv', 'ar4h16', '8bwor7', 'bdzdwj', 'bqwccn', '9jvywb', 'dinmi1', '8z49cd', 'cdawhl', 'dplxqx', '9atsa6', 'cszmur', '7rmlxn', '7war0f', '8tfg1k', 'dkr5zg', '9h4dwk', '7mtwlo', 'cyqovn', 'bufz6c', '87uvst', '9et5bi', '8ae4ec', '9coy3r', 'c5xhry', 'bapp1e', 'djg8r0', '811d4f', 'clio2p', 'bqysob', 'cjtrot', '81049i', 'bts9fb', 'az6scz', '7xd0vf', 'bkq2j7', '85u56v', 'c14oug', '8gf8br', '7ujvr2', 'at7vl6', 'clsaq5', '9dzn72', 'aurdq3',

In [34]:
def similarity_clustering(similarity_dict, m, n):
    clusters = {};
    unselected_posts = similarity_dict.copy()
    post_keys = list(unselected_posts.keys())
    unselected_keys = list(unselected_posts.keys())
    cluster_size = int(np.ceil(n / m))
    # print(cluster_size)
    while len(unselected_posts) != 0:
        selected_post = random.choice(unselected_keys)
        # labeling the selected row
        emb_dict = dict(zip(post_keys, unselected_posts[selected_post]))
        # only sort the unselected columns
        sim = {k: emb_dict[k] for k in unselected_keys}
        sim_sort = [k for k in sorted(sim.items(), key=lambda item: item[1])][::-1]
        cluster_size = int(np.ceil(n / m))
        try:
            sim_most = sim_sort[0:cluster_size]
        except:
            sim_most = sim_sort[0:end]
        clusters[selected_post] = sim_most
        # deleted the selected rows from the unselected
        for p in sim_most:
            del unselected_posts[p[0]]
        unselected_keys = list(unselected_posts.keys())
        # print(cluster_size)
    return clusters

def clust_any_bert(setname, embedname, numClusters):
    # Load the parse
    with open(f'partials/{setname}_parse.pickle', 'rb') as handle:
        parsed = pickle.load(handle)
    # Load the similarity matrix
    with open(f'bert_similarity_table_1000.pickle', 'rb') as handle:
        post_emb = pickle.load(handle)

    print(f'clust_any_bert({setname}, {embedname}, {numClusters})')
    print('START:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    print('clustering dataset:', setname, '; embeds:', embedname)
    t0 = time.process_time()
    numTotalPosts = len(parsed)

    cluster = similarity_clustering(post_emb, numClusters, numTotalPosts)
#     print(cluster)

    with open(f'partials/{setname}_clust_{embedname}.pickle', 'wb') as handle:
        pickle.dump(cluster, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Transform clusters into a post_id:cluster_id dict
    transformed_cluster = {}
    clust_num = 0
    for key in cluster.keys():
        for post in cluster[key]:
            transformed_cluster[post[0]] = clust_num
        clust_num += 1
    # print(transformed_cluster)

    print('PROCESS TIME ELAPSED (s)', time.process_time() - t0)
    
    with open(f'partials/{setname}_clustdict_{embedname}.pickle', 'wb') as handle:
        pickle.dump(transformed_cluster, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('clust_any_bert ENDED:', time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    return

In [32]:
clust_any_bert('sample1000', 'bert', 6)

clust_any_bert(sample1000, bert, 6)
START: 20200312-223928
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.016010248999997145
clust_any_bert ENDED: 20200312-223928


In [43]:
setname = 'sample1000'
num_clusters = 6

# Load in scores?
with open(f'outputs/sample1000_scores_20200311-215846.pickle', 'rb') as handle:
    scores = pickle.load(handle)

scores['sas']['bert'] = []
scores['jaccard']['bert'] = []
    
for i in range(100):
    clust_any_bert(setname, 'bert', num_clusters)
    scores['sas']['bert'].append(score_sas(setname, 'bert'))
    scores['jaccard']['bert'].append(score_jaccard(setname, 'bert'))

# Save scores
time_string = time.strftime("%Y%m%d-%H%M%S", time.localtime())
with open(f'outputs/sample1000_scores_20200311-215846_addbert.pickle', 'wb') as handle:
    pickle.dump(scores, handle, protocol=pickle.HIGHEST_PROTOCOL)

clust_any_bert(sample1000, bert, 6)
START: 20200312-224625
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.015298389000001578
clust_any_bert ENDED: 20200312-224625
score_sas(sample1000, bert)
START: 20200312-224625
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00011887000000143644
score_sas ENDED: 20200312-224625
score_jaccard(sample1000, bert)
START: 20200312-224625
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 2.241962391999998
score_jaccard ENDED: 20200312-224627
clust_any_bert(sample1000, bert, 6)
START: 20200312-224627
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.006760911999997177
clust_any_bert ENDED: 20200312-224627
score_sas(sample1000, bert)
START: 20200312-224627
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00024356899999844472
score_sas ENDED: 20200312-224627
score_jaccard(sample1000, bert)
START: 20200312-224627
scoring dataset: sample1000 ; embeds: b

PROCESS TIME ELAPSED (s) 3.868448737999998
score_jaccard ENDED: 20200312-224707
clust_any_bert(sample1000, bert, 6)
START: 20200312-224707
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00664845599999353
clust_any_bert ENDED: 20200312-224707
score_sas(sample1000, bert)
START: 20200312-224707
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00011355499999865515
score_sas ENDED: 20200312-224707
score_jaccard(sample1000, bert)
START: 20200312-224707
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 2.9434420329999966
score_jaccard ENDED: 20200312-224710
clust_any_bert(sample1000, bert, 6)
START: 20200312-224710
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.007240241999994623
clust_any_bert ENDED: 20200312-224710
score_sas(sample1000, bert)
START: 20200312-224710
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00012102300000549349
score_sas ENDED: 20200312-224710
score_jaccard(

PROCESS TIME ELAPSED (s) 2.5982222780000086
score_jaccard ENDED: 20200312-224746
clust_any_bert(sample1000, bert, 6)
START: 20200312-224746
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.008972561000007317
clust_any_bert ENDED: 20200312-224746
score_sas(sample1000, bert)
START: 20200312-224746
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00012942499999724077
score_sas ENDED: 20200312-224746
score_jaccard(sample1000, bert)
START: 20200312-224746
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 2.5485457799999978
score_jaccard ENDED: 20200312-224748
clust_any_bert(sample1000, bert, 6)
START: 20200312-224748
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.007117374000003451
clust_any_bert ENDED: 20200312-224748
score_sas(sample1000, bert)
START: 20200312-224748
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00013332600001092487
score_sas ENDED: 20200312-224748
score_jaccar

PROCESS TIME ELAPSED (s) 1.9158330889999888
score_jaccard ENDED: 20200312-224828
clust_any_bert(sample1000, bert, 6)
START: 20200312-224828
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.006494597999989082
clust_any_bert ENDED: 20200312-224828
score_sas(sample1000, bert)
START: 20200312-224828
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00011789399999884154
score_sas ENDED: 20200312-224828
score_jaccard(sample1000, bert)
START: 20200312-224828
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 1.9292937389999736
score_jaccard ENDED: 20200312-224830
clust_any_bert(sample1000, bert, 6)
START: 20200312-224830
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.0069173669999997855
clust_any_bert ENDED: 20200312-224830
score_sas(sample1000, bert)
START: 20200312-224830
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00012209100000859507
score_sas ENDED: 20200312-224830
score_jacca

PROCESS TIME ELAPSED (s) 3.2381501119999996
score_jaccard ENDED: 20200312-224913
clust_any_bert(sample1000, bert, 6)
START: 20200312-224913
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.0069117930000004435
clust_any_bert ENDED: 20200312-224913
score_sas(sample1000, bert)
START: 20200312-224913
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00011504400001172144
score_sas ENDED: 20200312-224913
score_jaccard(sample1000, bert)
START: 20200312-224913
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 2.3890322379999986
score_jaccard ENDED: 20200312-224915
clust_any_bert(sample1000, bert, 6)
START: 20200312-224915
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.023051086999998915
clust_any_bert ENDED: 20200312-224915
score_sas(sample1000, bert)
START: 20200312-224915
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00042131699999004013
score_sas ENDED: 20200312-224915
score_jacca

PROCESS TIME ELAPSED (s) 2.4955451380000113
score_jaccard ENDED: 20200312-224948
clust_any_bert(sample1000, bert, 6)
START: 20200312-224948
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.006095968000010998
clust_any_bert ENDED: 20200312-224948
score_sas(sample1000, bert)
START: 20200312-224948
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.0001306110000030003
score_sas ENDED: 20200312-224948
score_jaccard(sample1000, bert)
START: 20200312-224948
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 1.9425002579999955
score_jaccard ENDED: 20200312-224950
clust_any_bert(sample1000, bert, 6)
START: 20200312-224950
clustering dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.006480756999991399
clust_any_bert ENDED: 20200312-224950
score_sas(sample1000, bert)
START: 20200312-224950
scoring dataset: sample1000 ; embeds: bert
PROCESS TIME ELAPSED (s) 0.00011632700000063778
score_sas ENDED: 20200312-224950
score_jaccard

KeyError: '7x5q1a'

In [48]:
setname='sample1000'
embedname='bert'

# Read clusters
with open(f'partials/{setname}_clust_{embedname}.pickle', 'rb') as handle:
    clusters = pickle.load(handle)
with open(f'partials/{setname}_clustdict_{embedname}.pickle', 'rb') as handle:
    clustdict = pickle.load(handle)
# Read author list
with open(f'partials/{setname}_parse_authors.pickle', 'rb') as handle:
    authors = pickle.load(handle)

print(clusters)
print(len(clustdict))
clustdict['7x5q1a']

num_clust_pair = 0
num_total_pair = 0

# for auth in authors:
#     # authors[auth] is a list of post IDs made by author 'auth'
#     if len(authors[auth]) < 2:
#         continue
#     for pair in itertools.product(authors[auth],authors[auth]):
#         if pair[0] == pair[1]:
#             continue
#         num_total_pair += 1
#         if clustdict[pair[0]] == clustdict[pair[1]]:
#             num_clust_pair += 1

# score_sas = (num_clust_pair / num_total_pair) - (1/len(clusters))

{'99inoa': [('8bt5fq', 2.0), ('7tp9y7', 2.0), ('b7ie4l', 2.0), ('85w9re', 2.0), ('8qd163', 2.0), ('91fv1u', 2.0), ('90zsvk', 2.0), ('84t5qb', 2.0), ('8ae4ec', 2.0), ('7zb4yw', 1.0), ('d2grma', 1.0), ('bf9v85', 1.0), ('80rx1p', 1.0), ('b7w4ug', 1.0), ('d50wn0', 1.0), ('8artjk', 1.0), ('dmje6n', 1.0), ('c80mcc', 1.0), ('bn7wn9', 1.0), ('dr8vwx', 1.0), ('7l8e5l', 1.0), ('au99kg', 1.0), ('bb8dju', 1.0), ('88jk4u', 1.0), ('curv3t', 1.0), ('8f09nq', 1.0), ('dolrre', 1.0), ('7mz9e8', 1.0), ('b508ab', 1.0), ('90fu8r', 1.0), ('8b18h9', 1.0), ('d36q3f', 1.0), ('8pre3v', 1.0), ('913dao', 1.0), ('czesor', 1.0), ('81c1ve', 1.0), ('8cix95', 1.0), ('92ww2c', 1.0), ('8lghij', 1.0), ('asmqr2', 1.0), ('9id12l', 1.0), ('7lll4f', 1.0), ('aupgs0', 1.0), ('7upzvn', 1.0), ('7q7re0', 1.0), ('8lteey', 1.0), ('cqy487', 1.0), ('blbftj', 1.0), ('bzvd44', 1.0), ('cu301w', 1.0), ('7zxrrv', 1.0), ('clvc3s', 1.0), ('bp5f51', 1.0), ('9hvn0f', 1.0), ('da7u7l', 1.0), ('7uellj', 1.0), ('dk4q1x', 1.0), ('7odmyk', 1.0), ('

KeyError: '7x5q1a'