# setup

In [1]:
# Import packages
import re
import numpy as np
import pandas as pd
import csv
import json
import os
import random
import scipy
import time
import tqdm
from collections import defaultdict

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

# spacy for lemmatization
import spacy

# Plotting tools
# !pip install pyLDAvis
# import pyLDAvis
# import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# NLTK English stop words plus a few extra tokens that should also be dropped
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [3]:
# Root directory for this notebook's inputs/outputs. Later cells append
# 'documents/' and 'tuning_results/...' directly, so the trailing slash matters.
# NOTE(review): hard-coded absolute local path — adjust before running elsewhere.
path = '/Users/alexwey/Desktop/ipynb/'

In [45]:
# Import all raw annotations files
# Directory layout walked below: <annotations_path>/<annotator>/<date>/<filename>
annotations_path = '/Users/alexwey/Desktop/CS/GPT-2/Grover/gpt-2/annotations/'
annotations_raw = defaultdict()  # no default factory, so it acts like a plain dict: file text -> filename
c = 0                            # files skipped because their filename was already loaded
duplicates = set()               # filenames seen so far

for annotator in os.listdir(annotations_path):
  annotator_dir = os.path.join(annotations_path, annotator)
  if not os.path.isdir(annotator_dir):
    continue  # a stray file at this level would make os.listdir() raise
  for date in os.listdir(annotator_dir):
    date_dir = os.path.join(annotator_dir, date)
    if not os.path.isdir(date_dir):
      continue
    for filename in os.listdir(date_dir):
      # Decide before opening, so duplicate files are never opened at all.
      if filename in duplicates:
        c += 1
        continue
      duplicates.add(filename)
      with open(os.path.join(date_dir, filename), 'r') as f:
        # Keyed by the full file text so documents can be looked up by content.
        annotations_raw[f.read()] = filename

In [47]:
# Inspect the loaded {file text: filename} mapping.
# NOTE(review): this renders every annotated text in full; len(annotations_raw) may be enough.
annotations_raw

defaultdict(None,
            {'Of course not, scrub. The Cartels would do as the Mafia did: use their money, power and influence to reestablish themselves as legitimate business operations. And no, scrub. No. They can not "compete" with legal drugs by simply introducing new legal drugs. That is not how chemistry works. There is no "new and improved" version of heroin, for example. There is only more milligrams of the molecule or less milligrams of the heroin molecule... Or krokodil, which provides a similar high at the cost of melting your skin off... but people will make krokodil and will buy krokodil *because it\'s legal* and they can\'t get heroin. So. ironically, prohibition is what gives rise to your so-called "new drugs", because "new drugs" are means to circumvent detection through cheap, dangerous, experimental knock-offs. "Spice", for example, is billed as a *legal alternative* to marijuana. And bath salts, for example, are billed as a *legal alternative* to cocaine and amphe

In [None]:
# Import all json files: one JSON record per line, each holding a human text
# and the Grover generations produced for it
document_path = f'{path}documents/'
data = defaultdict(list)  # source -> [(human_text, grover_text, human_annotation, grover_annotation)]
c = 0
h_c = g_c = 0             # human / Grover texts that matched an annotation file
for filename in os.listdir(document_path):
  d = json.JSONDecoder()
  with open(document_path + filename, 'r') as f:
    for record in f:
      record_data = d.decode(record)

      # The source is the first label of the domain; both HuffPost spellings are merged.
      source = record_data['domain'].split('.')[0]
      source = 'huffpost' if source == 'huffingtonpost' else source

      human_text = record_data['text']
      grover_text = record_data['gens_article'][0]

      # '' means "no annotation file has exactly this text".
      human_annotation = grover_annotation = ''
      if human_text in annotations_raw:
        human_annotation = annotations_raw[human_text]
        h_c += 1
      if grover_text in annotations_raw:
        grover_annotation = annotations_raw[grover_text]
        g_c += 1

      data[source].append((human_text, grover_text, human_annotation, grover_annotation))

# Report how many human/Grover pairs each source contributed
total = 0
for source, texts in data.items():
  n_pairs = len(texts)
  print(source, n_pairs)
  total += n_pairs

print('total', total)

In [None]:
# Number of human texts, then Grover texts, that matched an annotation file
print(h_c, g_c, sep='\n')

# preprocessing documents

In [None]:
# Remove emails, new lines, and single quotes
def preremoval(text):
  """Strip e-mail-like tokens, collapse whitespace runs and drop apostrophes.

  Args:
    text: raw document string.

  Returns:
    The cleaned string.
  """
  # Raw strings: '\S' and '\s' inside a normal literal are invalid escape
  # sequences, which Python warns about and plans to reject.
  text = re.sub(r'\S*@\S*\s?', '', text)  # any token containing '@', plus one trailing whitespace char
  text = re.sub(r'\s+', ' ', text)        # newlines, tabs and repeated spaces -> one space
  text = text.replace("'", "")
  return text

# Tokenize
def tokenize(text):
  """Turn ``text`` into a list of tokens with gensim's ``simple_preprocess``.

  ``text`` is passed through ``str()`` first, so non-string input is accepted.
  """
  # deacc=True asks gensim to strip accent marks from the tokens
  words = gensim.utils.simple_preprocess(str(text), deacc=True)
  return words

# Remove stopwords
def remove_stopwords(texts):
  """Drop stop words from the token field of every document tuple.

  Args:
    texts: iterable of 4-tuples whose third item holds the document's tokens;
      the other three items are passed through untouched.

  Returns:
    A new list of 4-tuples with the filtered token lists.
  """
  # Build the set once: membership on the stop_words list is a linear scan per token.
  stop_set = set(stop_words)
  # doc[2] is stringified and re-tokenised by simple_preprocess, as before.
  return [
    (doc[0], doc[1], [word for word in simple_preprocess(str(doc[2])) if word not in stop_set], doc[3])
    for doc in texts
  ]

# Make bigrams
def make_bigrams(nostops, tups):
  """Fit a bigram phrase model on ``tups`` and apply it to ``nostops``.

  Args:
    nostops: 4-tuples whose third item is the token list to transform.
    tups: 4-tuples whose third item is a token list used to fit the phrase model.

  Returns:
    A list of 4-tuples like ``nostops`` with the token field run through the phraser.
  """
  training_sentences = [entry[2] for entry in tups]
  # higher threshold, fewer phrases
  phrase_model = gensim.models.Phrases(training_sentences, min_count=5, threshold=100)
  phraser = gensim.models.phrases.Phraser(phrase_model)

  merged = []
  for doc in nostops:
    merged.append((doc[0], doc[1], phraser[doc[2]], doc[3]))
  return merged

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# NOTE(review): the 'en' shortcut name is not available in spaCy v3+; newer installs
# likely need a full package name such as 'en_core_web_sm' — confirm against the environment.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Lemmatize
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
  """Replace each document's tokens with the lemmas of the tokens whose POS tag is allowed.

  Args:
    texts: 4-tuples whose third item is a token list; the tokens are joined with
      spaces and run through the global ``nlp`` pipeline.
    allowed_postags: POS tags (``token.pos_`` values) to keep.

  Returns:
    A list of 4-tuples with the token field replaced by the kept lemmas.
  """
  def lemmas_for(tokens):
    parsed = nlp(" ".join(tokens))
    return [token.lemma_ for token in parsed if token.pos_ in allowed_postags]

  return [(sent[0], sent[1], lemmas_for(sent[2]), sent[3]) for sent in texts]

# Preprocess set of texts
def preprocess(texts):
  """Run the pipeline: stop-word removal, then bigram merging, then lemmatization.

  The bigram model is fitted on the unfiltered ``texts`` and applied to the
  stop-word-filtered documents.
  """
  without_stopwords = remove_stopwords(texts)
  with_bigrams = make_bigrams(without_stopwords, texts)
  return lemmatization(with_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Collect dataset
all_texts = []  # (source, human_text, grover_text), one entry per pair
tokenized = []  # (source, 'h' or 'g', tokens, annotation); two entries per pair, human first
for source, docs in data.items():
  for human_text, grover_text, human_annotation, grover_annotation in docs:
    tokenized.extend([
      (source, 'h', tokenize(preremoval(human_text)), human_annotation),
      (source, 'g', tokenize(preremoval(grover_text)), grover_annotation),
    ])
    all_texts.append((source, human_text, grover_text))

# Preprocess data (stop-word removal, bigrams, lemmatization)
preprocessed_data = preprocess(tokenized)

# Aggregate data accordingly
all_data = []
all_human_data = []
all_grover_data = []
reddit_human_data = []
reddit_grover_data = []
news_human_data = []
news_grover_data = []

# Get dictionary{txtfile: idx}; idx is the document's position in
# all_human_data / all_grover_data respectively
all_human_txtfile_idx = defaultdict()
all_grover_txtfile_idx = defaultdict()

h_idx = 0
g_idx = 0

for source, author, tokens, txtfile in preprocessed_data:
  is_human = author == 'h'
  all_data.append(tokens)

  if is_human:
    all_human_data.append(tokens)
    # Unannotated documents carry '' as their filename; skipping them keeps a
    # bogus '' key (overwritten on every unannotated document) out of the map.
    if txtfile:
      all_human_txtfile_idx[txtfile] = h_idx
    h_idx += 1
  else:
    all_grover_data.append(tokens)
    if txtfile:
      all_grover_txtfile_idx[txtfile] = g_idx
    g_idx += 1

  # Everything that is not reddit is treated as news.
  if source == 'reddit':
    (reddit_human_data if is_human else reddit_grover_data).append(tokens)
  else:
    (news_human_data if is_human else news_grover_data).append(tokens)

print(all_data[:2])
print(all_texts[:2])

In [None]:
# Inspect the human {annotation filename: index into all_human_data} mapping
all_human_txtfile_idx

In [None]:
# Calculate average lengths of prompts/generations for normalization
def _length_stats(pairs):
  """Summarise whitespace-token lengths of (prompt, generation, ...) tuples.

  Returns:
    (count, prompt_avg, generation_avg, prompt_min, prompt_max,
    generation_min, generation_max). For empty input the averages are 0, the
    minima are float('inf') and the maxima are 0.
  """
  prompt_lens = [len(pair[0].split()) for pair in pairs]
  generation_lens = [len(pair[1].split()) for pair in pairs]
  count = len(pairs)
  # Guard the division: an empty group used to raise ZeroDivisionError.
  prompt_avg = sum(prompt_lens) / count if count else 0
  generation_avg = sum(generation_lens) / count if count else 0
  return (count, prompt_avg, generation_avg,
          min(prompt_lens, default=float('inf')), max(prompt_lens, default=0),
          min(generation_lens, default=float('inf')), max(generation_lens, default=0))


def _print_length_stats(name, prompt_avg, generation_avg,
                        prompt_min, prompt_max, generation_min, generation_max):
  """Print one group's statistics in the fixed order used by this notebook."""
  print(f"{name} Prompt Avg Length: ", prompt_avg)
  print(f"{name} Generation Avg Length: ", generation_avg)
  print(f"{name} Prompt Min: ", prompt_min)
  print(f"{name} Prompt Max: ", prompt_max)
  print(f"{name} Generation Min: ", generation_min)
  print(f"{name} Generation Max: ", generation_max)


# 'reddit' is its own group; every other source counts as news.
reddit_pairs = [pair for source, texts in data.items() if source == 'reddit' for pair in texts]
news_pairs = [pair for source, texts in data.items() if source != 'reddit' for pair in texts]

(reddit_c, reddit_prompt_avg_len, reddit_generation_avg_len,
 reddit_prompt_min, reddit_prompt_max,
 reddit_generation_min, reddit_generation_max) = _length_stats(reddit_pairs)

(news_c, news_prompt_avg_len, news_generation_avg_len,
 news_prompt_min, news_prompt_max,
 news_generation_min, news_generation_max) = _length_stats(news_pairs)

_print_length_stats("Reddit", reddit_prompt_avg_len, reddit_generation_avg_len,
                    reddit_prompt_min, reddit_prompt_max,
                    reddit_generation_min, reddit_generation_max)
print("")
_print_length_stats("News", news_prompt_avg_len, news_generation_avg_len,
                    news_prompt_min, news_prompt_max,
                    news_generation_min, news_generation_max)

# run model

In [None]:
# Create Dictionary (token <-> integer id) from every preprocessed document
id2word = corpora.Dictionary(all_data)

# Create Corpus
texts = all_data

# Term Document Frequency: each document becomes its bag-of-words form
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Build LDA model
# num_topics is read again by the averaging, counting and plotting cells below.
num_topics = 18
# random_state is pinned to a fixed seed; alpha='auto' and eta='auto' ask gensim
# to learn the priors from the corpus instead of using fixed values.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           eta='auto',
                                           per_word_topics=True)

# coherence values

In [None]:
# Obtain coherence model ('c_v' measure, computed over the preprocessed token lists)
cm = CoherenceModel(model=lda_model, texts=all_data, corpus=corpus, dictionary=id2word, coherence='c_v')

# Obtain segmented topics
segmented_topics = cm.segment_topics()

# Calculate topic specific coherence values (plotted in the next cell, one bar per topic)
coherence_values = cm.get_coherence_per_topic(segmented_topics)

In [None]:
# Configure specifications for bar plot
X = np.arange(1, num_topics + 1)  # topics are numbered from 1 on the x axis
fig, ax = plt.subplots(figsize=(30,10))
# Single series: centre each bar on its tick, and label it so that
# ax.legend() below has an entry to show (it previously had none).
ax.bar(X, coherence_values, color='orange', width=0.4, label='Coherence (c_v)')

# Add labels, title, and custom x-axis tick labels
ax.set_xlabel('Topic', fontsize=20)
ax.set_ylabel('Coherence Values', fontsize=20)
ax.set_title('Topic-Specific Coherence Values', fontsize=20)
ax.set_xticks(X)
ax.legend()

# topic distribution preprocessing



In [None]:
# Convert documents to BoW format
def convert_to_bow(words_set):
  """Return the bag-of-words form of every token list in ``words_set``.

  Uses the global ``id2word`` dictionary; output order matches input order.
  """
  return [id2word.doc2bow(words) for words in words_set]

# Obtain topic distribution for all documents
def get_topic_dist(documents, pwt):
  """Query the global ``lda_model`` for the topics of each bag-of-words document.

  Args:
    documents: iterable of bag-of-words documents.
    pwt: when truthy, request per-word topic information as well.

  Returns:
    A list with one ``get_document_topics`` result per document, in input order.
  """
  def infer(document):
    if pwt:
      return lda_model.get_document_topics(document, per_word_topics=True)
    return lda_model.get_document_topics(document)

  return [infer(document) for document in documents]

# Convert data to BoW
all_human_bow = convert_to_bow(all_human_data)
all_grover_bow = convert_to_bow(all_grover_data)
reddit_human_bow = convert_to_bow(reddit_human_data)
reddit_grover_bow = convert_to_bow(reddit_grover_data)
news_human_bow = convert_to_bow(news_human_data)
news_grover_bow = convert_to_bow(news_grover_data)

# Obtain distribution (pwt = False)
all_human_dist = get_topic_dist(all_human_bow, False)
all_grover_dist = get_topic_dist(all_grover_bow, False)
reddit_human_dist = get_topic_dist(reddit_human_bow, False)
reddit_grover_dist = get_topic_dist(reddit_grover_bow, False)
news_human_dist = get_topic_dist(news_human_bow, False)
news_grover_dist = get_topic_dist(news_grover_bow, False)

# Make a copy
# A plain assignment only creates a second name for the same lists, and
# calc_words()/normalize_wc() later rewrite the *_dist lists in place. Copying
# each per-document list keeps the raw topic probabilities intact here (the
# (topic_id, value) tuples are immutable, so one level of copying is enough).
all_human_dist_copy = [list(doc_dist) for doc_dist in all_human_dist]
all_grover_dist_copy = [list(doc_dist) for doc_dist in all_grover_dist]
reddit_human_dist_copy = [list(doc_dist) for doc_dist in reddit_human_dist]
reddit_grover_dist_copy = [list(doc_dist) for doc_dist in reddit_grover_dist]
news_human_dist_copy = [list(doc_dist) for doc_dist in news_human_dist]
news_grover_dist_copy = [list(doc_dist) for doc_dist in news_grover_dist]

# average word counts across topics

In [None]:
# Get the number of words contributed by topics
def calc_words(dist, dist_pwt):
  """Scale every topic weight in ``dist`` by its document's word count, in place.

  Args:
    dist: list of per-document lists of ``(topic_id, weight)`` tuples; modified in place.
    dist_pwt: parallel list whose item ``[1]`` is a sized collection; its length
      is used as the document's word count.
  """
  for doc_no, doc_topics in enumerate(dist):
    n_words = len(dist_pwt[doc_no][1])
    doc_topics[:] = [(entry[0], entry[1] * n_words) for entry in doc_topics]

# Normalize word contributions
def normalize_wc(dist):
  """Min-max scale every ``(topic_id, value)`` pair in ``dist`` to [0, 1], in place.

  The minimum and maximum are taken over all values of all documents in ``dist``.

  Args:
    dist: list of per-document lists of ``(topic_id, value)`` tuples; modified in place.

  Notes:
    * Empty input is left untouched.
    * When every value is identical the range is zero; all values then become
      0.0 instead of raising ZeroDivisionError.
  """
  values = [tup[1] for doc_dist in dist for tup in doc_dist]
  if not values:
    return

  # True min/max of the data (the maximum is no longer floored at 0).
  x_min, x_max = min(values), max(values)
  span = x_max - x_min

  for doc_dist in dist:
    for j, tup in enumerate(doc_dist):
      wc_norm = (tup[1] - x_min) / span if span else 0.0
      doc_dist[j] = (tup[0], wc_norm)

# Get averages after normalizing data
def average(dist):
  """Average the values of ``(topic_id, value)`` pairs per topic across all documents.

  Returns:
    A list of length ``num_topics`` (global); a topic that never occurs gets 0.
    Only documents that list a topic count towards that topic's average.
  """
  totals = [0] * num_topics
  hits = [0] * num_topics
  for doc_dist in dist:
    for entry in doc_dist:
      totals[entry[0]] += entry[1]
      hits[entry[0]] += 1
  return [totals[t] / hits[t] if hits[t] != 0 else 0 for t in range(num_topics)]

In [None]:
# Obtain distribution (pwt = True)
(all_human_dist_pwt, all_grover_dist_pwt,
 reddit_human_dist_pwt, reddit_grover_dist_pwt,
 news_human_dist_pwt, news_grover_dist_pwt) = [
  get_topic_dist(_bow, True)
  for _bow in (all_human_bow, all_grover_bow,
               reddit_human_bow, reddit_grover_bow,
               news_human_bow, news_grover_bow)
]

# Each plain distribution paired with its per-word-topics counterpart
_dist_groups = [
  (all_human_dist, all_human_dist_pwt),
  (all_grover_dist, all_grover_dist_pwt),
  (reddit_human_dist, reddit_human_dist_pwt),
  (reddit_grover_dist, reddit_grover_dist_pwt),
  (news_human_dist, news_human_dist_pwt),
  (news_grover_dist, news_grover_dist_pwt),
]

# WARNING: the next two steps rewrite the *_dist lists in place, so running this
# cell a second time scales and normalizes already-processed values.

# Calculate number of words per document
for _dist, _dist_pwt in _dist_groups:
  calc_words(_dist, _dist_pwt)

# Normalize word contributions
for _dist, _dist_pwt in _dist_groups:
  normalize_wc(_dist)

# Calculate average number of words
(all_human_avg, all_grover_avg,
 reddit_human_avg, reddit_grover_avg,
 news_human_avg, news_grover_avg) = [average(_dist) for _dist, _dist_pwt in _dist_groups]

In [None]:
# Configure specifications for bar plot (ALL DATA)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30, 10))
# Two bars per topic, side by side around each tick: human left, Grover right
for _values, _shift, _color, _label in ((all_human_avg, -0.1, 'b', 'Human'),
                                        (all_grover_avg, 0.1, 'r', 'Grover')):
  ax.bar(X + _shift, _values, color=_color, width=0.2, label=_label)

# Add labels, title, and custom x-axis tick labels
_font = {'fontsize': 20}
ax.set_ylabel('Average word contribution (after normalized)', **_font)
ax.set_title('Word Count Contributions across Topics between Human and Grover (ALL DATA)', **_font)
ax.set_xticks(X)
ax.legend()

In [None]:

# Configure specifications for bar plot (REDDIT)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30, 10))
# Two bars per topic, side by side around each tick: human left, Grover right
for _values, _shift, _color, _label in ((reddit_human_avg, -0.1, 'b', 'Human'),
                                        (reddit_grover_avg, 0.1, 'r', 'Grover')):
  ax.bar(X + _shift, _values, color=_color, width=0.2, label=_label)

# Add labels, title, and custom x-axis tick labels
_font = {'fontsize': 20}
ax.set_ylabel('Average word contribution (after normalized)', **_font)
ax.set_title('Word Count Contributions across Topics between Human and Grover (REDDIT)', **_font)
ax.set_xticks(X)
ax.legend()

In [None]:
# Configure specifications for bar plot (NEWS)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30, 10))
# Two bars per topic, side by side around each tick: human left, Grover right
for _values, _shift, _color, _label in ((news_human_avg, -0.1, 'b', 'Human'),
                                        (news_grover_avg, 0.1, 'r', 'Grover')):
  ax.bar(X + _shift, _values, color=_color, width=0.2, label=_label)

# Add labels, title, and custom x-axis tick labels
_font = {'fontsize': 20}
ax.set_ylabel('Average word contribution (after normalized)', **_font)
ax.set_title('Word Count Contributions across Topics between Human and Grover (NEWS)', **_font)
ax.set_xticks(X)
ax.legend()

# difference in average word counts across topics

In [None]:
# Calculates difference in distributions
def calc_difs(human, grover):
  """Per document pair, subtract Grover's topic values from the human ones.

  Args:
    human: list of per-document ``(topic_id, value)`` lists.
    grover: parallel list with at least as many documents as ``human``.

  Returns:
    One list of ``(topic_id, human_value - grover_value)`` tuples per document.
    A topic missing on one side counts as 0 there. Topics appear in the human
    document's order, followed by Grover-only topics in Grover's order.
  """
  res = []
  for i in range(len(human)):
    human_vals = dict(human[i])
    grover_vals = dict(grover[i])
    ordered_topics = list(human_vals) + [t for t in grover_vals if t not in human_vals]
    res.append([(t, human_vals.get(t, 0) - grover_vals.get(t, 0)) for t in ordered_topics])
  return res

In [None]:
# Dump the human and Grover per-document topic values, one list per line
print(all_human_dist, all_grover_dist, sep='\n')

In [None]:
# Calculate differences in topic contributions (human minus Grover, per document pair)
all_difs, reddit_difs, news_difs = (
  calc_difs(all_human_dist, all_grover_dist),
  calc_difs(reddit_human_dist, reddit_grover_dist),
  calc_difs(news_human_dist, news_grover_dist),
)

# Calculate average of differences (per topic)
all_difs_avg, reddit_difs_avg, news_difs_avg = map(average, (all_difs, reddit_difs, news_difs))

In [None]:
# Configure specifications for bar plot (ALL DATA)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30, 10))
ax.bar(X - 0.1, all_difs_avg, width=0.4, color='r', label='Human minus Grover')

# Add labels, title, and custom x-axis tick labels
_font = {'fontsize': 20}
# NOTE(review): the y label is empty; "average number of words per topic" was noted as a candidate.
ax.set_ylabel('', **_font)
ax.set_title('Average Differences in Word Count Contributions between Human and Grover (ALL DATA)', **_font)
ax.set_xticks(X)
ax.legend()

In [None]:
# Configure specifications for bar plot (REDDIT)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30, 10))
ax.bar(X - 0.1, reddit_difs_avg, width=0.4, color='r', label='Human minus Grover')

# Add labels, title, and custom x-axis tick labels
_font = {'fontsize': 20}
# NOTE(review): the y label is empty; "average number of words per topic" was noted as a candidate.
ax.set_ylabel('', **_font)
ax.set_title('Average Differences in Word Count Contributions between Human and Grover (REDDIT)', **_font)
ax.set_xticks(X)
ax.legend()

In [None]:
# Configure specifications for bar plot (NEWS)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30, 10))
ax.bar(X - 0.1, news_difs_avg, width=0.4, color='r', label='Human minus Grover')

# Add labels, title, and custom x-axis tick labels
_font = {'fontsize': 20}
# NOTE(review): the y label is empty; "average number of words per topic" was noted as a candidate.
ax.set_ylabel('', **_font)
ax.set_title('Average Differences in Word Count Contributions between Human and Grover (NEWS)', **_font)
ax.set_xticks(X)
ax.legend()

# documents with topic weight more than 50% above the uniform distribution

In [None]:
# Count number of documents 50% higher than uniform distribution
def count_high_threshold(dist):
  """Count, per topic, the documents whose value for that topic exceeds 1.5 / num_topics.

  Args:
    dist: list of per-document lists of ``(topic_id, value)`` tuples.

  Returns:
    A list of length ``num_topics`` (global) with one count per topic.
  """
  cutoff = 1.5/num_topics  # 1/num_topics is the uniform share; 1.5x is "50% higher"
  tallies = [0] * num_topics
  for pair in (entry for doc in dist for entry in doc):
    if pair[1] > cutoff:
      tallies[pair[0]] += 1
  return tallies

In [None]:
# Count number of documents higher than 50% threshold, for each of the six groups
(all_human_higher, all_grover_higher,
 reddit_human_higher, reddit_grover_higher,
 news_human_higher, news_grover_higher) = map(count_high_threshold, (
  all_human_dist_copy, all_grover_dist_copy,
  reddit_human_dist_copy, reddit_grover_dist_copy,
  news_human_dist_copy, news_grover_dist_copy,
))

In [None]:
# Configure specifications for bar plot (ALL DATA)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30,10))
ax.bar(X - 0.1, all_human_higher, color='b', width=0.2, label='Human')
ax.bar(X + 0.1, all_grover_higher, color = 'r', width=0.2, label='Grover')

# Add labels, title, and custom x-axis tick labels
# The threshold text is derived from num_topics so it cannot go stale when the
# topic count changes (for 18 topics it reads "1.5/18 ~= 0.083", as before).
ax.set_ylabel(f'# of Documents, Threshold = 1.5/{num_topics} ~= {1.5 / num_topics:.3f}', fontsize=20)
ax.set_title('Number of Documents 50% Higher than Uniform Distribution Across Topics between Human and Grover (ALL DATA)', fontsize=20)
ax.set_xticks(X)
ax.legend()

In [None]:
# Configure specifications for bar plot (REDDIT)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30,10))
ax.bar(X - 0.1, reddit_human_higher, color='b', width=0.2, label='Human')
ax.bar(X + 0.1, reddit_grover_higher, color = 'r', width=0.2, label='Grover')

# Add labels, title, and custom x-axis tick labels
# The threshold text is derived from num_topics so it cannot go stale when the
# topic count changes (for 18 topics it reads "1.5/18 ~= 0.083", as before).
ax.set_ylabel(f'# of Documents, Threshold = 1.5/{num_topics} ~= {1.5 / num_topics:.3f}', fontsize=20)
ax.set_title('Number of Documents 50% Higher than Uniform Distribution Across Topics between Human and Grover (REDDIT)', fontsize=20)
ax.set_xticks(X)
ax.legend()

In [None]:
# Configure specifications for bar plot (NEWS)
X = np.arange(1, num_topics + 1)
fig, ax = plt.subplots(figsize=(30,10))
ax.bar(X - 0.1, news_human_higher, color='b', width=0.2, label='Human')
ax.bar(X + 0.1, news_grover_higher, color = 'r', width=0.2, label='Grover')

# Add labels, title, and custom x-axis tick labels
# The threshold text is derived from num_topics so it cannot go stale when the
# topic count changes (for 18 topics it reads "1.5/18 ~= 0.083", as before).
ax.set_ylabel(f'# of Documents, Threshold = 1.5/{num_topics} ~= {1.5 / num_topics:.3f}', fontsize=20)
ax.set_title('Number of Documents 50% Higher than Uniform Distribution Across Topics between Human and Grover (NEWS)', fontsize=20)
ax.set_xticks(X)
ax.legend()

# single-most/least representative documents

In [None]:
# Get texts for Reddit/News: split all_texts by source, keeping the original order
reddit_texts = list(filter(lambda text: text[0] == 'reddit', all_texts))
news_texts = list(filter(lambda text: text[0] != 'reddit', all_texts))

In [None]:
# Document that is single-most/least representative of differences in topic contributions
def most_representative(difs):
  """Find the largest positive and the most negative difference across all documents.

  Args:
    difs: list of per-document lists of ``(topic_id, difference)`` tuples.

  Returns:
    ``(human_idx, grover_idx, human_most, grover_most)``: the document indices and
    the winning tuples. When no positive (or negative) difference exists, the
    index is -1 and the tuple is the placeholder ``(-1, 0)``.
  """
  top_human, top_human_doc = (-1, 0), -1
  top_grover, top_grover_doc = (-1, 0), -1
  for doc_no, doc in enumerate(difs):
    for entry in doc:
      value = entry[1]
      if value > top_human[1]:
        top_human, top_human_doc = entry, doc_no
      if value < top_grover[1]:
        top_grover, top_grover_doc = entry, doc_no

  return top_human_doc, top_grover_doc, top_human, top_grover

def least_representative(difs):
  """Find the positive and the negative difference that lie closest to zero.

  Args:
    difs: list of per-document lists of ``(topic_id, difference)`` tuples.

  Returns:
    ``(human_idx, grover_idx, human_least, grover_least)``. When no positive (or
    negative) difference exists, the index is -1 and the tuple is the placeholder
    ``(-1, inf)`` (or ``(-1, -inf)``). Exact zeros are ignored.
  """
  closest_pos, closest_pos_doc = (-1, float('inf')), -1
  closest_neg, closest_neg_doc = (-1, float('-inf')), -1
  for doc_no, doc in enumerate(difs):
    for entry in doc:
      value = entry[1]
      if 0 < value < closest_pos[1]:
        closest_pos, closest_pos_doc = entry, doc_no
      if closest_neg[1] < value < 0:
        closest_neg, closest_neg_doc = entry, doc_no

  return closest_pos_doc, closest_neg_doc, closest_pos, closest_neg

def find_print_results(most, human_idx, grover_idx, human_res, grover_res, all, reddit, news):
  """Print the human/Grover text pair behind a most/least-representative result.

  Args:
    most: True to label the output "Most Representative", False for "Least".
    human_idx, grover_idx: document indices from most_/least_representative;
      a negative index means nothing was found.
    human_res, grover_res: the ``(topic_id, difference)`` tuples to print.
    all, reddit, news: select the text list to index (all_texts, reddit_texts
      or news_texts). If several are set, news wins over reddit, which wins
      over all, as before.

  Raises:
    ValueError: if none of ``all``, ``reddit``, ``news`` is set.
  """
  if news:
    texts = news_texts
  elif reddit:
    texts = reddit_texts
  elif all:
    texts = all_texts
  else:
    raise ValueError('one of all, reddit or news must be True')

  qualifier = 'Most' if most else 'Least'
  for author, res, idx in (('Human', human_res, human_idx), ('Grover', grover_res, grover_idx)):
    print(f'{qualifier} Representative {author} Document:')
    print(res, idx)
    if idx < 0:
      # -1 is the "nothing found" marker; indexing with it would silently
      # print the last document of the list instead.
      print('(no matching document found)')
      continue
    rep = texts[idx]
    print('SOURCE:', rep[0])
    print('HUMAN:', rep[1])
    print('GROVER:', rep[2])

In [None]:
# Most/least representative documents over ALL data.
# The returned indices are positions in all_difs and are used to index all_texts.
all_hidx, all_gidx, human_res, grover_res = most_representative(all_difs)
find_print_results(True, all_hidx, all_gidx, human_res, grover_res, True, False, False)
all_hidx, all_gidx, human_res, grover_res = least_representative(all_difs)
find_print_results(False, all_hidx, all_gidx, human_res, grover_res, True, False, False)

In [None]:
# Most/least representative documents over REDDIT data.
# The returned indices are positions in reddit_difs and are used to index reddit_texts.
reddit_hidx, reddit_gidx, human_res, grover_res = most_representative(reddit_difs)
find_print_results(True, reddit_hidx, reddit_gidx, human_res, grover_res, False, True, False)
reddit_hidx, reddit_gidx, human_res, grover_res = least_representative(reddit_difs)
find_print_results(False, reddit_hidx, reddit_gidx, human_res, grover_res, False, True, False)

In [None]:
# Most/least representative documents over NEWS data.
# The returned indices are positions in news_difs and are used to index news_texts.
news_hidx, news_gidx, human_res, grover_res = most_representative(news_difs)
find_print_results(True, news_hidx, news_gidx, human_res, grover_res, False, False, True)
news_hidx, news_gidx, human_res, grover_res = least_representative(news_difs)
find_print_results(False, news_hidx, news_gidx, human_res, grover_res, False, False, True)

# topic words analysis

In [None]:
# Obtain most probable words under each topic
def get_topic_words():
  """Print and return the top 40 terms of every topic of the global ``lda_model``.

  Returns:
    A dict mapping ``'Topic <n>'`` (numbered from 1) to the list of that topic's
    words, in the order ``get_topic_terms`` returns them.
  """
  res = {}
  for topic_no in range(1, num_topics + 1):  # change depending on # topics
    label = f'Topic {topic_no}'
    print(label)
    # increase topn depending on if sets have limited number of unique words
    term_weights = lda_model.get_topic_terms(topic_no - 1, topn=40)
    top_words = [id2word[pair[0]] for pair in term_weights]
    print(top_words)
    res[label] = top_words
  return res
  
# {'Topic <n>': [top words]} — input to the unique-words analysis below
topic_words = get_topic_words()

In [None]:
# Unique words algorithm
def word_count(t, w, topic_words):
  """Count how often word ``w`` occurs in the word lists of topics other than ``t``.

  Returns:
    ``(count, appears)`` where ``appears`` is the last other topic (in dict order)
    that contains ``w``, or the string "unique" when no other topic does.
  """
  total = 0
  last_other = "unique"
  for topic, words in topic_words.items():
    if topic == t:
      continue
    hits = words.count(w)
    total += hits
    if hits:
      last_other = topic
  return total, last_other

# Keep, per topic, the words that occur in at most one other topic, together
# with that other topic's name ("unique" when there is none)
unique_topic_words = defaultdict(list)
for topic, words in topic_words.items():
  for word in words:
    c, appears = word_count(topic, word, topic_words)
    if c in (0, 1):
      unique_topic_words[topic].append((word, appears))

for topic, words in unique_topic_words.items():
  print(topic)
  print(words)

# visual analysis

In [None]:
# Compute Perplexity
# NOTE(review): gensim's log_perplexity() returns a per-word likelihood bound
# rather than the perplexity itself, so "lower is better" may not apply to this
# number — confirm against the gensim documentation.
print('Perplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.

# Compute Coherence Score
# `pp_data` is never defined in this notebook (NameError on a fresh kernel);
# all_data is the preprocessed token list the corpus and dictionary were built from.
coherence_model_lda = CoherenceModel(model=lda_model, texts=all_data, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

*   272 total documents
    *   Reddit: 44
    *   Fox News: 35
    *   Washington Post: 34
    *   Huffington Post: 12
    *   Breitbart: 11
    *   Total: 136 Human


In [None]:
# Visualize the topics
# NOTE(review): `import pyLDAvis` and `import pyLDAvis.gensim` are commented out in
# the setup cell, so this cell raises NameError on a fresh kernel until they are restored.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

# compute optimal number of topics

In [None]:
def compute_coherence_values(corpus, id2word, k, texts=None):
  """Train an LDA model with ``k`` topics and return its c_v coherence.

  Args:
    corpus: bag-of-words corpus to train and evaluate on.
    id2word: dictionary mapping token ids to tokens.
    k: number of topics.
    texts: tokenised documents used by the coherence measure. Defaults to the
      global ``all_data`` (the previous code referenced ``pp_data``, which this
      notebook never defines).

  Returns:
    The coherence score as returned by ``CoherenceModel.get_coherence()``.
  """
  if texts is None:
    texts = all_data

  # Same hyper-parameters as the main model, except for the topic count.
  lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=k,
                                          random_state=100,
                                          update_every=1,
                                          chunksize=100,
                                          passes=10,
                                          alpha='auto',
                                          eta='auto',
                                          per_word_topics=True)

  coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')

  return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
topics_range = range(2, 30, 1)

# Alpha parameters (not used while the alpha/beta sweep below stays disabled)
alpha = [0.01, 0.1, 1]
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameters (not used while the alpha/beta sweep below stays disabled)
beta = [0.01, 0.1, 1]
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [corpus] # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75))
corpus_title = ['100% Corpus'] # '75% Corpus'
model_results = {'Validation_Set': [], 'Topics': [], 'Alpha': [], 'Beta': [], 'Coherence': []}

# One tick per trained model. The previous hard-coded total of 360 did not match
# the number of iterations of the loop below, so the bar never completed.
pbar = tqdm.tqdm(total=len(corpus_sets) * len(topics_range))

try:
  # Begin iterating through validation corpuses and # topics (alpha/beta are fixed to 'auto')
  for i in range(len(corpus_sets)):
    for k in topics_range:
      # for a in alpha:
      #   for b in beta:
      # Get the coherence score for the given parameters
      cv = compute_coherence_values(corpus=corpus_sets[i], id2word=id2word, k=k)
      # Save the model results
      model_results['Validation_Set'].append(corpus_title[i])
      model_results['Topics'].append(k)
      model_results['Alpha'].append('auto')
      # Plain string like Alpha; a one-element list was written to the CSV as "['auto']".
      model_results['Beta'].append('auto')
      model_results['Coherence'].append(cv)

      pbar.update(1)
finally:
  # Close the bar even when a training run fails part-way through.
  pbar.close()

pd.DataFrame(model_results).to_csv(f'{path}tuning_results/all_auto.csv', index=False)