In [1]:
import numpy as np
import json
import argparse
import os
import random
random.seed(2023)

import gensim.corpora as corpora
from tqdm import tqdm
from model import ThetaRoleModel



In [2]:
# Option table for the theta role model: (flag, value type, default).
# Every flag accepts at most one value (nargs='?').
_CLI_OPTIONS = (
    # n topics and n latent theta roles
    ('--K', int, 5),
    ('--T', int, 2),
    # dirichlet initialization hyper parameters (static)
    ('--alpha', float, 0.1),
    ('--eta', float, 0.1),
    ('--gamma', float, 0.1),
    ('--lam', float, 0.1),
    # number of iterations and location of the processed corpus
    ('--n_iters', int, 20),
    ('--corpus_path', str, "data/processed/corpus.json"),
)

parser = argparse.ArgumentParser()
for _opt_flag, _opt_type, _opt_default in _CLI_OPTIONS:
    parser.add_argument(_opt_flag, nargs='?', type=_opt_type, default=_opt_default)

# Parsing is left disabled in this notebook; the parser is only declared.
#args = parser.parse_args()

_StoreAction(option_strings=['--corpus_path'], dest='corpus_path', nargs='?', const=None, default='data/processed/corpus.json', type=<class 'str'>, choices=None, help=None, metavar=None)

# prepare the corpus:

In [3]:
from malt import MaltParser # source code from nltk library

In [4]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from tqdm import tqdm
import json
#from datasets import load_dataset
from gensim import utils
from collections import defaultdict

import re

from word import Word
# from malt.malt import MaltParser # source code from nltk library
from malt import MaltParser # source code from nltk library

def process_dataset(docs, stop_words, malt_parser_version='../maltparser-1.7.2', model_version='engmalt.linear-1.7.mco'):
    """Dependency-parse ``docs`` and collect each kept word with its relation labels.

    Every document is cleaned with a few regex substitutions, lower-cased and
    split into sentences. Each sentence is parsed with MaltParser, and every
    word that takes part in at least one dependency relation is recorded with
    the labels of those relations: ``"<reln>.gov"`` when the word governs the
    relation, ``"<reln>.dep"`` when it is the dependent.

    Args:
        docs: iterable of document strings.
        stop_words: iterable of words to exclude. It is NOT modified; the
            tokens 'amp', '&amp' and '&amp;' are always excluded in addition.
        malt_parser_version: first positional argument given to ``MaltParser``.
        model_version: second positional argument given to ``MaltParser``.

    Returns:
        A dict with three keys:

        * ``'documents'``: ``{doc_idx: {'words', 'relns', 'originaltext'}}``,
          one entry per document that yielded at least one sentence.
          ``'words'`` and ``'relns'`` are parallel lists (one relation-label
          list per kept word). ``'originaltext'`` is a one-element list
          holding the document text *after* the regex clean-up, or ``[]`` if
          no word of the document was kept.
        * ``'vocab'``: sorted list of the distinct kept words.
        * ``'vocab_relns'``: sorted list of the distinct relation labels.

        When no sentence can be extracted (e.g. ``docs`` is empty) the result
        is ``{'documents': {}, 'vocab': [], 'vocab_relns': []}`` and the parser
        is never started.
    """
    # Text clean-up, applied to every document in this order:
    # 1. curly single quotes -> straight quote, so stop-word contractions match
    docs = [re.sub("[\u2018\u2019]", "'", doc) for doc in docs]
    # (digit removal is currently disabled)
    # docs = [re.sub(r'\d+', '', doc) for doc in docs]
    # 2. drop spans running from a "/" to the next ".<word chars or colons>"
    docs = [re.sub(r'(\/.*?\.[\w:]+)', '', doc) for doc in docs]
    # 3. drop everything from "http" up to the next whitespace
    docs = [re.sub(r"http\S+", '', doc) for doc in docs]

    # create <doc_idx, tokenized_sent> list of sents (lower-cased text)
    sents = [
        (i, nltk.word_tokenize(sent))
        for i, doc in enumerate(docs)
        for sent in nltk.sent_tokenize(utils.to_unicode(str(doc).lower()))
    ]

    # Nothing to parse: zip(*sents) below would fail to unpack, so return early.
    if not sents:
        return {'documents': {}, 'vocab': [], 'vocab_relns': []}

    # unzip list of tuples; doc_idxs[n] is the document owning sentence n
    doc_idxs, sents = zip(*sents)

    # initialize the malt parser model and the per-sentence parse generator
    mp = MaltParser(malt_parser_version, model_version, tagger=nltk.pos_tag)
    parser = mp.parse_sents(sents, verbose=True)

    # Work on a private set: constant-time lookups, and the caller's
    # stop-word collection is left untouched.
    excluded_words = set(stop_words) | {'amp', '&amp', '&amp;'}

    def valid_word(word):
        # keep purely alphabetic words longer than two characters that are not excluded
        return (
            isinstance(word, str)
            and word not in excluded_words
            and word.isalpha()
            and len(word) > 2
        )

    # initialize dictionary for json output
    docs_dict = {
        'documents': dict((doc_idx, {'words': [], 'relns': [], 'originaltext': []}) for doc_idx in doc_idxs),
        'vocab': [],
        'vocab_relns': [],
    }
    # vocab variables are sets while collecting (no duplicates)
    vocab = set()
    vocab_relns = set()

    # enumerate keeps the sentence position in step with doc_idxs even when a
    # sentence is skipped below
    for sent_pos, list_it in enumerate(parser):
        tree = next(list_it, None)
        # check if valid tree, if not skip
        try:
            nodes = tree.nodes
        except AttributeError:
            continue

        word_relns_hash = defaultdict(list)
        for word_idx in nodes:
            if word_idx == 0:  # skip the first node
                continue

            deps = nodes[word_idx]['deps']
            if not deps:
                continue

            for reln, idxs in deps.items():
                dep_reln, gov_reln = f"{reln}.dep", f"{reln}.gov"
                for idx in idxs:
                    # add relations to vocab
                    vocab_relns.add(dep_reln)
                    vocab_relns.add(gov_reln)

                    # add reln to word in hashmap
                    word_relns_hash[idx].append(dep_reln)  # dependent word
                    word_relns_hash[word_idx].append(gov_reln)  # governing (current) word

        if word_relns_hash:
            doc_idx = doc_idxs[sent_pos]
            doc_entry = docs_dict['documents'][doc_idx]

            # store every valid word together with its relation labels
            for word_idx, relns in word_relns_hash.items():
                word = nodes[word_idx]['word']
                if not valid_word(word):
                    continue
                vocab.add(word)
                doc_entry['words'].append(word)
                doc_entry['relns'].append([reln for reln in relns if reln != "punct.gov"])
                # keep the (cleaned) document text next to its words
                doc_entry['originaltext'] = [docs[doc_idx]]

    # sorted so the vocabularies come out in the same order on every run
    docs_dict['vocab'] = sorted(vocab)
    docs_dict['vocab_relns'] = sorted(vocab_relns)

    return docs_dict

# Prepare the data for the theta role model:

In [5]:
# set the stop words
import pickle
# with open("../Stopword_list",'rb') as read_file:
# Load the extra stop words from the pickled file 'extra_stopwords' (path relative
# to the working directory).
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('extra_stopwords','rb') as read_file:
    more_stop_words = pickle.load(read_file)

In [6]:
#testing block:
# NOTE(review): `data` is reassigned from `df` in a later cell (data=df.loc[...]),
# so the frame read here is not the one the rest of the notebook ends up using.
data = pd.read_csv("../tweets_nytimes.csv")
# data.head()

In [7]:
# Read the tweet text and id from every news-media CSV file into a single
# DataFrame `df` with columns ['text', 'id'].
import csv

columns = ['text', 'id']
news_media_files = ['tweets_nytimes.csv']
#news_media_files = ['tweets_nytimes.csv', 'tweets_WSJ.csv','tweets_USATODAY.csv']

# Positions of the wanted fields inside each CSV row.
TEXT_COL, ID_COL = 7, 1

# Collect plain records first and build the frame once: concatenating a
# one-row frame per CSV line is quadratic and gives every row index 0.
records = []
for item in news_media_files:
    # newline='' lets the csv module handle line breaks inside quoted fields
    with open("../"+item, "r", newline='') as csvfile:
        csv_content = csv.reader(csvfile, delimiter=',')
        next(csv_content, None)  # skip the header row (no error on an empty file)
        records.extend([row[TEXT_COL], row[ID_COL]] for row in csv_content)

df = pd.DataFrame(records, columns=columns)

In [8]:
# Pick the corpus texts and assemble the full stop-word list.
#[UNCOMMENT] uncomment the next line to load your own dataset instead of the frame built above
#df = pd.read_csv("../tweets_nytimes.csv")
data = df[["id", "text"]]
corpus = data["text"]

# Stop words = NLTK's English list followed by the extra words loaded earlier.
# Alternative source for the extra words (one per line):
# with open("data/utils/stopwords.txt") as f:
#     more_stop_words = f.read().splitlines()
stop_words = [*nltk.corpus.stopwords.words('english'), *more_stop_words]

In [49]:
#testing block
# docs_dict = process_dataset(corpus, stop_words)
# docs_dict['documents'][0]

In [9]:
# Parse the corpus into per-document words and dependency relations
# (the stored entries also carry each document's text).
docs_dict = process_dataset(corpus, stop_words)

# Serialize the result as indented JSON text.
json_object = json.dumps(docs_dict, indent=4)

# Persist the JSON text; other file names used previously:
#   "all_news_tweets_corpus.json", "news_tweets_corpus.json"
corpus_json_path = "ny_news_tweets_corpus.json"
with open(corpus_json_path, "w") as out_file:
    out_file.write(json_object)

-----------------------------------------------------------------------------
                          MaltParser 1.7.2                             
-----------------------------------------------------------------------------
         MALT (Models and Algorithms for Language Technology) Group          
             Vaxjo University and Uppsala University                         
                             Sweden                                          
-----------------------------------------------------------------------------

Started: Thu Jun 15 10:00:48 EDT 2023
  Transition system    : Projective
  Parser configuration : Stack
  Feature model        : eng-liblinear.xml
  Classifier           : liblinear
  Data Format          : /engmalt.linear-1.7/conllx.xml
.          	      1	      3s	    365MB
.          	     10	      3s	    365MB
.          	    100	      4s	    409MB
..........	   1000	      8s	    508MB
..........	   2000	     12s	    500MB
..........	   3000	     15s

In [10]:
# Model settings used for the run below. They differ from the argparse
# defaults declared earlier in the notebook (T=2, n_iters=20 there).
K = 5   # passed to ThetaRoleModel as K
T = 5   # passed to ThetaRoleModel as T

# All prior hyperparameters share the same value.
alpha = eta = etaprime = gamma = lam = omega = 0.1

n_iters = 10

In [10]:
# testing block. 
# type(corpus)
# # corpus[0]
# # o['documents']
# o = json_object
# [TODO]: change to BSON instead of JSON for faster io and smaller storage
# document preprocessing helpers
# theta_model.initialize_variables()
# theta_model.fit()

In [11]:
# [TODO]: change to BSON instead of JSON for faster io and smaller storage
# Reload the processed corpus written earlier; other candidate files:
#   "all_news_tweets_corpus.json", "news_tweets_corpus.json"
with open("ny_news_tweets_corpus.json") as json_file:
    o = json.load(json_file)

# Each document entry holds 'words', 'relns' and 'originaltext'.
doc_objects = o['documents']

# Split the entries into three parallel lists, in the dict's iteration order.
docs, text, doc_relns = [], [], []
for doc_entry in doc_objects.values():
    docs.append(doc_entry['words'])
    text.append(doc_entry['originaltext'])
    doc_relns.append(doc_entry['relns'])

vocab = o['vocab']
vocab_relns = o['vocab_relns']

In [12]:


# Lookup structures for the model inputs.
id2word = corpora.Dictionary(docs)
reln2id = dict(zip(vocab_relns, range(len(vocab_relns))))

# NOTE: `corpus` is rebound here; from now on it is the list of per-document
# word-id lists rather than the raw text column used earlier.
corpus = [id2word.doc2idx(doc_words) for doc_words in docs]

# Stored text of every document, keyed by the document id from the JSON file.
originaltext = {
    doc_id: doc_entry['originaltext']
    for doc_id, doc_entry in doc_objects.items()
}

# Scalars from the plate diagram.
# Relation inventory: https://universaldependencies.org/u/dep/
D = len(docs)          # n documents
V = len(vocab)         # n words
R = len(vocab_relns)   # n relns

# Build the theta role model and initialize its variables.
theta_model = ThetaRoleModel(
    corpus, originaltext, doc_relns, vocab_relns, id2word, reln2id,
    n_iters, K, T, D, V, R,
    alpha, eta, etaprime, gamma, lam, omega,
)
theta_model.initialize_variables()

In [15]:
# Inspect the model's p_t_reln_w_z_d attribute; the output recorded for this
# cell is an all-zero array of length 5.
theta_model.p_t_reln_w_z_d

array([0., 0., 0., 0., 0.])

In [None]:
# fit the theta role model (ThetaRoleModel is imported from model.py, which is not shown here)
theta_model.fit()

# compute matrices
theta_model.compute_matrices()

# print topics, theta roles, and top topics/theta roles for each document
theta_model.print_all()

In [None]:
# Show the stored text of the document keyed '0' (KeyError if the loaded
# corpus has no document with that key).
doc_objects['0']['originaltext']

In [61]:
# for doc_id in doc_objects:
#     print(doc_id)

In [62]:
# dict = {doc_id: doc_objects[str(doc_id)]['originaltext'] for doc_id in doc_objects}

In [69]:
# originaltext = list(map(lambda x: id2word.doc2idx(x), text))
# originaltext

In [54]:
# First entry of `text`: the 'originaltext' list of the first loaded document.
text[0]

['President Biden received an updated coronavirus booster shot on Tuesday, using the opportunity to plead with Americans to get boosted ahead of a possible surge in virus cases in the coming months. ']

In [70]:
# corpus
# top_documents_indx = np.argsort(theta_model.theta[:, 4])[::-1][:10]
# top_documents_indx

# Top documents for topics

In [17]:
# Print the top documents for the topics and keep the method's return value.
# NOTE(review): the return value presumably holds document indices — confirm in model.py.
top_documents_indx = theta_model.print_top_documents_topic()

doc 10475 - 0.9095238095238096 - ['On our new podcast “Sugar Calling,” best-selling author Cheryl Strayed calls up the writers she admires in search of insight and courage in an uncertain moment.\n\nUp first: George Saunders. Listen now 📞\n'] 
 
doc 16238 - 0.905 - ["Netflix password sharing has maybe gotten out of control during coronavirus lockdowns. One woman watched “Orange Is the New Black” on her husband's brother's girlfriend's uncle's account. "] 
 
doc 11347 - 0.905 - ["For today, March 22, here's how to follow the latest on the coronavirus outbreak. We'll update this every day. \n\n🌎 Live updates from around the world: \n🗽 The latest in New York: \n🏡 The effect on daily life: "] 
 
doc 24421 - 0.905 - ["Transparency builds trust. \nTransparency builds authority. \nTransparency builds credibility. \nRead this week's Backstory, on why transparency is so important, from USA TODAY editor-in-chief Nicole Carroll. \n"] 
 
doc 11448 - 0.9 - ["For today, March 20, here's how to follo

# Top documents for theta roles 

In [16]:
# Print the top documents for the theta roles and keep the method's return value.
# NOTE(review): this reuses the name `top_documents_indx` from the topic cell, so
# only the result of whichever of the two cells ran last is kept.
top_documents_indx = theta_model.print_top_documents_theta()

doc 10051 - 0.9854545454545455 - ["To help provide food amid the coronavirus pandemic, Food Bank for New York City has set up pop-up food distribution sites and is planning to deliver groceries, snacks and hygiene items to medical professionals at the city's public hospitals and clinics "] 
 
doc 15541 - 0.9849056603773585 - ["Two New York City restaurant owners are launching an organization called Safe Eats, which aims to help dining businesses navigate federal, state and city health guidelines and advertise their commitment to diners' safety with a window decal "] 
 
doc 672 - 0.9849056603773585 - ['Two days after Sarah Palin tested positive for the coronavirus, the former Alaska governor dined outdoors in New York City on Wednesday evening, defying federal guidance that infected people isolate from others for at least five full days. '] 
 
doc 675 - 0.9849056603773585 - ['Two days after Sarah Palin tested positive for the coronavirus, the former Alaska governor dined outdoors in New

In [15]:
# Display the value stored by whichever top-documents cell was executed last.
# NOTE(review): this cell's execution count ([15]) is lower than those of the
# cells above that assign the variable ([16], [17]), so cell order does not
# match execution order here.
top_documents_indx