In [1]:
import numpy as np
import json
import argparse
import os
import random
random.seed(2023)

import gensim.corpora as corpora
from tqdm import tqdm
from model import ThetaRoleModel



In [2]:
parser = argparse.ArgumentParser()

# Every option is optional-valued (nargs='?'), so the numeric ones can be
# registered from a single table of (flag, type, default):
#   --K / --T            n topics and n latent theta roles
#   --alpha ... --lam    dirichlet initialization hyper parameters (static)
#   --n_iters            iteration count
for _flag, _kind, _fallback in (
    ('--K', int, 5),
    ('--T', int, 2),
    ('--alpha', float, 0.1),
    ('--eta', float, 0.1),
    ('--gamma', float, 0.1),
    ('--lam', float, 0.1),
    ('--n_iters', int, 20),
):
    parser.add_argument(_flag, nargs='?', type=_kind, default=_fallback)

# Location of the processed corpus; registered last, as in the original cell,
# so the value echoed by the notebook stays the same.
parser.add_argument('--corpus_path', nargs='?', type=str, default="data/processed/corpus.json")
#args = parser.parse_args()

_StoreAction(option_strings=['--corpus_path'], dest='corpus_path', nargs='?', const=None, default='data/processed/corpus.json', type=<class 'str'>, choices=None, help=None, metavar=None)

# prepare the corpus:

In [3]:
from malt import MaltParser # source code from nltk library

In [4]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from tqdm import tqdm
import json
#from datasets import load_dataset
from gensim import utils
from collections import defaultdict

import re

from word import Word
# from malt.malt import MaltParser # source code from nltk library
from malt import MaltParser # source code from nltk library

def process_dataset(docs, stop_words, malt_parser_version='../maltparser-1.7.2', model_version='engmalt.linear-1.7.mco'):
    """Dependency-parse every sentence of ``docs`` and collect words with their relation labels.

    Each document is cleaned (smart single quotes straightened, path-like and
    ``http...`` fragments removed), lower-cased, split into sentences and
    tokenized with NLTK, then parsed with MaltParser. For every word that takes
    part in at least one dependency, the relation labels are recorded as
    ``"<reln>.gov"`` (the word governs the relation) or ``"<reln>.dep"`` (the
    word is the dependent).

    Args:
        docs: iterable of document strings.
        stop_words: iterable of words to discard. It is NOT modified; the
            extra tokens 'amp', '&amp' and '&amp;' are excluded as well.
        malt_parser_version: first argument handed to ``MaltParser``
            (the default looks like the parser's install directory).
        model_version: second argument handed to ``MaltParser``
            (the default looks like a model file name).

    Returns:
        dict with three keys:
            'documents': ``{doc_idx: {'words': [...], 'relns': [[...], ...],
                'originaltext': [...]}}`` for every document that produced at
                least one sentence. ``words`` and ``relns`` are parallel lists;
                ``originaltext`` holds the cleaned document text once the
                document contributed at least one valid word.
            'vocab': list of the distinct kept words.
            'vocab_relns': list of the distinct relation labels seen.
    """
    # initialize malt parser model
    mp = MaltParser(malt_parser_version, model_version, tagger=nltk.pos_tag)

    # Replace single smart quotes with straight quotes, so stop-word
    # contractions are caught.
    docs = [re.sub("[\u2018\u2019]", "'", doc) for doc in docs]
    # Strip path-like fragments ("/....ext") and then anything starting with http.
    # Raw strings keep the patterns identical while avoiding invalid-escape warnings.
    docs = [re.sub(r'(\/.*?\.[\w:]+)', '', doc) for doc in docs]
    docs = [re.sub(r"http\S+", '', doc) for doc in docs]

    # create <doc_idx, tokenized_sent> list of sents (documents are lower-cased first)
    sents = [
        (doc_idx, nltk.word_tokenize(sent))
        for doc_idx, doc in enumerate(docs)
        for sent in nltk.sent_tokenize(utils.to_unicode(str(doc).lower()))
    ]

    # Nothing to parse: return the empty structure instead of failing on zip(*[]).
    if not sents:
        return {'documents': {}, 'vocab': [], 'vocab_relns': []}

    # unzip list of tuples
    doc_idxs, sents = zip(*sents)

    # create parser <generator>; it yields one iterator per parsed sentence
    parser = mp.parse_sents(sents, verbose=True)

    # Build the exclusion set once: constant-time lookups, and the caller's
    # stop_words object is left untouched.
    excluded = set(stop_words) | {'amp', '&amp', '&amp;'}

    def valid_word(word):
        # keep alphabetic, non-stop words longer than two characters
        return (
            isinstance(word, str)
            and word not in excluded
            and word.isalpha()
            and len(word) > 2
        )

    # initialize dictionary for json output
    docs_dict = {
        'documents': dict((doc_idx, {'words': [], 'relns': [], 'originaltext': []}) for doc_idx in doc_idxs),
        'vocab': [],
        'vocab_relns': [],
    }
    # initialize vocab variables as sets (no duplicates)
    vocab = set()
    vocab_relns = set()

    # The position in the parser output is the sentence index. Using enumerate
    # keeps sentence -> document attribution aligned even when a tree is skipped.
    for sent_idx, list_it in enumerate(parser):
        tree = next(list_it)
        # check if valid tree, if not skip
        nodes = getattr(tree, 'nodes', None)
        if nodes is None:
            continue

        word_relns_hash = defaultdict(list)
        for word_idx in nodes:
            if word_idx == 0:  # skip first
                continue

            deps = nodes[word_idx]['deps']
            if not deps:
                continue

            for reln, idxs in deps.items():
                dep_reln, gov_reln = f"{reln}.dep", f"{reln}.gov"
                for idx in idxs:
                    # add relations to vocab
                    vocab_relns.add(dep_reln)
                    vocab_relns.add(gov_reln)

                    # add reln to word in hashmap
                    word_relns_hash[idx].append(dep_reln)  # append to dep word
                    word_relns_hash[word_idx].append(gov_reln)  # append to current word

        # sentences without any dependency contribute nothing
        if not word_relns_hash:
            continue

        doc_idx = doc_idxs[sent_idx]
        doc_record = docs_dict['documents'][doc_idx]

        # loop through hashmap items and append to dict for future storing
        for word_idx, relns in word_relns_hash.items():
            word = nodes[word_idx]['word']
            if not valid_word(word):
                continue
            vocab.add(word)
            doc_record['words'].append(word)
            # governing a punctuation mark carries no information; drop it
            doc_record['relns'].append([reln for reln in relns if reln != "punct.gov"])
            # keep the (cleaned) document text alongside its words
            doc_record['originaltext'] = [docs[doc_idx]]

    docs_dict['vocab'] = list(vocab)
    docs_dict['vocab_relns'] = list(vocab_relns)

    return docs_dict

# Prepare the data for the theta role model:

In [6]:
# set the stop words
import pickle

# Extra project-specific stop words are stored as a pickle next to the notebook
# (earlier variants read "../Stopword_list" or data/utils/stopwords.txt).
# NOTE(review): pickle.load can execute arbitrary code — only load a file you trust.
with open('extra_stopwords', 'rb') as read_file:
    more_stop_words = pickle.load(read_file)

# stop word initialization: NLTK's English list followed by the extra words
stop_words = nltk.corpus.stopwords.words('english') + list(more_stop_words)

In [6]:
#testing block:
# data = pd.read_csv("../tweets_nytimes.csv")
# data.head()

# Read the dataset from workspace

In [7]:
# Read all the news media articles and build the data frame.
import csv
columns = ['text', 'id']
news_media_files = ['tweets_nytimes.csv']
#news_media_files = ['tweets_nytimes.csv', 'tweets_WSJ.csv','tweets_USATODAY.csv']

# Collect the rows first and build the frame once: concatenating inside the
# loop copies the whole frame for every row and leaves every row labelled 0.
records = []
for item in news_media_files:
    # newline='' is what the csv module requires so that quoted fields
    # containing line breaks are parsed correctly.
    with open("../"+item, "r", newline='') as csvfile:
        csv_content = csv.reader(csvfile, delimiter=',')
        next(csv_content, None)  # skip the header row; tolerate an empty file
        for row in csv_content:
            # column 7 is used as the text, column 1 as the id
            records.append((row[7], row[1]))

df = pd.DataFrame(records, columns=columns)

In [8]:
# Load the NYT tweets export, keep only the id and text columns, and use the
# text column as the corpus.
# (Original note: point the path at your own dataset to use it instead;
#  the two selections below were used for testing.)
df = pd.read_csv("../tweets_nytimes.csv")
data = df[["id", "text"]]
corpus = data["text"]

In [9]:
# Sanity check: show which container type `corpus` currently is.
type(corpus)
# corpus[0]

pandas.core.series.Series

# Read the dataset from the MySQL database:

In [10]:
import getpass
import os

import mysql.connector

# Credentials must not live in the notebook. Host, user and database fall back
# to the previous values; the password is read from the COVID_DB_PASSWORD
# environment variable, or prompted for when that variable is unset.
sql_db = mysql.connector.connect(
    host=os.environ.get('COVID_DB_HOST', '127.0.0.1'),
    user=os.environ.get('COVID_DB_USER', 'covidAnalysis'),
    password=os.environ.get('COVID_DB_PASSWORD') or getpass.getpass('MySQL password: '),
    database=os.environ.get('COVID_DB_NAME', 'covid19framing'),
)
cursor = sql_db.cursor()

In [10]:
# List the tables in the database. The rows are fetched into `results`, which
# is overwritten by the next query (presumably the fetch is needed before the
# cursor can run another statement — TODO confirm).
cursor.execute('show tables;')
results = cursor.fetchall()
# Only rows with a non-NULL text_preproc are selected: some rows have `text`
# but a NULL `text_preproc`, which caused problems downstream.
# NOTE(review): `select *` makes later positional access (item[1]) depend on
# the table's column order — confirm which column index 1 is.
cursor.execute("select * from articles where text_preproc is not NULL")
results = cursor.fetchall()
# number of rows returned
len(results)

3699

In [11]:
# Column 1 of every fetched row holds bytes; decode each value to str and wrap
# the result in a Series to serve as the corpus.
originaltext = list(map(lambda row: row[1].decode(), results))
corpus = pd.Series(originaltext)
# corpus = originaltext

In [13]:
# Sanity check: container type and number of documents in `corpus`.
type(corpus), len(corpus)
# originaltext[0].decode()

(pandas.core.series.Series, 12976)

In [49]:
#testing block
# docs_dict = process_dataset(corpus, stop_words)
# docs_dict['documents'][0]

# Convert the dataset to a JSON object that includes the word tokens and typed dependencies for each document.

In [None]:
# Run the parsing pipeline over the corpus to obtain, per document, the kept
# words and their relation labels.
docs_dict = process_dataset(corpus, stop_words)
# Serialise the result as JSON text with 4-space indentation.
json_object = json.dumps(docs_dict, indent=4)
# The document text is meant to travel with the words
# (process_dataset stores it under 'originaltext').

# Store the JSON text on disk. Other output names used for earlier corpora:
# all_news_tweets_corpus.json, ny_news_tweets_corpus.json, news_tweets_corpus.json.
with open("doh_corpus.json", "w") as f:
    f.write(json_object)

-----------------------------------------------------------------------------
                          MaltParser 1.7.2                             
-----------------------------------------------------------------------------
         MALT (Models and Algorithms for Language Technology) Group          
             Vaxjo University and Uppsala University                         
                             Sweden                                          
-----------------------------------------------------------------------------

Started: Mon Jun 19 17:14:46 EDT 2023
  Transition system    : Projective
  Parser configuration : Stack
  Feature model        : eng-liblinear.xml
  Classifier           : liblinear
  Data Format          : /engmalt.linear-1.7/conllx.xml
.          	      1	      3s	    358MB
.          	     10	      3s	    358MB
.          	    100	      4s	    412MB
..........	   1000	      8s	    496MB
..........	   2000	     11s	    488MB
..........	   3000	     13s

In [15]:
# Set the arguments by hand (the argparse parser is not used inside the notebook).

# number of topics and number of latent theta roles
K = 20
T = 10

# hyper-parameters, all fixed to the same value
alpha = eta = etaprime = gamma = lam = omega = 0.1

# iteration count handed to the model
n_iters = 1000

In [10]:
# testing block. 
# type(corpus)
# # corpus[0]
# # o['documents']
# o = json_object
# [TODO]: change to BSON instead of JSON for faster io and smaller storage
# document preprocessing helpers
# theta_model.initialize_variables()
# theta_model.fit()

In [16]:
# [TODO]: change to BSON instead of JSON for faster io and smaller storage
# Load the processed corpus written earlier (other corpora used before:
# ny_news_tweets_corpus.json, news_tweets_corpus.json).
with open("doh_corpus.json") as json_file:
    o = json.load(json_file)

# Per-document records; each has 'words', 'relns' and 'originaltext'.
doc_objects = o['documents']

# Three parallel lists, in the key order of doc_objects.
docs = [entry['words'] for entry in doc_objects.values()]
text = [entry['originaltext'] for entry in doc_objects.values()]
doc_relns = [entry['relns'] for entry in doc_objects.values()]

vocab, vocab_relns = o['vocab'], o['vocab_relns']

In [17]:


# document preprocessing helpers
id2word = corpora.Dictionary(docs)                                   # word <-> id mapping built from the documents
reln2id = dict((reln, idx) for idx, reln in enumerate(vocab_relns))  # relation label -> position in vocab_relns
corpus = [id2word.doc2idx(doc) for doc in docs]                      # every document as a list of word ids
originaltext = {doc_id: entry['originaltext'] for doc_id, entry in doc_objects.items()}

# initialize scalars from plate diagram:
# n documents, n words, n relns (https://universaldependencies.org/u/dep/)
D = len(docs)
V = len(vocab)
R = len(vocab_relns)

# initialize theta role model
theta_model = ThetaRoleModel(
    corpus, originaltext, doc_relns, vocab_relns, id2word, reln2id,
    n_iters, K, T, D, V, R,
    alpha, eta, etaprime, gamma, lam, omega,
)
theta_model.initialize_variables()

In [23]:
# Fit the model. This is the long-running step; the implementation lives in
# model.py and is not visible from this notebook.
theta_model.fit()
# compute matrices (must follow fit(); presumably derives the matrices that
# are inspected further down, such as theta_model.theta — TODO confirm)
theta_model.compute_matrices()

# print topics, theta roles, and top topics/theta roles for each document
theta_model.print_all()

100%|██████████| 5/5 [33:35<00:00, 403.14s/it]

Topic 0: 0.013401029200698274 * state, 0.010426012100817379 * health, 0.009184265833041005 * governor, 0.005940203708475228 * public, 0.005562505885359914 * new, 0.005055459492684562 * department, 0.005039937664337357 * today, 0.004791588410782083 * testing, 0.0045846306994860204 * order, 0.004377672988189958 * emergency, 0.004165541334111495 * program, 0.004160367391329092 * care, 0.004160367391329092 * businesses, 0.004134497677417085 * support, 0.004103454020722676 * services, 0.003803365339343385 * idaho, 0.0031307527776311826 * people, 0.003099709120936773 * community, 0.003042795750330356 * help, 0.0030014042080711437 * county 

Topic 1: 0.010048763341785305 * data, 0.009915043026103477 * connecticut, 0.009061780059371818 * state, 0.0077882532433544175 * health, 0.007138754567185543 * los, 0.005756977971806663 * colorado, 0.004795465225713525 * vaccine, 0.004769994689393177 * cases, 0.004693583080432133 * public, 0.0046426420077914365 * information, 0.004394304278668043 * departm




In [34]:
# Inspect a single entry of theta (presumably document 0, topic 4 — TODO confirm the axis order).
theta_model.theta[0, 4]

0.3093935790725327

In [42]:
# top_topics=5
# for idx in range(theta_model.D):
#     top_topics_indx = np.argsort(theta_model.theta[idx, :][::-1][:top_topics])
#     print(top_topics_indx)
#     print('document {0} \n'.format(idx))
#     try:
#         for k in top_topics_indx:
#             print("topic {0} - {1} \n ".format(k, theta_model.theta[idx, k]))
#         print('********************')
#     except:
#         print('some error')
# top_thetas = 5
# for idx in range(theta_model.D):
#     top_thetas_indx = np.argsort(theta_model.phi[idx, :][::-1][:top_thetas])
#     print('document {0} \n'.format(idx))
#     try:
#         for t in top_thetas_indx:
#             print("theta {0} - {1} \n ".format(t, theta_model.phi[idx, t]))
#         print('------------------------------------------------------------------------')
#         print('------------------------------------------------------------------------')
#     except:
#         print('error')

In [24]:
# Show the stored text of the document whose key is '0'.
doc_objects['0']['originaltext']

["ROCHESTER, N.Y. – With Memorial Day Weekend about to kick off, the\nTransportation Security Administration (TSA) is ready for the expected\nincrease in the number of travelers who will be flying out of Frederick\nDouglass Greater Rochester International Airport.\n\n“We are prepared to handle the expected increase in travel volume this summer\nand at the same time TSA is committed to supporting a healthy and secure\nenvironment for airline passengers, security personnel and airport employees,”\nsaid Bart R. Johnson, TSA's Federal Security Director for Upstate New York.\n“For those who will be traveling this summer, travelers need to know that the\ncheckpoint experience will look different to those who have not flown since\nthe start of the pandemic, and individuals flying out of Rochester should be\nprepared for those changes.”\n\nNationwide TSA officers are screening approximately 1.6 million people daily,\nwhich is a large increase from last year, but still down significantly from\n

In [61]:
# for doc_id in doc_objects:
#     print(doc_id)

In [62]:
# dict = {doc_id: doc_objects[str(doc_id)]['originaltext'] for doc_id in doc_objects}

In [69]:
# originaltext = list(map(lambda x: id2word.doc2idx(x), text))
# originaltext

In [None]:
# Show the 'originaltext' entry of the first document in `text`.
text[0]

In [70]:
# corpus
# top_documents_indx = np.argsort(theta_model.theta[:, 4])[::-1][:10]
# top_documents_indx

# Top documents for topics

# Top documents for theta roles 

In [54]:
# Print the top documents for the theta roles and keep whatever the method returns
# (presumably the indices of those documents — TODO confirm against model.py).
top_documents_indx = theta_model.print_top_documents_theta()

doc 2345 - 0.9964566929133858 - ["Governor Whitmer Announces Michiganders to Receive Additional Assistance in March to Lower the Cost of GroceriesLANSING, Mich. – Governor Gretchen Whitmer today announced all Michigan families who are eligible for food assistance benefits will receive at least an additional $95 monthly payment in March to help lower the cost of groceries and ensure Michiganders can keep more of their hard-earned money. The additional assistance will help approximately 1.31 million Michiganders in more than 700,000 households.\xa0“Michiganders will receive additional assistance to put food on the table in March as we continue growing our economy,” said Governor Gretchen Whitmer. “This relief\xa0ensures that families can thrive and help us build on our economic momentum. We will continue collaborating with our federal partners to get things done by lowering out-of-pocket food costs and put money in people's pockets with our proposals to roll back the retirement tax, trip

In [15]:
# Display the value returned by print_top_documents_theta().
top_documents_indx