In [27]:
import json
import spacy
import tqdm
import pandas as pd
import re
import numpy as np
import os
import csv
import glob

MIN_COUNT = 3  # MINIMUM NUMBER OF INSTANCES THAT A NAMED ENTITY MUST APPEAR
PRO_INDUSTRY_PREDICTION_THRESHOLD = .75 #PRO-INDUSTRY THRESHOLD WITH WHICH WE CONSIDER A DOCUMENT
CHANCE_TO_REPLACE_SNIPPET = 0 # set after feed loaded

PATH_TO_FEEDS = 'Feeds/*.jsonl'
PATH_TO_CSV = "coreferences.csv"

BACKGROUND_FOREGROUND_ANALYSIS = True # Toggle background / foreground analysis
BACKGROUND_FOREGROUND_THRESHOLD = .05 # Threshold for ratio of foreground frequency to background frequency


nlp = spacy.load("en_core_web_lg") #natural language processing model
nlp.disable_pipes("tagger", "parser") #"parser" can't be disabled for spacy noun_chuncks

KeyboardInterrupt: 

## Load Feed

In [2]:
feed_addresses = glob.glob(PATH_TO_FEEDS)
feed_addresses = feed_addresses[1:]
print(feed_addresses)

['Feeds/IQOS_search_result_2019-07-31_11_31_21.097108.jsonl', 'Feeds/Heat_Not_Burn_search_result_2019-07-24_07_48_28.067985.jsonl', 'Feeds/Vaping_search_result_2019-07-24_07_48_25.721746.jsonl']


In [3]:
desired_stances = ["POSITIVE_sent_probas","POSITIVE_sent_probas","POSITIVE_sent_probas"] #desired stance for every feed - Must be manually input by user
assert len(desired_stances) == len(feed_addresses)


In [4]:
feeds = [] # 2d-list of feeds with articles of each
total_articles = 0
for i, adress in enumerate(feed_addresses):
    feeds.append([])
    for line in open(adress):
        feeds[i].append(json.loads(line))
        total_articles +=1


CHANCE_TO_REPLACE_SNIPPET = 3.0 / total_articles  # percent chance for a current snippet to replace a stored one
print("Loaded", total_articles, "documents")
print(len(feeds))

Loaded 13431 documents
3


In [26]:
named_ents = {} #{"named entity": count} keeps count of each named entity ex: named_ents["PMI"] ->> 233 times
coreference_matrix = {} #{"named entity":{"coref1":count, "coref2":count}} keeps count of coreferenced named entities in a sentence ex: coreference_matrix["PMI"]["FDA"] ->>> 34 times
associated_keywords = {} #{"named entity": {"keyword1":count, "keyword2":count}} keeps count of keywords associated with named entities ex: associated_keywords["PMI"]["help"] ->>> 200 times
associated_countries = {} #{"named entity": {"country1":count, "country2":count}} keeps count of countries associated witg named entities ex: associated_countries["PMI"]["USA"] ->>> 20 times
snippets = {} # {"named entity": [snippet1, snippet2, snippet3]} stores example sentences of a named entity
background_ents = {} #{"named entity": count} keeps count of named entity regardless of whether the sentence is pro-industry

aliases_list = [
    ["PMI", "philip morris", "philip morris international"],
    ["FDA","food and drug administration", "fda","food & drug administration"],
    ["Andre Calantzopoulos","calantzopoulos"],
    ['IQOS', 'iqos'],
    ["JUUL","juul"],
    ["WHO","world health organisation"],
    ["the Royal College of Physicians","the royal college of physicians"],
    ["CDC","centers for disease control"],
    ["ATHRA", "australian tobacco harm reduction association"],
    ["PHE", "public health england", ],
    ['Israel Cancer Association', 'ica', 'israel cancer association']
] # load in alias list to account for synonyms: [["real name","alias1","alias2", ...], ...] accounts for synonyms
# NOTE: ALIASES MUST BE LOWERCASE!


"""add_to_dict will update named_ents"""
def add_to_dict(ent_text): 
    if ent_text in named_ents:
        named_ents[ent_text]+=1
    else:
        named_ents[ent_text] = 1
        
"""add_to_matrix will update coreference_matrix"""
def add_to_matrix(ent1_text,ent2_text): 
    
    if ent1_text not in coreference_matrix:
        coreference_matrix[ent1_text] = {}
            
    if ent2_text not in coreference_matrix:
        coreference_matrix[ent2_text] = {}
    
    if ent1_text not in coreference_matrix[ent2_text]:
        coreference_matrix[ent2_text][ent1_text]=1
        coreference_matrix[ent1_text][ent2_text]=1
    else:
        coreference_matrix[ent2_text][ent1_text]+=1 
        if not ent2_text == ent1_text: #add it only once if label1 == label2
            coreference_matrix[ent1_text][ent2_text]+=1
    
"""add_to_keywords will update associated_keywords"""
def add_to_keywords(ent_text, keyword):
    if not ent_text in associated_keywords:
        associated_keywords[ent_text] = {keyword: 1}
    elif not keyword in associated_keywords[ent_text]:
        associated_keywords[ent_text][keyword] = 1 
    else:
        associated_keywords[ent_text][keyword] += 1
        
"""add_to_countries will update associated_countries"""
def add_to_countries(ent_text, countries):
    if not ent_text in associated_countries:
        associated_countries[ent_text] = {countries: 1}
    elif not countries in associated_countries[ent_text]:
        associated_countries[ent_text][countries] = 1 
    else:
        associated_countries[ent_text][countries] += 1

"""add_to_snippets will update associated_keywords"""
def add_to_snippets(ent_text, snippet):
    if not ent_text in snippets:
        snippets[ent_text] = [snippet]
    elif snippet in snippets[ent_text]:
        return
    elif len(snippets[ent_text]) < 3:
        snippets[ent_text].append(snippet)
    else:
        if np.random.random() < CHANCE_TO_REPLACE_SNIPPET:
            snippets[int(np.random.randint(0,high=2))] = snippet
        
def add_to_background_ents(ent_text):
    if ent_text in background_ents:
        background_ents[ent_text]+=1
    else:
        background_ents[ent_text]=1
        
"""check_alias will check if ent_text is alias, if so it will return the proper name"""
def check_alias(ent_text):
    temp = ent_text.lower()
    for aliases in aliases_list:
        for alias in aliases:
            if alias == temp:
                return aliases[0]
    return None

"""check_accronym will check if ent2 is an accornym of ent1, if so adds it to allias_list. ex: A study conducted by the University of East Anglia (UEA) ..."""
def check_accronym(ent1, ent2):
    temp = ""
    for token in ent1:
        if not token.is_stop:
            temp += token.text[0]
    temp = temp.upper()
    if temp == ent2.text:
        aliases_list.append([ent1.text, ent2.text.lower(), ent1.text.lower()])
        return ent1.text
    return ent2.text

## Iterate Over Data

In [23]:
#iterate over every feed
for feed_number, feed in enumerate(feeds):
    # iterate over every article
    for article in tqdm.tqdm_notebook(feed[150::]):
        
        # iterate over every sentence
        for sent, prob in zip(article["_source"]["sents"], article["_source"][desired_stances[feed_number]]):

            sent = "With or without the ability to market the iQOS as healthier than standard cigarettes, if the company gets sales approval, it will have a tremendous lead on British American Tobacco (NYSEMKT: BTI), which is still weighing submitting its own U.S. marketing application."
            
            # Condition for BACKGROUND_FOREGROUND_ANALYSIS boolean
            if BACKGROUND_FOREGROUND_ANALYSIS or prob > PRO_INDUSTRY_PREDICTION_THRESHOLD: 
                #sent = "A new study conducted by the University of East Anglia (UEA) found that people who switched from cigarettes to vape pens actually found it more enjoyable and this is one "
                doc = nlp(sent)
                ents = list(doc.ents) 

                # iterate through entities
                for i in range(len(ents)):
                    if ents[i].label_ == "PERSON" or ents[i].label_ == "ORG":

                        # check alias:
                        ent1_text = ents[i].text
                        ent1_text = re.sub("[\n,]", "", ent1_text)
                        temp = check_alias(ent1_text)
                        if temp != None:
                            ent1_text = temp
                            # check accronym
                        elif i > 0 and ents[i-1].label_ == "ORG" and ents[i].start - ents[i-1].end < 2 and ents[i].text.upper() == ents[i].text :
                            ent1_text = check_accronym(ents[i-1],ents[i])
                        
                        

                        #check incomplete name:
                        if ents[i].label_== "PERSON" and not " " in ent1_text and ent1_text != "JUUL": #JUUL IS MISTAKENLY THOUGHT TO BE A PERSON
                            break


                        if prob > PRO_INDUSTRY_PREDICTION_THRESHOLD: # if model is > 75% confident that the sentence is pro-industry
                            # update named_ents
                            add_to_dict(ent1_text)

                            # update keywords
                            for token in doc:#noun_chunk in doc.noun_chunks:
                                if (not token.is_stop) and token.is_alpha and (not token.text.lower() in ent1_text.lower()):
                                    add_to_keywords(ent1_text, token.lemma_.lower())
                                #add_to_keywords(ent1_text, noun_chunk.text.lower())

                            # update countires
                            for country in article["_source"]["countries"]:
                                add_to_countries(ent1_text, country["country_name"])

                            # update snippets
                            add_to_snippets(ent1_text, re.sub('"', '""', re.sub("\n", "", doc.text)))

                            # iterate corefs
                            for j in range(len(ents) - i):
                                if ents[j + i].label_ == "PERSON" or ents[j + i].label_ == "ORG":
                                    # check alias:
                                    ent2_text = ents[j + i].text
                                    ent3_text = re.sub("[\n,]", "", ent2_text)
                                    ent2_text = check_alias(ent2_text)

                                    # update coreference_matrix
                                    add_to_matrix(ent1_text, ent2_text)

                        # update background_ents - regardless of pro-industry statement
                        if BACKGROUND_FOREGROUND_ANALYSIS:
                            add_to_background_ents(ent1_text)


KeyboardInterrupt: 

In [13]:
aliases_list

[['PMI', 'philip morris', 'philip morris international'],
 ['FDA', 'food and drug administration', 'fda', 'food & drug administration'],
 ['Andre Calantzopoulos', 'calantzopoulos'],
 ['IQOS', 'iqos'],
 ['JUUL', 'juul'],
 ['WHO', 'world health organisation'],
 ['the Royal College of Physicians', 'the royal college of physicians'],
 ['CDC', 'centers for disease control'],
 ['ATHRA', 'australian tobacco harm reduction association'],
 ['PHE', 'public health england'],
 ['Philip Morris International', 'pmi', 'philip morris international'],
 ['the Tobacco Products Scientific Advisory Committee',
  'tpsac',
  'the tobacco products scientific advisory committee'],
 ['the Philippine Tobacco Institute',
  'pti',
  'the philippine tobacco institute'],
 ['the Tobacco Heated Systems', 'ths', 'the tobacco heated systems'],
 ['the International Olympic Committee',
  'ioc',
  'the international olympic committee'],
 ['Philip Morris New Zealand', 'pmnz', 'philip morris new zealand'],
 ['European Union'

## Clean Data

In [8]:
# discard any named entities that are below the minimum count threshold from named_ents and associated_keywords
# NOTE: only partially deletes from coreference matrix. not accounted for: coref["kept": {"kept": 90, "DISCARD":1}]
# This is handled later

# REMOVE ENTITIES BELOW THRESHOLD OR IF THEY HAVE A RELATIVELY HIGH BACKGROUND FREQUENCY:
keys = list(named_ents.keys())
for i in range(len(named_ents)):
    if named_ents[keys[i]] < MIN_COUNT or (BACKGROUND_FOREGROUND_ANALYSIS and named_ents[keys[i]]/background_ents[keys[i]] < BACKGROUND_FOREGROUND_THRESHOLD):
        del named_ents[keys[i]]


# REMOVE UNWANTED ENTITIES
remove_ents = ["FDA","CDC","Vaping","Vapes","E-Cigarette","Nicotine","WHO"]

for ent in remove_ents:
    if ent in named_ents:
        del named_ents[ent]

# Sort Data

In [24]:
sorted_named_ents = sorted(named_ents.items(), key=lambda kv: kv[1], reverse=True)

## Export to CSV

In [25]:
# FORMAT:
# entity, count, snippets, associated keywords, coreferences
with open(PATH_TO_CSV, "w+") as f:
    
    if BACKGROUND_FOREGROUND_ANALYSIS:
        f.write("Entity, Foreground Count, Fore/Back Ratio, Foreground Count*Ratio, Snippets, Associated Keywords, Coreferences\n")
    else:
        f.write("Entity, Count, Snippets, , , Associated Keywords, Coreferences\n")
        
    for i, row in enumerate(sorted_named_ents):
        ent_text = row[0]
        #print(ent_text, named_ents[ent_text],background_ents[ent_text], named_ents[ent_text]/background_ents[ent_text])

        # entity
        f.write(ent_text)
        f.write(',')

        # foreground count
        f.write(str(named_ents[ent_text]))
        f.write(',')
        
        # ratio
        if BACKGROUND_FOREGROUND_ANALYSIS:
            f.write(str(named_ents[ent_text]/background_ents[ent_text]))
            f.write(',')
            
        # foregroud count * ratio
        if BACKGROUND_FOREGROUND_ANALYSIS:
            f.write(str(named_ents[ent_text]**2/background_ents[ent_text]))
            f.write(',')

        # snippets
        f.write('"')
        f.write(snippets[ent_text][0])
        f.write('","')
        f.write(snippets[ent_text][1] if len(snippets[ent_text])>1 else "")
        f.write('","')
        f.write(snippets[ent_text][2] if len(snippets[ent_text])>2 else "")
        f.write('",')

        # associated keywords
        MIN_KEYWORD_COUNT = 2
        f.write('"')
        if ent_text in associated_keywords:
            sorted_associated_keywords = sorted(associated_keywords[ent_text].items(), key=lambda kv: kv[1], reverse=True)
            for row in sorted_associated_keywords:
                keyword = row[0]
                if associated_keywords[ent_text][keyword] > MIN_KEYWORD_COUNT:
                    f.write(keyword)
                    f.write(':')
                    f.write(str(associated_keywords[ent_text][keyword]))
                    f.write(', ')
        f.write('",')

        # coreferences
        f.write('"')
        sorted_corefs = sorted(coreference_matrix[ent_text].items(), key=lambda kv: kv[1], reverse= True)
        for row in sorted_corefs:
            coref = row[0]
            if coref in named_ents:  # accounting for incomplete deletion in coreference_matrix
                f.write(coref)
                f.write(':')
                f.write(str(coreference_matrix[ent_text][coref]))
                f.write(', ')
        f.write('",')

        # COUNRIES
        f.write('"')
        if ent_text in associated_countries:
            sorted_associated_countries = sorted(associated_countries[ent_text].items(), key=lambda kv: kv[1], reverse= True)
            for row in sorted_associated_countries:
                country = row[0]
                f.write(country)
                f.write(':')
                f.write(str(associated_countries[ent_text][country]))
                f.write(', ')
        f.write('"')

        f.write('\n')

In [55]:
#TODO: noun_chuncks
#TODO: deduping
#TODO: background / forground - only use Tokenizer

# Networkx

In [95]:
import networkx as nx

In [None]:
graph = nx.Graph()
for ent, val in named_ents.items():
    graph.add_node(ent,weight=val)
for ent, corefs in coreference_matrix.items():
    for coref, link_weight in corefs.items():
        if coref in named_ents and ent in named_ents:
            graph.add_edge(ent,coref,weight=link_weight)

In [None]:
nx.write_graphml(graph,"/Users/Ben/Desktop/Vital Strategies/networkxgraphs/feedtest3.graphml")