In [1]:
import datetime
import json
import spacy
import tqdm
import pandas as pd
import re
import numpy as np
import csv
import glob
import dateutil.parser


# Glob pattern matching the feed files to load (one JSON document per line, as read further down).
PATH_TO_FEEDS = 'Feeds/*.jsonl'

# Large English spaCy model; the notebook uses it through nlp(...) and token.lemma_.
nlp = spacy.load("en_core_web_lg") # natural language processing model
# Switch off the tagger, parser and NER pipeline components. Kept as the cell's last
# expression, so the notebook displays the (name, component) pairs that were disabled.
# NOTE(review): presumably done for speed since only lemmas are needed here — confirm.
nlp.disable_pipes("tagger", "parser", "ner")

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1227d5e48>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1229a3768>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1229a37c8>)]

## Load Feeds

In [2]:
# Collect every file matching PATH_TO_FEEDS, then drop the first match.
# NOTE(review): glob.glob returns paths in arbitrary (OS-dependent) order, so which
# file "the first match" is can differ between machines — confirm which feed is meant
# to be excluded.
feed_addresses = glob.glob(PATH_TO_FEEDS)[1:]
print(feed_addresses)

['Feeds/IQOS_search_result_2019-07-31_11_31_21.097108.jsonl', 'Feeds/Heat_Not_Burn_search_result_2019-07-24_07_48_28.067985.jsonl', 'Feeds/Vaping_search_result_2019-07-24_07_48_25.721746.jsonl']


In [3]:
# Desired stance key for every feed, in the same order as feed_addresses.
# Must be entered manually by the user; each value is the "_source" field that holds
# the per-sentence probabilities averaged by avg_sentence_sentiment.
desired_stances = [
    "POSITIVE_sent_probas",
    "POSITIVE_sent_probas",
    "POSITIVE_sent_probas",
]
assert len(feed_addresses) == len(desired_stances)


In [4]:
# Load every feed file into memory.
# feeds[i] is the list of parsed JSON documents (articles) read from feed_addresses[i].
feeds = []
total_articles = 0
for address in feed_addresses:
    # `with` guarantees the file handle is closed, even if a line fails to parse
    # (the previous bare open() left every feed file open).
    with open(address) as feed_file:
        # One JSON document per line; whitespace-only lines are skipped so a stray
        # blank line does not abort the whole load with a JSONDecodeError.
        articles = [json.loads(line) for line in feed_file if line.strip()]
    feeds.append(articles)
    total_articles += len(articles)


# Chance for a current snippet to replace a stored one, computed as 3 / total_articles.
# Falls back to 0.0 when nothing was loaded instead of raising ZeroDivisionError.
# NOTE(review): the original comment called this a "percent" chance, but the value is a
# plain ratio — confirm how consumers interpret it.
CHANCE_TO_REPLACE_SNIPPET = 3.0 / total_articles if total_articles else 0.0
print("Loaded", total_articles, "documents")
print(len(feeds))

Loaded 13431 documents
3


## Setup

In [30]:
# Make sure the keywords are properly lemmatized: each entry is replaced, in place,
# by the lower-cased lemma of the FIRST token spaCy produces for it.
for idx, keyword in enumerate(harm_reduction_keywords):
    first_token = nlp(keyword)[0]
    harm_reduction_keywords[idx] = first_token.lemma_.lower()
    
# Per-country tallies: {country: {"1990": count, ..., "2019": count}}.
# Year keys are stored as strings.
harm_reduction_by_country = {}


def add_to_countries(country, year):
    """Record one occurrence for `country` in `year` in `harm_reduction_by_country`.

    The first time a country is seen, its per-year table is created with a zero
    entry for every year from 1990 through 2019. The occurrence is then counted
    on every call, including that first one (previously the first occurrence
    of each country only created the table and was never counted).

    Args:
        country: Key identifying the country.
        year: Year of the occurrence; converted with ``str()`` so both ``2018``
            and ``"2018"`` address the same entry.

    Returns:
        None. The module-level `harm_reduction_by_country` dict is updated in place.
    """
    year = str(year)  # table keys are strings; accept ints as well
    if country not in harm_reduction_by_country:
        harm_reduction_by_country[country] = {str(yr): 0 for yr in range(1990, 2020)}
    counts = harm_reduction_by_country[country]
    # .get() keeps years outside the pre-seeded 1990-2019 range from raising KeyError.
    counts[year] = counts.get(year, 0) + 1
            
def format_date(string):
    """Parse a date/time string with ``dateutil.parser.parse`` and return the result.

    Args:
        string: Textual date, passed to dateutil unchanged.

    Returns:
        Whatever ``dateutil.parser.parse`` returns for the input.
    """
    parsed = dateutil.parser.parse(string)
    return parsed


def avg_sentence_sentiment(article, desired_stance):
    """Average the per-sentence probabilities stored under one stance key.

    Args:
        article: Document dict; the values are read from
            ``article["_source"][desired_stance]``.
        desired_stance: Name of the ``_source`` field holding the list of
            per-sentence probabilities.

    Returns:
        The sum of all entries that are not ``-1``, divided by the total number
        of entries. Returns ``0.0`` when the list is empty (previously this
        raised ZeroDivisionError).
    """
    probabilities = article["_source"][desired_stance]
    if not probabilities:
        return 0.0
    # Entries equal to -1 are left out of the sum.
    # NOTE(review): they still count in the denominator, which pulls the average
    # down when many sentences carry -1. Kept as-is because the document threshold
    # may be tuned against this behaviour — confirm whether that is intended.
    total = sum((prob for prob in probabilities if prob != -1), 0.0)
    return total / len(probabilities)

## Iterate Over Data

In [58]:
# Minimum value of avg_sentence_sentiment for an article to be reported.
DOC_THRESHOLD = .35

# Articles are kept when their date lies strictly within TIME_RANGE of DESIRED_TIME.
DESIRED_TIME = datetime.datetime(2018,6,15)
TIME_RANGE = datetime.timedelta(days=365//2)
# Value compared against each entry's "country_name" in the article's "countries" list.
DESIRED_LOCATION = "CAN"

# Walk every article of every feed and print the date and title of those that pass
# all three filters: time window, location, stance threshold.
for feed_number, feed in enumerate(feeds):
    for article in tqdm.tqdm_notebook(feed):
        source = article["_source"]
        date = format_date(source["date"])

        # 1. time window: skip articles outside the range
        if not (abs(date-DESIRED_TIME) < TIME_RANGE):
            continue

        # 2. location: find the first listed country that matches
        for country in source["countries"]:
            if country["country_name"] != DESIRED_LOCATION:
                continue

            # 3. stance: report the article when its average reaches the threshold
            if avg_sentence_sentiment(article,desired_stances[feed_number]) >= DOC_THRESHOLD:
                print(date, source["title"])

            # one matching country is enough; stop so the article is handled once
            break








2018-09-18 22:59:34 Majority of Canadians concerned about young people using e-cigarettes, national survey finds
2018-09-18 23:50:00 Pushing back against vaping health concerns
2018-09-18 23:50:00 Pushing back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health concerns
2018-09-18 23:50:00 Canadian tobacco exec pushes back against vaping health con