# Imports

In [1]:
import os
import sys
import json
import datetime
import pickle
import gc
import time

from operator import itemgetter
from collections import defaultdict

import pandas as pd
import numpy as np

import gensim
from gensim import corpora, models
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models, similarities, matutils
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

import pprint
pp = pprint.PrettyPrinter(indent=4)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Get Cleaned Text

In [2]:
# Load Clean Data
with open('pickled_files/clean_data_small.pickle', 'rb') as file:
    art = pickle.load(file)

In [3]:
# Check Clean Data
art.head()

Unnamed: 0,article_text
https://lens.blogs.nytimes.com/2017/06/21/handicapped-but-no-longer-invisible-andres-millan/,"[andres, millan, traveling, environs, photogra..."
https://lens.blogs.nytimes.com/2017/08/22/combat-photographer-marine-ptsd-book/,"[camera, shutter, whirred, grenade, wobbled, t..."
https://www.nytimes.com/2017/06/01/nyregion/bronx-police-shooting-mental-illness.html,"[week, police, sergeant, shot, killed, woman, ..."
https://www.nytimes.com/2017/06/01/world/canada/nurse-killings-insulin.html,"[nurse, pleaded, guilty, killing, eight, patie..."
https://www.nytimes.com/2017/06/02/learning/editorial-contest-winner-the-anguish-of-the-rich.html,"[honoring, top, winner, annual, student, edito..."


## Some Additional Cleaning

In [4]:
# List of Extra Stop words
stopwords = [
    'week','another','thing','month','day','come',
    'york','away','left','wrote','came','tell','asked',
    'left','right','hand','point','often','talk','head','point','ago','whether',
    'hour','group','became','become','becomes','often','sometimes','usually',
    ]

In [5]:
# Remove Extra Stop Words
art['article_text_nostop_extra'] = [[word for word in x if not word in stopwords] for x in art['article_text']]

In [6]:
art['string'] = [' '.join(x) for x in art['article_text_nostop_extra']]

# Preprocessing

## Count Vectorizer

In [7]:
vec = CountVectorizer(max_df =.95,min_df = 30,stop_words='english')
counts = vec.fit_transform(art['string']).transpose()

In [8]:
counts.shape

(19003, 29539)

## Get Corpus

In [9]:
# # import corpus
# with open('pickled_files/full_set_countvec_corpus.pickle','rb') as file:
#     corpus = pickle.load(file)

In [10]:
corpus = matutils.Sparse2Corpus(counts)

In [11]:
# # export corpus
# with open('pickled_files/full_set_countvec_corpus.pickle','wb') as file:
#     pickle.dump(corpus,file)

## Get Dictionary

In [12]:
# # load previously saved dictionary

# with open('pickled_files/full_set_countvec_dict.pickle', 'rb') as file:
#     dictionary = pickle.load(file)

In [13]:
dictionary = dict((v, k) for k, v in vec.vocabulary_.items())

In [14]:
len(dictionary)

19003

In [15]:
# # Check
# corpus[2]

In [16]:
# # Check
# dictionary

In [17]:
# # export dictionary

# with open('pickled_files/full_set_countvec_dict.pickle', 'wb') as file:
#     pickle.dump(dictionary,file)

# Topic Modeling - LDA

## Run no. of topics (12-20) with Alpha to Auto 50p

In [18]:
# Dictionary to store topics in
validation_50p = {}

In [19]:
# Function to calculate and store models for validating choice for no_of_topics
def cross_val_topics_50p(no_of_topics):
    model = LdaModel(corpus, num_topics=no_of_topics, id2word = dictionary, passes=50, alpha='auto', eval_every=2000)
    globals()['ldamodel_%dt_50p_autoalpha_val' % no_of_topics] = model
    model.save('pickled_files/models/noent/countvec/val/lda_{}t_50p_autoalpha_val.model'.format(no_of_topics))
    validation_50p[no_of_topics] = model.print_topics(num_topics=-1, num_words=50)

In [20]:
# Loop through 12-20 topics to see optimal number of topics
for i in np.arange(12,21):
    print('Running LDA with 50 passes for {} topics...'.format(i))
    cross_val_topics_50p(i)
    with open('pickled_files/models/noent/countvec/val/dict_topics_from_validation_50p.pickle', 'wb') as file:
        pickle.dump(validation_50p, file)

In [21]:
# # Load Models
# for no_of_topics in np.arange(12,21):
#     model =  models.LdaModel.load('pickled_files/models/noent/countvec/val/lda_{}t_50p_autoalpha_val.model'.format(no_of_topics))
#     globals()['ldamodel_%dt_50p_autoalpha_val' % no_of_topics] = model;

2018-05-15 17:38:31,971 : INFO : loading LdaModel object from pickled_files/models/noent/countvec/val/lda_12t_50p_autoalpha_val.model
2018-05-15 17:38:31,974 : INFO : loading expElogbeta from pickled_files/models/noent/countvec/val/lda_12t_50p_autoalpha_val.model.expElogbeta.npy with mmap=None
2018-05-15 17:38:31,976 : INFO : setting ignored attribute id2word to None
2018-05-15 17:38:31,977 : INFO : setting ignored attribute state to None
2018-05-15 17:38:31,978 : INFO : setting ignored attribute dispatcher to None
2018-05-15 17:38:31,978 : INFO : loaded pickled_files/models/noent/countvec/val/lda_12t_50p_autoalpha_val.model
2018-05-15 17:38:31,979 : INFO : loading LdaState object from pickled_files/models/noent/countvec/val/lda_12t_50p_autoalpha_val.model.state
2018-05-15 17:38:31,986 : INFO : loaded pickled_files/models/noent/countvec/val/lda_12t_50p_autoalpha_val.model.state
2018-05-15 17:38:32,019 : INFO : loading LdaModel object from pickled_files/models/noent/countvec/val/lda_13t

In [22]:
# # Load Validation
# with open('pickled_files/models/noent/countvec/val/dict_topics_from_validation_50p.pickle', 'rb') as file:
#     validation_50p = pickle.load(file)

In [23]:
# Check Validation
pp.pprint(validation_50p)

{   12: [   (   0,
                '0.021*"police" + 0.011*"officer" + 0.009*"trial" + '
                '0.009*"man" + 0.009*"case" + 0.008*"death" + 0.008*"defense" '
                '+ 0.008*"lawyer" + 0.007*"shooting" + 0.006*"charge" + '
                '0.006*"mental" + 0.006*"judge" + 0.006*"prosecutor" + '
                '0.006*"victim" + 0.005*"defendant" + 0.005*"jury" + '
                '0.005*"shot" + 0.005*"murder" + 0.005*"crime" + 0.005*"court" '
                '+ 0.005*"insanity" + 0.004*"killed" + 0.004*"hospital" + '
                '0.004*"official" + 0.004*"woman" + 0.004*"killing" + '
                '0.004*"guilty" + 0.004*"psychiatrist" + 0.004*"charged" + '
                '0.004*"gun" + 0.004*"witness" + 0.004*"investigation" + '
                '0.004*"according" + 0.004*"family" + 0.004*"attorney" + '
                '0.003*"evidence" + 0.003*"testimony" + 0.003*"violence" + '
                '0.003*"criminal" + 0.003*"investigator" + 0.003*"psychiatric" '

                '0.008*"product" + 0.008*"gambling" + 0.008*"store" + '
                '0.007*"fat" + 0.007*"cigarette" + 0.006*"alcoholic" + '
                '0.006*"farm" + 0.006*"pound" + 0.006*"wine" + 0.005*"farmer" '
                '+ 0.005*"meal" + 0.005*"kaczynski" + 0.005*"fish" + '
                '0.005*"department" + 0.005*"coffee" + 0.004*"bar" + '
                '0.004*"smoke" + 0.004*"calorie" + 0.004*"addiction" + '
                '0.004*"chicken" + 0.004*"casino" + 0.004*"alcoholism" + '
                '0.004*"gambler" + 0.004*"meat" + 0.004*"salt" + '
                '0.004*"dinner" + 0.004*"habit" + 0.004*"percent" + '
                '0.004*"industry" + 0.004*"compulsive" + 0.004*"cooking" + '
                '0.004*"kitchen" + 0.004*"milk" + 0.003*"cook" + 0.003*"rice" '
                '+ 0.003*"consumption" + 0.003*"sale"'),
            (   7,
                '0.033*"city" + 0.017*"people" + 0.015*"state" + '
                '0.014*"service" + 0.014*"home" 

                '0.004*"power" + 0.004*"foreign" + 0.004*"plant" + '
                '0.003*"minister" + 0.003*"immigrant" + 0.003*"political" + '
                '0.003*"leader" + 0.003*"trade" + 0.003*"disaster" + '
                '0.003*"camp" + 0.003*"area" + 0.003*"force" + 0.003*"refugee" '
                '+ 0.003*"national" + 0.003*"flight" + 0.003*"military" + '
                '0.003*"hostage" + 0.003*"thousand" + 0.003*"plane" + '
                '0.003*"security" + 0.003*"town" + 0.003*"page" + '
                '0.003*"victim" + 0.003*"ground" + 0.003*"immigration" + '
                '0.003*"news" + 0.002*"president" + 0.002*"authority" + '
                '0.002*"mile" + 0.002*"peace" + 0.002*"terrorist" + '
                '0.002*"effort" + 0.002*"region" + 0.002*"red"'),
            (   1,
                '0.018*"disorder" + 0.018*"mental" + 0.017*"depression" + '
                '0.017*"patient" + 0.016*"treatment" + 0.013*"therapy" + '
                '0.012*"psychi

                '0.002*"better" + 0.002*"winning" + 0.002*"national" + '
                '0.002*"athletic" + 0.002*"problem" + 0.002*"yankee" + '
                '0.002*"stadium" + 0.002*"manager"'),
            (   13,
                '0.013*"art" + 0.010*"film" + 0.008*"theater" + 0.007*"music" '
                '+ 0.007*"street" + 0.007*"ticket" + 0.006*"movie" + '
                '0.006*"museum" + 0.005*"play" + 0.005*"artist" + '
                '0.004*"dinner" + 0.004*"television" + 0.004*"center" + '
                '0.004*"performance" + 0.003*"dance" + 0.003*"star" + '
                '0.003*"house" + 0.003*"benefit" + 0.003*"song" + '
                '0.003*"event" + 0.003*"free" + 0.003*"story" + 0.003*"world" '
                '+ 0.003*"director" + 0.003*"child" + 0.003*"party" + '
                '0.003*"begin" + 0.003*"directed" + 0.003*"character" + '
                '0.002*"cocktail" + 0.002*"evening" + 0.002*"open" + '
                '0.002*"series" + 0.002*"productio

### Validate no. of topics (12-20) with Average Weights of Topics per Document

In [24]:
val_models_to_check = [
    ldamodel_12t_50p_autoalpha_val,
    ldamodel_13t_50p_autoalpha_val,
    ldamodel_14t_50p_autoalpha_val,
    ldamodel_15t_50p_autoalpha_val,
    ldamodel_16t_50p_autoalpha_val,
    ldamodel_17t_50p_autoalpha_val,
    ldamodel_18t_50p_autoalpha_val,
    ldamodel_19t_50p_autoalpha_val,
    ldamodel_20t_50p_autoalpha_val,
]

In [25]:
def get_art_topics_val(val_models_to_check):
    model_num = 12
    for model in val_models_to_check:
        print('Getting art_topics_ldamodel_%dt_50p_autoalpha_val...' % (model_num))
        global art_topics_val
        art_topics_val = []
        for article in corpus:
            art_topics_val.append(model.get_document_topics(article, 
#                                            minimum_probability=0.15,   
                                           minimum_phi_value=None, 
                                           per_word_topics=False)
             )
        for i in art_topics_val:
            i.sort(key=itemgetter(1),reverse=True)
        globals()['art_topics_ldamodel_%dt_50p_autoalpha_val' % model_num] = art_topics_val
        with open('pickled_files/models/noent/countvec/val/art_topics_lda_{}t_50p_autoalpha_val.pickle'.format(model_num),'wb') as file:
            pickle.dump(art_topics_val,file)
        print('Finished art_topics_ldamodel_%dt_50p_autoalpha_val.' % (model_num))
        model_num += 1

In [26]:
# Get the topics for each article for each model
get_art_topics_val(val_models_to_check)

In [27]:
# # Load Applied Article Topics
# for model_num in np.arange(12,21):
#     with open('pickled_files/models/noent/countvec/val/art_topics_lda_{}t_50p_autoalpha_val.pickle'.format(model_num),'rb') as file:
#         art_topics_val = pickle.load(file)
#     globals()['art_topics_ldamodel_%dt_50p_autoalpha_val' % model_num] = art_topics_val

In [28]:
applied_topics = [
    art_topics_ldamodel_12t_50p_autoalpha_val,
    art_topics_ldamodel_13t_50p_autoalpha_val,
    art_topics_ldamodel_14t_50p_autoalpha_val,
    art_topics_ldamodel_15t_50p_autoalpha_val,
    art_topics_ldamodel_16t_50p_autoalpha_val,
    art_topics_ldamodel_17t_50p_autoalpha_val,
    art_topics_ldamodel_18t_50p_autoalpha_val,
    art_topics_ldamodel_19t_50p_autoalpha_val,
    art_topics_ldamodel_20t_50p_autoalpha_val,
]

In [29]:
def get_av_prob_scores(applied):
    applied_score = [[x[1] for x in article] for article in applied]
#     applied_score = [[x[1] for x in article if x[1]>.12] for article in applied] # --> extension to get biggest topics
    applied_score = [np.mean(article) for article in applied_score]
#     applied_score = np.sum(applied_score) # -->makes no difference with mean, but just a variation
    applied_score = np.mean(applied_score)
#    return(applied_score)

In [30]:
# Get scores
scores_50p = defaultdict()
mod_num = 12
for applied in applied_topics:
    print('Getting mean average topic weights for articles in model with %d topics...' % mod_num)
    scores_50p[mod_num] = get_av_prob_scores(applied)
    mod_num += 1

In [31]:
# Save scores to pickle
with open('pickled_files/models/noent/countvec/val/av_weight_scores_12t-20t_50p_autoalpha_countvec.pickle','wb') as file:
    pickle.dump(scores_50p, file)

In [32]:
# # Load scores from pickle
# with open('pickled_files/models/noent/countvec/val/av_weight_scores_12t-20t_50p_autoalpha_countvec.pickle','rb') as file:
#     scores_50p = pickle.load(file)

In [33]:
# Which number of topics performs best with 50 passes?
scores_50p
# --> 12, which makes sense, as broader topics would have higher and less topic weights per article.
# Let's investigate and pick the lowest number of topics as possible

defaultdict(None,
            {12: 0.20462723,
             13: 0.18348703,
             14: 0.18173361,
             15: 0.18073815,
             16: 0.16733696,
             17: 0.16116382,
             18: 0.15747477,
             19: 0.15231758,
             20: 0.1524702})

#### Check 12 topics

In [34]:
art_topics_ldamodel_12t_50p_autoalpha_val[2125:2130]

[[(4, 0.57962978),
  (5, 0.18506525),
  (6, 0.10375641),
  (3, 0.070821896),
  (11, 0.032072257),
  (8, 0.018756555)],
 [(2, 0.27761555),
  (6, 0.21436819),
  (1, 0.13119446),
  (0, 0.12758826),
  (7, 0.10268247),
  (4, 0.067639753),
  (8, 0.047024876),
  (9, 0.021596743)],
 [(8, 0.41356125),
  (3, 0.31065202),
  (1, 0.10509452),
  (2, 0.048307337),
  (9, 0.042736165),
  (10, 0.040432099),
  (6, 0.037958425)],
 [(0, 0.6730786), (3, 0.18112579), (4, 0.068173751), (6, 0.057074886)],
 [(6, 0.50200254), (1, 0.43771306), (10, 0.044782586)]]

In [35]:
art.iloc[2125:2130]
# From looking at this and checking the articles, 
# I think the number of topics should be a bit more than 12

# Below I'll be going through the other models with different
# numbers of topics and comparing these same articles,
# to see which number of topics makes most sense.

Unnamed: 0,article_text,article_text_nostop_extra,string
https://www.nytimes.com/2016/03/01/nyregion/overcrowding-worsens-in-new-york-as-working-families-double-up.html,"[rafael, housing, situation, exercise, toleran...","[rafael, housing, situation, exercise, toleran...",rafael housing situation exercise tolerance cr...
https://www.nytimes.com/2016/03/01/nytnow/your-tuesday-evening-briefing-donald-trump-iphone-scott-kelly.html,"[briefing, email, evening, latest, upended, co...","[briefing, email, evening, latest, upended, co...",briefing email evening latest upended conventi...
https://www.nytimes.com/2016/03/01/us/politics/abortion-supreme-court-women-explain-choices.html,"[washington, amy, brenneman, actress, want, ju...","[washington, amy, brenneman, actress, want, ju...",washington amy brenneman actress want justice ...
https://www.nytimes.com/2016/03/01/world/europe/woman-waving-childs-severed-head-is-arrested-in-moscow.html,"[moscow, visibly, disturbed, woman, arrested, ...","[moscow, visibly, disturbed, woman, arrested, ...",moscow visibly disturbed woman arrested witnes...
https://www.nytimes.com/2016/03/02/fashion/amber-tamblyn-modern-love-podcast-episode-8.html,"[eighth, episode, modern, love, podcast, actre...","[eighth, episode, modern, love, podcast, actre...",eighth episode modern love podcast actress amb...


#### Check 13 topics

In [36]:
art_topics_ldamodel_13t_50p_autoalpha_val[2125:2130]

[[(9, 0.59469336),
  (0, 0.14388536),
  (12, 0.086599395),
  (6, 0.054454222),
  (4, 0.04863555),
  (1, 0.030050594),
  (10, 0.029237121),
  (7, 0.011214718)],
 [(2, 0.24455552),
  (8, 0.14801615),
  (4, 0.1322064),
  (12, 0.105798),
  (11, 0.10164268),
  (5, 0.087461211),
  (9, 0.076271482),
  (3, 0.052095685),
  (1, 0.049848791)],
 [(1, 0.30323809),
  (12, 0.26292747),
  (2, 0.23806664),
  (7, 0.086728185),
  (6, 0.041776355),
  (0, 0.033501178),
  (11, 0.032332893)],
 [(5, 0.52265185),
  (7, 0.11155502),
  (12, 0.080884613),
  (8, 0.078920588),
  (9, 0.07211405),
  (3, 0.072045177),
  (1, 0.043418065)],
 [(12, 0.6905337), (11, 0.18319415), (4, 0.10794071)]]

#### Check 14 topics

In [37]:
art_topics_ldamodel_14t_50p_autoalpha_val[2125:2130]

[[(2, 0.46017951),
  (7, 0.25504538),
  (5, 0.11048138),
  (11, 0.076387197),
  (8, 0.045483541),
  (3, 0.035697218)],
 [(10, 0.22675671),
  (1, 0.13865712),
  (0, 0.11725798),
  (6, 0.11663849),
  (13, 0.11199103),
  (9, 0.10896588),
  (7, 0.083979651),
  (2, 0.058786869),
  (4, 0.031007973)],
 [(10, 0.26740092),
  (6, 0.24998768),
  (3, 0.20039396),
  (9, 0.13462612),
  (12, 0.12122938),
  (8, 0.012903375),
  (11, 0.011528835)],
 [(6, 0.42507488), (7, 0.30292755), (0, 0.13387811), (12, 0.11328805)],
 [(7, 0.53473783), (9, 0.34370074), (3, 0.072061278), (1, 0.031968486)]]

#### Check 15 topics

In [38]:
art_topics_ldamodel_15t_50p_autoalpha_val[2125:2130]

[[(2, 0.62279862),
  (12, 0.17647575),
  (7, 0.060099293),
  (14, 0.05249697),
  (8, 0.038400389),
  (6, 0.018835856),
  (1, 0.017921273)],
 [(4, 0.25295883),
  (13, 0.1361814),
  (14, 0.13009556),
  (1, 0.11000224),
  (8, 0.084799603),
  (5, 0.078105509),
  (2, 0.073610559),
  (3, 0.068432234),
  (11, 0.044237427),
  (9, 0.016310817)],
 [(5, 0.29129097),
  (8, 0.25460076),
  (3, 0.17180623),
  (7, 0.13064507),
  (9, 0.082859851),
  (14, 0.042457622),
  (0, 0.024511602)],
 [(1, 0.67146176), (7, 0.2006319), (4, 0.096660398)],
 [(14, 0.75212592), (8, 0.1467648), (7, 0.05610574), (0, 0.028219402)]]

#### Check 16 topics

In [39]:
art_topics_ldamodel_16t_50p_autoalpha_val[2125:2130]

[[(7, 0.60048747),
  (11, 0.13838895),
  (13, 0.076221749),
  (14, 0.069597922),
  (0, 0.054965086),
  (9, 0.016528273),
  (6, 0.015369389),
  (1, 0.015000599),
  (5, 0.011441759)],
 [(3, 0.30858582),
  (0, 0.16797721),
  (4, 0.10251819),
  (13, 0.098378561),
  (10, 0.086469546),
  (9, 0.077064008),
  (7, 0.073698595),
  (8, 0.066444539),
  (12, 0.015978726)],
 [(9, 0.34186819),
  (1, 0.16983628),
  (2, 0.1397157),
  (8, 0.12153468),
  (0, 0.081346437),
  (10, 0.051003367),
  (12, 0.039093219),
  (3, 0.023859939),
  (15, 0.017210785),
  (14, 0.013029487)],
 [(4, 0.60821575),
  (1, 0.16587478),
  (0, 0.080533184),
  (7, 0.061855454),
  (3, 0.057438254)],
 [(0, 0.41470584), (10, 0.35813782), (13, 0.13047756), (2, 0.076849885)]]

## Run no. of topics (12-16) with Alpha to Auto 200p

In [40]:
# Dictionary to store topics in
validation_200p = {}

In [41]:
# Function to calculate and store models for validating choice for no_of_topics
def cross_val_topics_200p(no_of_topics):
    model = LdaModel(corpus, num_topics=no_of_topics, id2word = dictionary, passes=200, alpha='auto', eval_every=2000)
    globals()['ldamodel_%dt_200p_autoalpha_val' % no_of_topics] = model
    model.save('pickled_files/models/noent/countvec/val/lda_{}t_200p_autoalpha_val.model'.format(no_of_topics))
    validation_200p[no_of_topics] = model.print_topics(num_topics=-1, num_words=50)

In [42]:
# Loop through x-x topics to see optimal number of topics
for i in np.arange(12,17):
    print('Running LDA with 200 passes for {} topics...'.format(i))
    cross_val_topics_200p(i)
    with open('pickled_files/models/noent/countvec/val/dict_topics_from_validation_200p.pickle', 'wb') as file:
        pickle.dump(validation_200p, file)

In [43]:
# # Load Models
# for no_of_topics in np.arange(12,17):
#     model =  models.LdaModel.load('pickled_files/models/noent/countvec/val/lda_{}t_200p_autoalpha_val.model'.format(no_of_topics))
#     globals()['ldamodel_%dt_200p_autoalpha_val' % no_of_topics] = model

2018-05-15 17:38:36,648 : INFO : loading LdaModel object from pickled_files/models/noent/countvec/val/lda_12t_200p_autoalpha_val.model
2018-05-15 17:38:36,651 : INFO : loading expElogbeta from pickled_files/models/noent/countvec/val/lda_12t_200p_autoalpha_val.model.expElogbeta.npy with mmap=None
2018-05-15 17:38:36,653 : INFO : setting ignored attribute id2word to None
2018-05-15 17:38:36,653 : INFO : setting ignored attribute state to None
2018-05-15 17:38:36,654 : INFO : setting ignored attribute dispatcher to None
2018-05-15 17:38:36,655 : INFO : loaded pickled_files/models/noent/countvec/val/lda_12t_200p_autoalpha_val.model
2018-05-15 17:38:36,656 : INFO : loading LdaState object from pickled_files/models/noent/countvec/val/lda_12t_200p_autoalpha_val.model.state
2018-05-15 17:38:36,664 : INFO : loaded pickled_files/models/noent/countvec/val/lda_12t_200p_autoalpha_val.model.state
2018-05-15 17:38:36,703 : INFO : loading LdaModel object from pickled_files/models/noent/countvec/val/ld

In [44]:
# # Load Validation Topics 
# with open('pickled_files/models/noent/countvec/val/dict_topics_from_validation_200p.pickle', 'rb') as file:
#     validation_200p = pickle.load(file)

In [45]:
pp.pprint(validation_200p)

{   12: [   (   0,
                '0.021*"gun" + 0.015*"trial" + 0.014*"defense" + '
                '0.013*"lawyer" + 0.012*"judge" + 0.011*"mental" + '
                '0.011*"shooting" + 0.010*"case" + 0.010*"prosecutor" + '
                '0.009*"jury" + 0.009*"court" + 0.009*"murder" + '
                '0.008*"violence" + 0.008*"defendant" + 0.008*"crime" + '
                '0.007*"insanity" + 0.007*"state" + 0.007*"law" + '
                '0.007*"death" + 0.007*"guilty" + 0.006*"psychiatrist" + '
                '0.006*"killing" + 0.005*"criminal" + 0.005*"testimony" + '
                '0.005*"people" + 0.005*"evidence" + 0.005*"charge" + '
                '0.005*"hearing" + 0.005*"attorney" + 0.005*"federal" + '
                '0.005*"witness" + 0.005*"mentally" + 0.004*"victim" + '
                '0.004*"district" + 0.004*"prosecution" + 0.004*"violent" + '
                '0.004*"juror" + 0.004*"psychiatric" + 0.004*"expert" + '
                '0.004*"testified" + 0.0

                '0.003*"live" + 0.003*"sister" + 0.003*"saw" + 0.002*"morning" '
                '+ 0.002*"place" + 0.002*"lost" + 0.002*"face" + '
                '0.002*"trying" + 0.002*"lived" + 0.002*"baby" + 0.002*"door" '
                '+ 0.002*"person" + 0.002*"case" + 0.002*"office" + '
                '0.002*"moved"'),
            (   7,
                '0.089*"drug" + 0.018*"company" + 0.014*"alcohol" + '
                '0.014*"treatment" + 0.010*"addiction" + 0.010*"medication" + '
                '0.010*"abuse" + 0.009*"patient" + 0.009*"doctor" + '
                '0.008*"effect" + 0.007*"percent" + 0.007*"antidepressant" + '
                '0.007*"prescription" + 0.006*"problem" + 0.006*"substance" + '
                '0.006*"addict" + 0.006*"product" + 0.006*"used" + '
                '0.006*"pill" + 0.006*"medical" + 0.006*"cocaine" + '
                '0.006*"smoking" + 0.006*"marijuana" + 0.005*"medicine" + '
                '0.005*"study" + 0.005*"drinking" + 0.0

### Validate no. of topics (12-16) with Average Weights

In [46]:
val_models_to_check = [
    ldamodel_12t_200p_autoalpha_val,
    ldamodel_13t_200p_autoalpha_val,
    ldamodel_14t_200p_autoalpha_val,
    ldamodel_15t_200p_autoalpha_val,
    ldamodel_16t_200p_autoalpha_val,
#     ldamodel_17t_200p_autoalpha_val,
#     ldamodel_18t_200p_autoalpha_val,
#     ldamodel_19t_200p_autoalpha_val,
#     ldamodel_20t_200p_autoalpha_val,
]

In [47]:
def get_art_topics_val(val_models_to_check):
    model_num = 12
    for model in val_models_to_check:
        print('Getting art_topics_ldamodel_%dt_200p_autoalpha_val...' % (model_num))
        global art_topics_val
        art_topics_val = []
        for article in corpus:
            art_topics_val.append(model.get_document_topics(article, 
#                                            minimum_probability=0.15,   
                                           minimum_phi_value=None, 
                                           per_word_topics=False)
             )
        for i in art_topics_val:
            i.sort(key=itemgetter(1),reverse=True)
        globals()['art_topics_ldamodel_%dt_200p_autoalpha_val' % model_num] = art_topics_val
        with open('pickled_files/models/noent/countvec/val/art_topics_lda_{}t_200p_autoalpha_val.pickle'.format(model_num),'wb') as file:
            pickle.dump(art_topics_val,file)
        print('Finished art_topics_ldamodel_%dt_200p_autoalpha_val.' % (model_num))
        model_num += 1

In [48]:
# Get the topics for each article for each model
get_art_topics_val(val_models_to_check)

In [49]:
# # Load Article Topics
# for model_num in np.arange(12,17):
#     with open('pickled_files/models/noent/countvec/val/art_topics_lda_{}t_200p_autoalpha_val.pickle'.format(model_num),'rb') as file:
#           art_topics_val = pickle.load(file)
#     globals()['art_topics_ldamodel_%dt_200p_autoalpha_val' % model_num] = art_topics_val

In [50]:
applied_topics = [
    art_topics_ldamodel_12t_200p_autoalpha_val,
    art_topics_ldamodel_13t_200p_autoalpha_val,
    art_topics_ldamodel_14t_200p_autoalpha_val,
    art_topics_ldamodel_12t_200p_autoalpha_val,
    art_topics_ldamodel_16t_200p_autoalpha_val,
#     art_topics_ldamodel_17t_200p_autoalpha_val,
#     art_topics_ldamodel_18t_200p_autoalpha_val,
#     art_topics_ldamodel_19t_200p_autoalpha_val,
#     art_topics_ldamodel_20t_200p_autoalpha_val,
]

In [51]:
def get_av_prob_scores(applied):
    applied_score = [[x[1] for x in article[:5]] for article in applied]
#     applied_score = [[x[1] for x in article if x[1]>.12] for article in applied] # --> extension to get biggest topics
    applied_score = [np.mean(article) for article in applied_score]
#     applied_score = np.sum(applied_score) # -->makes no difference with mean, but just a variation
    applied_score = np.mean(applied_score)
    return(applied_score)

In [52]:
# Get scores
scores_200p = defaultdict()
mod_num = 12
for applied in applied_topics:
    print('Getting mean average topic weights for articles in model with %d topics...' % mod_num)
    scores_200p[mod_num] = get_av_prob_scores(applied)
    mod_num += 1

In [53]:
# Save scores to pickle
with open('pickled_files/models/noent/countvec/val/av_weight_scores_12t-16t_200p_autoalpha_countvec.pickle','wb') as file:
    pickle.dump(scores_200p,file)

In [54]:
# # Load scores from pickle
# with open('pickled_files/models/noent/countvec/val/av_weight_scores_12t-16t_200p_autoalpha_countvec.pickle','rb') as file:
#     scores_200p = pickle.load(file)

In [55]:
# Which number of topics performs best with 200 passes?
scores_200p

# Even though 15 seems best, running the model for 300 passes for both 14 and 15 topics, 
# 14 topics resulted in the most distinct topics. Therefore, I went with that model.

defaultdict(None,
            {12: 0.19303617,
             13: 0.18328962,
             14: 0.17975949,
             15: 0.19303617,
             16: 0.17411229})

## Now Run the Model with optimal number of topics for 300 Passes

### 14 Topics

In [56]:
# Run Model (14)
no_of_topics = 14
ldamodel14 = LdaModel(corpus, num_topics=no_of_topics, id2word = dictionary, passes=300, alpha='auto', eval_every=2000)

In [57]:
# Save Model (14)
ldamodel14.save('pickled_files/models/noent/countvec/val/lda_{}t_500p_autoalpha_val.model'.format(no_of_topics))

In [58]:
# # Load Model
# no_of_topics = 14
# ldamodel14 = model.load('pickled_files/models/noent/countvec/val/lda_{}t_500p_autoalpha_val.model'.format(no_of_topics))

2018-05-15 17:38:38,825 : INFO : loading LdaModel object from pickled_files/models/noent/countvec/val/lda_14t_500p_autoalpha_val.model
2018-05-15 17:38:38,827 : INFO : loading expElogbeta from pickled_files/models/noent/countvec/val/lda_14t_500p_autoalpha_val.model.expElogbeta.npy with mmap=None
2018-05-15 17:38:38,829 : INFO : setting ignored attribute id2word to None
2018-05-15 17:38:38,830 : INFO : setting ignored attribute state to None
2018-05-15 17:38:38,831 : INFO : setting ignored attribute dispatcher to None
2018-05-15 17:38:38,831 : INFO : loaded pickled_files/models/noent/countvec/val/lda_14t_500p_autoalpha_val.model
2018-05-15 17:38:38,832 : INFO : loading LdaState object from pickled_files/models/noent/countvec/val/lda_14t_500p_autoalpha_val.model.state
2018-05-15 17:38:38,841 : INFO : loaded pickled_files/models/noent/countvec/val/lda_14t_500p_autoalpha_val.model.state


In [59]:
## Print Topics (14)
# ldamodel14.print_topics(num_topics=-1, num_words=10)

In [60]:
# Print Topics (14)
ldamodel14.show_topics(num_topics=-1, num_words=5, formatted=False)

[(0,
  [('game', 0.019290676),
   ('team', 0.014655967),
   ('player', 0.013197457),
   ('sport', 0.0096272184),
   ('play', 0.0086391047)]),
 (1,
  [('child', 0.071610503),
   ('school', 0.039327879),
   ('parent', 0.023915363),
   ('family', 0.020240301),
   ('student', 0.01847663)]),
 (2,
  [('life', 0.011472762),
   ('people', 0.011282165),
   ('woman', 0.0072893538),
   ('family', 0.0068212799),
   ('home', 0.0065922942)]),
 (3,
  [('art', 0.0068352227),
   ('book', 0.0065797614),
   ('film', 0.0050195237),
   ('street', 0.0049045617),
   ('life', 0.0043060104)]),
 (4,
  [('state', 0.024362968),
   ('president', 0.016713507),
   ('governor', 0.013491742),
   ('republican', 0.012855075),
   ('budget', 0.011843665)]),
 (5,
  [('court', 0.027797233),
   ('state', 0.021660788),
   ('case', 0.020019565),
   ('judge', 0.016594727),
   ('lawyer', 0.015601465)]),
 (6,
  [('woman', 0.033056237),
   ('gun', 0.028255416),
   ('law', 0.018436665),
   ('abortion', 0.014594568),
   ('violence',

In [61]:
# Save Topics
final_topics14 = ldamodel14.print_topics(num_topics=-1, num_words=50)
with open('pickled_files/models/noent/countvec/val/dict_topics_from_final_model_14t_500p.pickle', 'wb') as file:
        pickle.dump(final_topics14, file)

### Create Dictionary of the Resulting Topic Descriptions

In [62]:
pp.pprint(ldamodel14.show_topics(num_topics=-1, num_words=50, formatted=False))

[   (   0,
        [   ('game', 0.019290676),
            ('team', 0.014655967),
            ('player', 0.013197457),
            ('sport', 0.0096272184),
            ('play', 0.0086391047),
            ('season', 0.0082427021),
            ('coach', 0.0068989205),
            ('athlete', 0.005416363),
            ('football', 0.0054140333),
            ('run', 0.0053345719),
            ('league', 0.0046803448),
            ('playing', 0.0046123727),
            ('injury', 0.0043341378),
            ('baseball', 0.0040352782),
            ('training', 0.0037167822),
            ('field', 0.0036201878),
            ('running', 0.0035382255),
            ('win', 0.0034480568),
            ('race', 0.0034083212),
            ('played', 0.0033350296),
            ('ball', 0.0032576071),
            ('fan', 0.0031400183),
            ('basketball', 0.0031398002),
            ('club', 0.0031281544),
            ('gambling', 0.0030977998),
            ('hit', 0.0030377184),
            ('men

            ('problem', 0.0027883973),
            ('issue', 0.0027383221),
            ('person', 0.0026869718),
            ('better', 0.0025914316),
            ('information', 0.0025834448),
            ('relationship', 0.0025607287),
            ('view', 0.0025102119),
            ('standard', 0.0024052579),
            ('sense', 0.0023582138),
            ('read', 0.0023482449),
            ('kind', 0.002326323),
            ('society', 0.0023091913),
            ('mind', 0.0022835729),
            ('way', 0.0022441072),
            ('individual', 0.002239981),
            ('personal', 0.0022252435),
            ('language', 0.0021807817),
            ('subject', 0.0021516527),
            ('course', 0.0021419434),
            ('class', 0.0020885335),
            ('psychologist', 0.0020771371),
            ('feel', 0.0020465492),
            ('theory', 0.0019494399),
            ('belief', 0.0019432326),
            ('role', 0.0019308601),
            ('author', 0.0019128127),
  

In [63]:
topics = {
    0: 'sports/games',
    1: 'childhood/family/education',
    2: 'family/home/work-life',
    3: 'art/film/books',
    4: 'government/politics',
    5: 'justice-/penitentiary system',
    6: 'violence/assault/abuse',
    7: 'medical research',
    8: 'therapy/treatment/medication',
    9: 'community programs/services',
    10: 'police/shootings/murder',
    11: 'sociological trends',
    12: 'veterans/military/war',
    13: 'company/benefits/health insurance',
}

### Get list of words for WordClouds

#### Therapy

In [64]:
therapy = ldamodel14.show_topics(num_topics=-1, num_words=50)[8][1].split()

In [65]:
therapy_words = []
for word in np.arange(0,len(therapy))[::2]:
    therapy_words.append(int(float(therapy[word].split('*')[0])*1000)*[therapy[word].split('*')[1].split('"')[1]])
# therapy_words

#### Veterans

In [66]:
veterans = ldamodel14.show_topics(num_topics=-1, num_words=50)[12][1].split()

In [67]:
veterans_words = []
for word in np.arange(0,len(veterans))[::2]:
    veterans_words.append(int(float(veterans[word].split('*')[0])*1000)*[veterans[word].split('*')[1].split('"')[1]])
# veterans_words

#### Family

In [68]:
family = ldamodel14.show_topics(num_topics=-1, num_words=50)[2][1].split()

In [69]:
family_words = []
for word in np.arange(0,len(family))[::2]:
    family_words.append(int(float(family[word].split('*')[0])*1000)*[family[word].split('*')[1].split('"')[1]])
# family_words

#### Sociology

In [70]:
sociology = ldamodel14.show_topics(num_topics=-1, num_words=50)[11][1].split()

In [71]:
sociology_words = []
for word in np.arange(0,len(sociology))[::2]:
    sociology_words.append(int(float(sociology[word].split('*')[0])*1000)*[sociology[word].split('*')[1].split('"')[1]])
# sociology_words

#### Violence

In [72]:
violence = ldamodel14.show_topics(num_topics=-1, num_words=50)[6][1].split()

In [73]:
violence_words = []
for word in np.arange(0,len(violence))[::2]:
    violence_words.append(int(float(violence[word].split('*')[0])*1000)*[violence[word].split('*')[1].split('"')[1]])
# violence_words

#### Government

In [74]:
gov = ldamodel14.show_topics(num_topics=-1, num_words=50)[4][1].split()

In [75]:
gov_words = []
for word in np.arange(0,len(gov))[::2]:
    gov_words.append(int(float(gov[word].split('*')[0])*1000)*[gov[word].split('*')[1].split('"')[1]])
# gov_words

#### arts

In [76]:
arts = ldamodel14.show_topics(num_topics=-1, num_words=50)[3][1].split()

In [77]:
arts_words = []
for word in np.arange(0,len(arts))[::2]:
    arts_words.append(int(float(arts[word].split('*')[0])*1000)*[arts[word].split('*')[1].split('"')[1]])
# arts_words

#### Justice

In [78]:
just = ldamodel14.show_topics(num_topics=-1, num_words=50)[5][1].split()

In [79]:
just_words = []
for word in np.arange(0,len(just))[::2]:
    just_words.append(int(float(just[word].split('*')[0])*1000)*[just[word].split('*')[1].split('"')[1]])
# just_words

#### Medical research

In [80]:
medical = ldamodel14.show_topics(num_topics=-1, num_words=50)[7][1].split()

In [81]:
medical_words = []
for word in np.arange(0,len(medical))[::2]:
    medical_words.append(int(float(medical[word].split('*')[0])*1000)*[medical[word].split('*')[1].split('"')[1]])
# medical_words

# Combine with Articles

## Apply to Subset of Articles to Check

In [82]:
# Print topics so we can check
topics

{0: 'sports/games',
 1: 'childhood/family/education',
 2: 'family/home/work-life',
 3: 'art/film/books',
 4: 'government/politics',
 5: 'justice-/penitentiary system',
 6: 'violence/assault/abuse',
 7: 'medical research',
 8: 'therapy/treatment/medication',
 9: 'community programs/services',
 10: 'police/shootings/murder',
 11: 'sociological trends',
 12: 'veterans/military/war',
 13: 'company/benefits/health insurance'}

In [83]:
# Let's first check the first 50 and see what the min probability should be,
# and if the assigned topics make sense

art_topics_test = []
for article in np.arange(21670,21790):
    art_topics_test.append(ldamodel14.get_document_topics(corpus[article], 
                                           minimum_probability=0.15,   
                                                   # minprob will need to be adjusted based on no of topics generated by the model!
                                           minimum_phi_value=None, 
                                           per_word_topics=False)
#               .sort(key=itemgetter(1),reverse=True)
             )

In [84]:
for i in art_topics_test:
    i.sort(key=itemgetter(1),reverse=True)

In [85]:
for k,v in enumerate(art_topics_test,start=21670):
    print(k,v)

21670 [(9, 0.51404339), (2, 0.22148828)]
21671 [(13, 0.86589992)]
21672 [(10, 0.46076152), (2, 0.37694681)]
21673 [(10, 0.29183447), (8, 0.28755784)]
21674 [(4, 0.49859565), (6, 0.18539578)]
21675 [(13, 0.25250003), (4, 0.20723957)]
21676 [(10, 0.32224175), (12, 0.27431706), (2, 0.20827295)]
21677 [(12, 0.27236575), (10, 0.26420498), (2, 0.18685992)]
21678 [(6, 0.28238007), (4, 0.27523014), (11, 0.20649983), (12, 0.15528841)]
21679 [(13, 0.43546283), (8, 0.31409919), (10, 0.23414031)]
21680 [(0, 0.71233225)]
21681 [(3, 0.78569335)]
21682 [(2, 0.555336), (7, 0.23359857)]
21683 [(9, 0.56724453), (13, 0.35424405)]
21684 [(9, 0.61803246), (4, 0.16611029)]
21685 [(9, 0.38100609), (10, 0.23703404), (13, 0.19637783)]
21686 [(1, 0.87194788)]
21687 [(2, 0.4176273), (12, 0.27918294)]
21688 [(5, 0.86274594)]
21689 [(5, 0.75299621), (10, 0.19430774)]
21690 [(0, 0.79521972)]
21691 [(7, 0.4643091), (8, 0.33565918), (2, 0.16577157)]
21692 [(13, 0.82551783)]
21693 [(7, 0.48128837), (13, 0.31510994)]
2

In [86]:
art.iloc[21767] 

article_text                 [white, agency, struggling, reduced, budget, m...
article_text_nostop_extra    [white, agency, struggling, reduced, budget, m...
string                       white agency struggling reduced budget major c...
Name: https://www.nytimes.com/1994/11/06/nyregion/county-getting-millions-in-mental-health-windfall.html, dtype: object

## Apply Topics to Articles and Merge Topics into DF

In [87]:
# # Load pickle
# with open('pickled_files/art_with_topics_final.pickle','rb') as file:
#     art = pickle.load(file)

In [88]:
# Ok, now let's add the topics to a list to add to the articles
art_topics = []
for article in corpus:
    art_topics.append(ldamodel14.get_document_topics(article, 
                                           minimum_probability=0.15,   
                                                   # minprob will need to be adjusted based on no of topics generated by the model!
                                           minimum_phi_value=None, 
                                           per_word_topics=False)
#               .sort(key=itemgetter(1),reverse=True)
             )

In [89]:
# Let's see if the two line up
print(len(art_topics))
print(len(art))

29531
29531


In [90]:
# Yep, so let's merge them
art['topics'] = art_topics

In [91]:
# get max number of topics
art.topics.map(len).max()

5

In [92]:
art[['topics']].head()

Unnamed: 0_level_0,topics
index,Unnamed: 1_level_1
https://www.nytimes.com/2011/06/17/us/17ttgone.html,"[(0, 0.150113), (3, 0.536226)]"
https://www.nytimes.com/1987/02/27/movies/at-the-movies.html,"[(0, 0.150371), (3, 0.495912)]"
https://www.nytimes.com/1991/08/26/books/books-of-the-times-why-unshakable-belief-isn-t-the-same-as-truth.html,"[(0, 0.150435), (11, 0.486137)]"
https://www.nytimes.com/2011/08/07/realestate/outdoor-pools-sun-splash-repeat.html,"[(0, 0.150879), (2, 0.176495), (3, 0.203167), ..."
https://query.nytimes.com/gst/fullpage.html?res=9C03E2D61530F93AA25751C0A96F9C8B63,"[(0, 0.150896), (3, 0.534169)]"


## Create columns with weights for each topic

In [93]:
for t in np.arange(0,14):
    art[topics[t]] = [[i[1] if i[0]==t else 0 for i in a] for a in art['topics']]
    art[topics[t]] = [[i for i in a if i > 0] for a in art[topics[t]]]
    art[topics[t]] = art[topics[t]].apply(lambda x: 0 if len(x)==0 else x[0])

In [94]:
art.sort_values(by='topics',inplace=True)

In [95]:
# remove duplicates
art.reset_index(inplace=True)
for i in np.arange(0,8):
    art.drop(i,inplace=True)
art.set_index('index',inplace=True)

In [96]:
len(art)

29531

In [97]:
art.head()

Unnamed: 0_level_0,article_text,article_text_nostop_extra,string,topics,sports/games,childhood/family/education,family/home/work-life,art/film/books,government/politics,justice-/pentitentiary system,violence/assault/abuse,medical research,therapy/treatment/medication,community programs/services,police/shootings/murder,sociological trends,veterans/military/war,company/benefits/health insurance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
https://www.nytimes.com/2011/06/17/us/17ttgone.html,"[quirky, discerning, pick, interesting, thing,...","[quirky, discerning, pick, interesting, state,...",quirky discerning pick interesting state austi...,"[(0, 0.150113), (3, 0.536226)]",0.150113,0.0,0.0,0.536226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://www.nytimes.com/1987/02/27/movies/at-the-movies.html,"[award, nomination, announced, recently, yorke...","[award, nomination, announced, recently, yorke...",award nomination announced recently yorkers su...,"[(0, 0.150371), (3, 0.495912)]",0.150371,0.0,0.0,0.495912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://www.nytimes.com/1991/08/26/books/books-of-the-times-why-unshakable-belief-isn-t-the-same-as-truth.html,"[fallibility, human, reason, everyday, life, g...","[fallibility, human, reason, everyday, life, g...",fallibility human reason everyday life gilovic...,"[(0, 0.150435), (11, 0.486137)]",0.150435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.486137,0.0,0.0
https://www.nytimes.com/2011/08/07/realestate/outdoor-pools-sun-splash-repeat.html,"[iced, tea, perhaps, casual, friday, outdoor, ...","[iced, tea, perhaps, casual, friday, outdoor, ...",iced tea perhaps casual friday outdoor pool mi...,"[(0, 0.150879), (2, 0.176495), (3, 0.203167), ...",0.150879,0.0,0.176495,0.203167,0.0,0.0,0.0,0.0,0.0,0.399957,0.0,0.0,0.0,0.0
https://query.nytimes.com/gst/fullpage.html?res=9C03E2D61530F93AA25751C0A96F9C8B63,"[soapnet, erica, erica, strange, erin, karpluk...","[soapnet, erica, erica, strange, erin, karpluk...",soapnet erica erica strange erin karpluk smart...,"[(0, 0.150896), (3, 0.534169)]",0.150896,0.0,0.0,0.534169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
# Save to pickle
with open('pickled_files/art_with_topics_final.pickle','wb') as file:
    art.to_pickle(file)

# Get Trends over Time

## Get Years from Raw Data

In [99]:
with open('pickled_files/raw_data.pickle', 'rb') as file:
    raw = pickle.load(file)

In [100]:
raw = raw[['pub_date']].copy()

In [101]:
raw['pub_year'] = [a[:4] for a in raw['pub_date']]

In [102]:
raw.head()

Unnamed: 0,pub_date,pub_year
https://lens.blogs.nytimes.com/2017/06/21/handicapped-but-no-longer-invisible-andres-millan/,2017-06-21,2017
https://lens.blogs.nytimes.com/2017/08/22/combat-photographer-marine-ptsd-book/,2017-08-22,2017
https://www.nytimes.com/2017/06/01/nyregion/bronx-police-shooting-mental-illness.html,2017-06-02,2017
https://www.nytimes.com/2017/06/01/world/canada/nurse-killings-insulin.html,2017-06-02,2017
https://www.nytimes.com/2017/06/02/learning/editorial-contest-winner-the-anguish-of-the-rich.html,2017-06-02,2017


## Append Years

In [103]:
# new = art.join(raw,on=None)

In [104]:
# load pickle
with open('pickled_files/art_with_topics_years_final.pickle', 'rb') as file:
    new = pickle.load(file)

In [105]:
len(new)

29509

In [106]:
new.head()

Unnamed: 0_level_0,article_text,article_text_nostop_extra,string,topics,sports/games,childhood/family/education,family/home/work-life,art/film/books,government/politics,justice-/pentitentiary system,violence/assault/abuse,medical research,therapy/treatment/medication,community programs/services,police/shootings/murder,sociological trends,veterans/military/war,company/benefits/health insurance,pub_date,pub_year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
https://www.nytimes.com/aponline/2018/02/28/us/ap-us-dicks-rifle-sales-letter.html,"[following, letter, written, dick, sporting, g...","[following, letter, written, dick, sporting, g...",following letter written dick sporting good ce...,"[(6, 0.679574)]",0.0,0.0,0.0,0.0,0.0,0.0,0.679574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-02-28,2018
https://www.nytimes.com/aponline/2018/02/28/us/ap-us-staged-shooting-sentence.html,"[portland, developer, serving, prison, term, s...","[portland, developer, serving, prison, term, s...",portland developer serving prison term scheme ...,"[(5, 0.346174), (10, 0.184921), (13, 0.195877)]",0.0,0.0,0.0,0.0,0.0,0.346174,0.0,0.0,0.0,0.0,0.184921,0.0,0.0,0.195877,2018-02-28,2018
https://www.nytimes.com/2018/02/28/sports/tennis/marion-bartoli.html,"[wimbledon, champion, marion, bartoli, endured...","[wimbledon, champion, marion, bartoli, endured...",wimbledon champion marion bartoli endured ment...,"[(0, 0.577956), (2, 0.179094)]",0.577956,0.0,0.179094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-02-28,2018
https://www.nytimes.com/reuters/2018/02/28/us/28reuters-usa-guns-dicks-sporting.html,"[york, walmart, inc, retailer, joined, dick, s...","[walmart, inc, retailer, joined, dick, sportin...",walmart inc retailer joined dick sporting good...,"[(6, 0.662019), (13, 0.150865)]",0.0,0.0,0.0,0.0,0.0,0.0,0.662019,0.0,0.0,0.0,0.0,0.0,0.0,0.150865,2018-02-28,2018
https://www.nytimes.com/aponline/2018/02/28/us/politics/ap-us-trump-guns.html,"[washington, fellow, republican, hot, seat, pr...","[washington, fellow, republican, hot, seat, pr...",washington fellow republican hot seat presiden...,"[(4, 0.381876), (6, 0.517272)]",0.0,0.0,0.0,0.0,0.381876,0.0,0.517272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-02-28,2018


In [107]:
# # to pickle
# with open('pickled_files/art_with_topics_years_final.pickle','wb') as file:
#     new.to_pickle(file)

## Get primary and secondary topics for each article (for tree-based viz)

In [108]:
primary_counts = {}
for t in topics:
    primary_counts[topics[t]] = sum([1 if x[0][0] == t else 0 for x in new['topics']])

In [109]:
primary_counts

{'art/film/books': 1776,
 'childhood/family/education': 5105,
 'community programs/services': 1465,
 'company/benefits/health insurance': 419,
 'family/home/work-life': 6633,
 'government/politics': 2703,
 'justice-/penitentiary system': 3394,
 'medical research': 2548,
 'police/shootings/murder': 687,
 'sociological trends': 566,
 'sports/games': 1298,
 'therapy/treatment/medication': 2043,
 'veterans/military/war': 258,
 'violence/assault/abuse': 614}

In [110]:
primary = pd.DataFrame(primary_counts,index=np.arange(0,15))

In [111]:
primary = pd.DataFrame(primary.iloc[0])

In [112]:
primary.rename(columns={0:'count'},inplace=True)

In [113]:
primary.to_csv('pickled_files/primary_topic_counts.csv',sep=',')

In [114]:
secondary_counts = {}
for t in topics:
    secondary_counts[topics[t]] = {}
    for top in topics:
        secondary_counts[topics[t]][topics[top]] = sum(
            [1 if len(x)>1 and x[0][0] == t and x[1][0] == top else 0 for x in new['topics']])

In [115]:
secondary = pd.DataFrame(secondary_counts)

In [116]:
secondary
# columns are primary topic
# rows are secondary topic

Unnamed: 0,art/film/books,childhood/family/education,community programs/services,company/benefits/health insurance,family/home/work-life,government/politics,justice-/penitentiary system,medical research,police/shootings/murder,sociological trends,sports/games,therapy/treatment/medication,veterans/military/war,violence/assault/abuse
art/film/books,0,559,0,0,2043,0,0,0,0,0,115,0,0,0
childhood/family/education,0,0,0,0,0,0,0,0,0,0,93,0,0,0
community programs/services,252,339,0,0,571,551,597,394,0,0,22,434,0,41
company/benefits/health insurance,35,118,409,0,138,512,181,137,16,166,16,397,100,23
family/home/work-life,0,1230,0,0,0,0,0,0,0,0,481,0,0,0
government/politics,122,152,0,0,302,0,0,0,0,0,12,0,0,0
justice-/penitentiary system,90,410,0,0,528,340,0,0,0,0,52,0,0,0
medical research,133,341,0,0,824,105,149,0,0,0,126,0,0,96
police/shootings/murder,56,92,249,0,294,55,808,33,0,0,14,88,0,85
sociological trends,393,364,168,0,487,170,159,436,54,0,56,681,0,126


In [117]:
secondary_counts = {}
for t in topics:
#     secondary_counts[topics[t]] = {}
    for top in topics:
        secondary_counts['{}_{}'.format(topics[t],topics[top])] = sum(
            [1 if len(x)>1 and x[0][0] == t and x[1][0] == top else 0 for x in new['topics']])

In [118]:
secondary_counts

{'art/film/books_art/film/books': 0,
 'art/film/books_childhood/family/education': 0,
 'art/film/books_community programs/services': 252,
 'art/film/books_company/benefits/health insurance': 35,
 'art/film/books_family/home/work-life': 0,
 'art/film/books_government/politics': 122,
 'art/film/books_justice-/penitentiary system': 90,
 'art/film/books_medical research': 133,
 'art/film/books_police/shootings/murder': 56,
 'art/film/books_sociological trends': 393,
 'art/film/books_sports/games': 0,
 'art/film/books_therapy/treatment/medication': 151,
 'art/film/books_veterans/military/war': 101,
 'art/film/books_violence/assault/abuse': 47,
 'childhood/family/education_art/film/books': 559,
 'childhood/family/education_childhood/family/education': 0,
 'childhood/family/education_community programs/services': 339,
 'childhood/family/education_company/benefits/health insurance': 118,
 'childhood/family/education_family/home/work-life': 1230,
 'childhood/family/education_government/politics

In [119]:
secondary = pd.DataFrame(secondary_counts,index=np.arange(0,15))

In [120]:
secondary = pd.DataFrame(secondary.iloc[0])

In [121]:
secondary.rename(columns={0:'count'},inplace=True)

In [122]:
secondary

Unnamed: 0,count
art/film/books_art/film/books,0
art/film/books_childhood/family/education,0
art/film/books_community programs/services,252
art/film/books_company/benefits/health insurance,35
art/film/books_family/home/work-life,0
art/film/books_government/politics,122
art/film/books_justice-/penitentiary system,90
art/film/books_medical research,133
art/film/books_police/shootings/murder,56
art/film/books_sociological trends,393


In [123]:
secondary.to_csv('pickled_files/secondary_topic_counts.csv',sep=',')

## Aggregate over Years

#### Aggregate Strength of Topic in Articles (for topic weight >15%)

In [124]:
years_strength = new.drop(['article_text','article_text_nostop_extra','string','topics','pub_date'],axis=1).groupby('pub_year').sum()

In [125]:
years_strength

Unnamed: 0_level_0,sports/games,childhood/family/education,family/home/work-life,art/film/books,government/politics,justice-/pentitentiary system,violence/assault/abuse,medical research,therapy/treatment/medication,community programs/services,police/shootings/murder,sociological trends,veterans/military/war,company/benefits/health insurance
pub_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1981,16.483514,65.502655,49.598769,33.641154,35.047634,67.21676,4.703442,41.135209,35.496129,64.301655,34.280758,51.258898,27.591931,35.485021
1982,8.553854,40.763294,37.737641,31.953899,27.301618,112.480395,5.465624,46.320115,47.752636,60.160796,27.615939,50.488309,10.725679,36.857666
1983,10.633975,42.602385,37.755199,27.59363,34.737683,42.097333,4.032842,36.429499,35.809124,54.570961,15.786584,42.813292,12.977673,37.344833
1984,6.371284,38.721014,30.795523,27.877314,44.090089,38.602207,5.448517,30.177675,29.482374,61.155269,18.738675,41.330741,12.400311,36.599539
1985,7.116257,42.703924,34.374297,22.005622,14.084767,43.511498,6.561561,35.091722,38.894136,61.648486,25.761245,40.34363,10.62778,33.921182
1986,8.097611,50.680197,39.326893,27.484678,13.400504,47.681132,2.928119,36.517889,38.937132,65.46717,38.513197,42.813785,17.263157,30.014396
1987,9.976655,52.711736,48.42073,28.575394,20.685165,61.803006,3.845595,46.290717,46.185752,82.73442,29.327362,35.298626,14.677705,31.251901
1988,10.96962,38.188781,50.724044,20.824718,27.840755,38.883333,3.789648,49.569357,57.866777,75.257276,20.643233,48.941243,16.696451,34.237939
1989,5.950016,49.215885,42.216194,21.830924,15.333146,40.359635,6.430481,40.087068,49.614758,67.290466,25.16568,30.814453,10.832559,43.185972
1990,8.057935,53.556403,39.105605,22.537743,22.227673,35.442247,7.898641,34.321187,44.235219,57.110957,18.849541,33.758329,13.236684,34.540421


### Aggregate Count of Topic in Articles (for topic weight >15%)

### Supplement with of Articles per Year

In [126]:
years_count = new[['art/film/books','pub_year']].groupby('pub_year').count()

In [127]:
years_count.rename(columns={'art/film/books':'article_count'},inplace=True)

In [128]:
years_count

Unnamed: 0_level_0,article_count
pub_year,Unnamed: 1_level_1
1981,732
1982,705
1983,569
1984,553
1985,547
1986,600
1987,680
1988,649
1989,579
1990,555


In [129]:
years_strength = years_strength.join(years_count)

In [130]:
years_strength

Unnamed: 0_level_0,sports/games,childhood/family/education,family/home/work-life,art/film/books,government/politics,justice-/pentitentiary system,violence/assault/abuse,medical research,therapy/treatment/medication,community programs/services,police/shootings/murder,sociological trends,veterans/military/war,company/benefits/health insurance,article_count
pub_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1981,16.483514,65.502655,49.598769,33.641154,35.047634,67.21676,4.703442,41.135209,35.496129,64.301655,34.280758,51.258898,27.591931,35.485021,732
1982,8.553854,40.763294,37.737641,31.953899,27.301618,112.480395,5.465624,46.320115,47.752636,60.160796,27.615939,50.488309,10.725679,36.857666,705
1983,10.633975,42.602385,37.755199,27.59363,34.737683,42.097333,4.032842,36.429499,35.809124,54.570961,15.786584,42.813292,12.977673,37.344833,569
1984,6.371284,38.721014,30.795523,27.877314,44.090089,38.602207,5.448517,30.177675,29.482374,61.155269,18.738675,41.330741,12.400311,36.599539,553
1985,7.116257,42.703924,34.374297,22.005622,14.084767,43.511498,6.561561,35.091722,38.894136,61.648486,25.761245,40.34363,10.62778,33.921182,547
1986,8.097611,50.680197,39.326893,27.484678,13.400504,47.681132,2.928119,36.517889,38.937132,65.46717,38.513197,42.813785,17.263157,30.014396,600
1987,9.976655,52.711736,48.42073,28.575394,20.685165,61.803006,3.845595,46.290717,46.185752,82.73442,29.327362,35.298626,14.677705,31.251901,680
1988,10.96962,38.188781,50.724044,20.824718,27.840755,38.883333,3.789648,49.569357,57.866777,75.257276,20.643233,48.941243,16.696451,34.237939,649
1989,5.950016,49.215885,42.216194,21.830924,15.333146,40.359635,6.430481,40.087068,49.614758,67.290466,25.16568,30.814453,10.832559,43.185972,579
1990,8.057935,53.556403,39.105605,22.537743,22.227673,35.442247,7.898641,34.321187,44.235219,57.110957,18.849541,33.758329,13.236684,34.540421,555


### To pickle

In [131]:
# Save to pickle
with open('pickled_files/topics_years_strength_final.pickle','wb') as file:
    years_strength.to_pickle(file)

## Tableau Export

In [132]:
years_strength.reset_index(inplace=True)

In [133]:
years_strength.head()

Unnamed: 0,pub_year,sports/games,childhood/family/education,family/home/work-life,art/film/books,government/politics,justice-/pentitentiary system,violence/assault/abuse,medical research,therapy/treatment/medication,community programs/services,police/shootings/murder,sociological trends,veterans/military/war,company/benefits/health insurance,article_count
0,1981,16.483514,65.502655,49.598769,33.641154,35.047634,67.21676,4.703442,41.135209,35.496129,64.301655,34.280758,51.258898,27.591931,35.485021,732
1,1982,8.553854,40.763294,37.737641,31.953899,27.301618,112.480395,5.465624,46.320115,47.752636,60.160796,27.615939,50.488309,10.725679,36.857666,705
2,1983,10.633975,42.602385,37.755199,27.59363,34.737683,42.097333,4.032842,36.429499,35.809124,54.570961,15.786584,42.813292,12.977673,37.344833,569
3,1984,6.371284,38.721014,30.795523,27.877314,44.090089,38.602207,5.448517,30.177675,29.482374,61.155269,18.738675,41.330741,12.400311,36.599539,553
4,1985,7.116257,42.703924,34.374297,22.005622,14.084767,43.511498,6.561561,35.091722,38.894136,61.648486,25.761245,40.34363,10.62778,33.921182,547


In [134]:
years_strength.to_csv('pickled_files/tableau_years_strength_final.csv')