[All the news](https://components.one/datasets/#all-the-news)
143,000 articles from 15 American publications

In [23]:
import gensim
# Display gensim's doc2vec FAST_VERSION flag (the recorded output is 1).
# NOTE(review): presumably a sanity check that gensim's compiled (fast)
# training routines are in use -- confirm the flag's meaning in the gensim docs.
gensim.models.doc2vec.FAST_VERSION

1

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from collections import namedtuple
from corpus_builder import CorpusBuilder
import csv
import os
import sys
from tqdm import tqdm
from util import MALLET_PATH, show_topic_model

# Raise the csv module's per-field size cap so oversized cells in the articles
# CSV can be parsed. The call returns the previous limit, which is the 131072
# shown as this cell's output.
csv.field_size_limit(sys.maxsize)

131072

In [3]:
# This data is available at https://components.one/datasets/#all-the-news
path = './data/all-the-news/all_the_news_v2.csv'

In [4]:
# One record per CSV row; the field order mirrors the column order that the
# loading cell expects.
Article = namedtuple(
    'Article',
    [
        'id',
        'title',
        'author',
        'date',
        'content',
        'year',
        'month',
        'publication',
        'url',
        'length',
    ],
)

In [5]:
# Load the CSV at `path` into `articles`, one Article per well-formed data row.
#
# A row is rejected (and counted in `err_count`) when it does not have exactly
# one cell per Article field, or when its year/month/length cells are not
# whole numbers. `line_count` counts every row read, including the header.
def _whole_number(cell):
    """Parse a numeric cell such as '2016.0' (or '2016') into an int.

    Raises:
        ValueError: if the cell is blank, non-numeric, or not a whole number.
    """
    number = float(cell)
    # is_integer() is False for nan/inf as well, so those are rejected here
    # instead of surfacing as a different exception type from int().
    if not number.is_integer():
        raise ValueError("not a whole number: %r" % (cell,))
    return int(number)


articles = []
err_count = 0
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handed to the parser untouched.
with open(path, mode='r', encoding='UTF-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # The first row holds the column names; consume it but still count it.
    line_count = 0 if next(csv_reader, None) is None else 1
    for row in csv_reader:
        line_count += 1
        try:
            if len(row) != len(Article._fields):
                raise ValueError(
                    "expected %d cells, got %d" % (len(Article._fields), len(row))
                )
            # Build the record by field name rather than unpacking into
            # notebook-level variables, which used to rebind the builtin `id`.
            record = Article._make(cell.strip() for cell in row)
            article = record._replace(
                year=_whole_number(record.year),
                month=_whole_number(record.month),
                length=_whole_number(record.length),
            )
        except ValueError:
            # Best-effort load: malformed rows are tallied, not raised.
            err_count += 1
        else:
            articles.append(article)
    print(f'Processed {line_count} lines with {err_count} errors.')

Processed 146151 lines with 6777 errors.


In [6]:
# Tally how many loaded articles fall in each publication year.
year2freq = {}
for article in tqdm(articles):
    year2freq[article.year] = year2freq.get(article.year, 0) + 1

100%|██████████| 139373/139373 [00:00<00:00, 1165374.31it/s]


In [7]:
# Display the per-year article counts computed above.
year2freq

{2000: 1,
 2003: 2,
 2004: 1,
 2005: 2,
 2007: 1,
 2008: 2,
 2009: 3,
 2010: 5,
 2011: 7,
 2012: 27,
 2013: 452,
 2014: 2579,
 2015: 7986,
 2016: 79396,
 2017: 48909}

In [8]:
# Keep only the body text of articles dated 2016 or 2017.
texts = [item.content for item in articles if item.year in (2016, 2017)]

In [9]:
%%time

# Build (or reload) the phrasers, vocabulary and dictionary, then encode the
# corpus for topic modelling.
DATA_DIRECTORY = './data/all-the-news'
MODEL_DIRECTORY = './model/all-the-news'
# True: retrain and re-save every artifact (about 1.5 hours in the recorded run).
# False: reuse the artifacts previously saved by a refresh run.
REFRESH = True

builder = CorpusBuilder(
    ndocs=len(texts),
    phrase_min_count=10, 
    vocabulary_size=50000,
    bigram_min_count=5,
    bigram_threshold=10,
    trigram_min_count=5,
    trigram_threshold=10,
    data_directory=DATA_DIRECTORY,
    model_directory=MODEL_DIRECTORY
)
if REFRESH:
    builder.train_phrasers(texts)    
    builder.save_phrasers()
    prepared_texts = builder.prepare_texts(texts)
    builder.build_vocabulary(prepared_texts, save=True)
    corpus = builder.build_corpus(prepared_texts)
    builder.build_dictionary(corpus, save=True)
builder.load_phrasers()
builder.load_vocabulary()
builder.load_dictionary()
if not REFRESH:
    # Without this branch `corpus` is never bound when REFRESH is False and the
    # encode step below raises NameError on a fresh kernel.
    # NOTE(review): assumes prepare_texts/build_corpus only need the phrasers
    # and vocabulary loaded just above -- confirm against CorpusBuilder.
    prepared_texts = builder.prepare_texts(texts)
    corpus = builder.build_corpus(prepared_texts)
encoded_corpus = builder.encode_corpus(corpus)

  0%|          | 4/128305 [00:00<1:08:59, 31.00it/s]

Streaming sentences.


100%|██████████| 128305/128305 [21:44<00:00, 98.36it/s]
  0%|          | 3/128305 [00:00<1:28:04, 24.28it/s]

Streaming sentences.


100%|██████████| 128305/128305 [30:28<00:00, 70.17it/s]
100%|██████████| 128305/128305 [25:11<00:00, 84.89it/s]
  0%|          | 186/128305 [00:00<01:08, 1856.92it/s]

Building vocabulary over 128305 documents.


100%|██████████| 128305/128305 [00:38<00:00, 3367.73it/s]
100%|██████████| 128305/128305 [00:25<00:00, 5070.24it/s]


CPU times: user 1h 25min 26s, sys: 1min 13s, total: 1h 26min 39s
Wall time: 1h 26min 48s


In [12]:
%%time

from gensim.models.wrappers import LdaMallet
from gensim import utils, matutils

# Number of topics to fit; also embedded in the saved model's file name.
num_topics = 128

# Fit the topic model with the MALLET wrapper on the encoded corpus
# (about 56 minutes of wall time in the recorded run).
model = LdaMallet(
    corpus=encoded_corpus,
    id2word=builder.dictionary,
    num_topics=num_topics,
    mallet_path=MALLET_PATH,
)

CPU times: user 3min 56s, sys: 6.87 s, total: 4min 3s
Wall time: 55min 56s


In [13]:
# Print the trained model's topics. In the recorded output each topic shows a
# line of single words followed by a line of underscore-joined phrases.
show_topic_model(model, builder.dictionary, use_phrasers=True)

Topic 0:
  state, states, california, texas, governor, law, virginia, florida, colorado, bill
  north_carolina, west_virginia, south_carolina, executive_director, new_mexico, rhode_island
Topic 1:
  china, chinese, japan, beijing, country, u.s., japanese, kim, taiwan, pyongyang
  north_korea, south_korea, united_states, hong_kong, north_korean, south_china_sea
Topic 2:
  wearing, fashion, wear, dress, hair, wore, clothes, style, shoes, model
  post_shared_by, t-shirt, t-shirts, photo_posted_by, inspired_by, her_husband
Topic 3:
  israel, jewish, israeli, jews, time, palestinians, jerusalem, netanyahu, palestinian, people
  west_bank, united_states, anti-semitism, two-state_solution, tel_aviv, middle_east
Topic 4:
  company, sales, products, customers, market, business, consumers, companies, amazon, price
  less_than, whole_foods, wal-mart, chief_executive, two_years, e-commerce
Topic 5:
  trump, support, vote, republican, president, nominee, romney, party, told, mccain
  donald_trump, 

  president, voice, accused, hold, members, breaking, watch, people, refused, lift
  get_worse, few_breaks, professional_opinion_about, stealing_police_cars, sue_finley_was, hired_at_jpl
Topic 50:
  year, time, end, month, long, early, january, years, november, past
  two_years, next_year, last_month, less_than, two_months, this_week
Topic 51:
  apple, google, phone, app, devices, iphone, device, company, microsoft, data
  your_phone, iphone_7, apple_watch, pokémon_go, san_bernardino, tech_companies
Topic 52:
  french, france, paris, de, macron, nice, la, people, brian, le
  le_pen, national_front, marine_le_pen, de_la, his_wife, emmanuel_macron
Topic 53:
  report, reports, reported, found, released, time, notes, information, published, made
  last_month, earlier_this_month, state_department, last_october, crimes_against, an_attempt
Topic 54:
  church, god, christian, faith, people, religious, christians, religion, pope, life
  pope_francis, catholic_church, told_me, lopez_:, jesus_chr

  people, dog, dogs, time, didn, man, cooper, cat, don, wanted
  ve_been, ended_up, last_week, sex_workers, ve_seen, told_me
Topic 101:
  money, bank, pay, banks, financial, cash, paid, million, debt, business
  wall_street, wells_fargo, goldman_sachs, financial_crisis, deutsche_bank, how_much
Topic 102:
  ms., campaign, president, added, mr, washington, called, american, friday, made
  mr._trump, mrs._clinton, mr._obama, united_states, donald_j._trump, an_interview
Topic 103:
  trump, campaign, president, election, supporters, presidency, donald, trump., republican, comments
  donald_trump, president-elect, white_house, president-elect_donald_trump, trump_tower, hillary_clinton
Topic 104:
  don, people, doesn, ll, didn, ve, lot, things, good, isn
  re_going, ve_been, ve_got, talk_about, m_going, talking_about
Topic 105:
  film, movie, show, character, series, story, movies, films, characters, hollywood
  star_wars, star_trek, wonder_woman, tv_series, real-life, tv_shows
Topic 106:
  p

In [19]:
def save_model(model, tag, directory=MODEL_DIRECTORY):
    """Save a topic model to ``<directory>/model-<tag>.pkl``.

    Args:
        model: Object exposing ``save(path)``; here the trained LdaMallet model.
        tag: Run identifier that is embedded in the file name.
        directory: Target directory. It is created first if it does not exist.
    """
    if directory:
        # Make sure the target directory exists so the save below does not fail
        # on a fresh checkout; an empty `directory` means the current directory.
        os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, "model-%s.pkl" % tag)
    print("Saving topic model to %s." % path)
    model.save(path)

def load_model(tag, directory=MODEL_DIRECTORY):
    """Load the topic model stored at ``<directory>/model-<tag>.pkl``.

    Args:
        tag: Run identifier that was used when the model was saved.
        directory: Directory holding the saved model file.

    Returns:
        Whatever ``LdaMallet.load`` returns for that file.
    """
    # NOTE(review): the ".pkl" suffix suggests a pickle-based format, so only
    # load files you created yourself -- confirm against the gensim docs.
    model_file = os.path.join(directory, "model-%s.pkl" % tag)
    print("Loading topic model from %s." % model_file)
    return LdaMallet.load(model_file)

# File-name tag recording the corpus and model settings of this run, in order:
# ndocs, vocabulary size, bigram min count/threshold, trigram min
# count/threshold, number of topics.
tag_fields = (
    builder.ndocs,
    builder.vocabulary_size,
    builder.bigram_min_count,
    builder.bigram_threshold,
    builder.trigram_min_count,
    builder.trigram_threshold,
    num_topics,
)
tag = "%d-%d-%d-%d-%d-%d-%d" % tag_fields
save_model(model, tag, directory=MODEL_DIRECTORY)
#show_topic_model(model, builder.dictionary, use_phrasers=True)

Saving topic model to ./model/all-the-news/model-128305-50000-5-10-5-10-128.pkl.


In [21]:
# Round-trip check: reload the model that was just saved and print its topics,
# so they can be compared with the in-memory model's topics shown earlier.
loaded_model = load_model(tag, directory=MODEL_DIRECTORY)
show_topic_model(loaded_model, builder.dictionary, use_phrasers=True)

Loading topic model from ./model/all-the-news/model-128305-50000-5-10-5-10-128.pkl.
Topic 0:
  state, states, california, texas, governor, law, virginia, florida, colorado, bill
  north_carolina, west_virginia, south_carolina, executive_director, new_mexico, rhode_island
Topic 1:
  china, chinese, japan, beijing, country, u.s., japanese, kim, taiwan, pyongyang
  north_korea, south_korea, united_states, hong_kong, north_korean, south_china_sea
Topic 2:
  wearing, fashion, wear, dress, hair, wore, clothes, style, shoes, model
  post_shared_by, t-shirt, t-shirts, photo_posted_by, inspired_by, her_husband
Topic 3:
  israel, jewish, israeli, jews, time, palestinians, jerusalem, netanyahu, palestinian, people
  west_bank, united_states, anti-semitism, two-state_solution, tel_aviv, middle_east
Topic 4:
  company, sales, products, customers, market, business, consumers, companies, amazon, price
  less_than, whole_foods, wal-mart, chief_executive, two_years, e-commerce
Topic 5:
  trump, support

  didn, told, time, wanted, knew, thought, asked, night, day, felt
  told_me, felt_like, next_day, talked_about, no_idea, looked_like
Topic 47:
  post, brooklyn, city, manhattan, mayor, train, told, sources, nypd, cuomo
  new_york, new_york_city, new_jersey, de_blasio, new_yorkers, long_island
Topic 48:
  moment, sense, world, man, power, end, makes, long, moments, turn
  many_ways, no_doubt, no_matter_how, ever_since, apart_from, penchant_for
Topic 49:
  president, voice, accused, hold, members, breaking, watch, people, refused, lift
  get_worse, few_breaks, professional_opinion_about, stealing_police_cars, sue_finley_was, hired_at_jpl
Topic 50:
  year, time, end, month, long, early, january, years, november, past
  two_years, next_year, last_month, less_than, two_months, this_week
Topic 51:
  apple, google, phone, app, devices, iphone, device, company, microsoft, data
  your_phone, iphone_7, apple_watch, pokémon_go, san_bernardino, tech_companies
Topic 52:
  french, france, paris, de

  city, people, local, cities, community, residents, town, area, neighborhood, live
  san_francisco, los_angeles, puerto_rico, city_council, grew_up, new_orleans
Topic 94:
  guy, good, thing, man, things, kind, real, bad, joke, love
  gon_na, looks_like, better_than, talking_about, i_guess, pretty_much
Topic 95:
  black, white, people, race, racist, racism, color, diversity, history, americans
  african-american, african-americans, black_lives_matter, black_men, white_men, black_man
Topic 96:
  law, rules, policy, government, rule, laws, act, federal, order, legal
  comply_with, federal_law, would_require, into_effect, federal_agencies, title_ix
Topic 97:
  number, year, data, average, numbers, compared, rate, population, increased, americans
  more_likely, less_than, higher_than, recent_years, more_than_half, 40_percent
Topic 98:
  isis, iraq, city, mosul, syria, fighting, group, forces, civilians, militants
  islamic_state, iraqi_forces, security_forces, backed_by, isis_fighters, ira