In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import matplotlib.pyplot as plt
%matplotlib inline  

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [2]:
from tmwt.corpus import Builder
from tmwt.util import *

In [80]:
import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from gensim import similarities
from nltk.tokenize import sent_tokenize, word_tokenize
import random

## 1. Building the WikiPedia Corpus

In [74]:
%%time
records = load_records(path='/opt/code/github/Topic-Modeling-Without-Tears/data/kb-50K.tsv')
texts = [text for record_id, text in records]
print("# records:", len(records))

Loaded 116231 records with 25236 errors.
CPU times: user 21.6 s, sys: 248 ms, total: 21.9 s
Wall time: 22 s


In [78]:
%%time

REFRESH = True

builder = Builder(
    ndocs=len(texts),
    phrase_min_count=3, 
    vocabulary_size=10000,
    bigram_min_count=3,
    bigram_threshold=10,
    trigram_min_count=3,
    trigram_threshold=10,
    data_directory=DATA_DIRECTORY,
    model_directory=MODEL_DIRECTORY
)
if REFRESH:
    builder.train_phrasers(texts)    
    builder.save_phrasers()
    prepared_texts = builder.prepare_texts(texts)
    builder.build_vocabulary(prepared_texts, save=True)
    corpus = builder.build_corpus(prepared_texts)
    builder.build_dictionary(corpus, save=True)
builder.load_phrasers()
builder.load_vocabulary()
builder.load_dictionary()
encoded_corpus = builder.encode_corpus(corpus)

builder.save_dictionary(path=DICTIONARY_PATH)

Streaming text for bigram phraser  ...: 100%|██████████| 116231/116231 [03:26<00:00, 562.03it/s]
Streaming text for trigram phraser ...: 100%|██████████| 116231/116231 [04:53<00:00, 395.94it/s]
Preparing texts ...: 100%|██████████| 116231/116231 [04:15<00:00, 454.52it/s]
Building vocabulary over 116231 documents.: 100%|██████████| 116231/116231 [00:04<00:00, 27510.04it/s]
Building corpus ...: 100%|██████████| 116231/116231 [00:03<00:00, 35018.70it/s]


CPU times: user 14min 26s, sys: 11.2 s, total: 14min 37s
Wall time: 14min 39s


## 2. Train a Topic Model

In [81]:
%%time

num_topics = 128
model = LdaMallet(
    mallet_path=MALLET_PATH, 
    corpus=encoded_corpus,
    num_topics=num_topics, 
    id2word=builder.dictionary
)

model.save(MALLET_MODEL_PATH)

CPU times: user 20.5 s, sys: 613 ms, total: 21.2 s
Wall time: 6min 19s


In [82]:
dictionary = Dictionary.load(DICTIONARY_PATH) 
model = LdaMallet.load(MALLET_MODEL_PATH)
show_topic_model(model, dictionary, use_phrasers=True)

Topic 0:
  including, include, includes, major, numerous, number, primarily, notable, addition, campbell
  as_well_as, a_variety, addition_to, there_are, many_other, several_other
Topic 1:
  form, similar, forms, structure, called, type, single, common, found, include
  they_are, such_as, there_are, each_other, also_called, rather_than
Topic 2:
  television, radio, show, network, channel, broadcast, bbc, tv, shows, live
  owned_by, broadcast_on, hosted_by, is_an_american, radio_station, radio_stations
Topic 3:
  greek, africa, turkey, country, egypt, greece, african, lebanon, turkish, syria
  middle_east, north_africa, ottoman_empire, as_well_as, west_africa, due_to
Topic 4:
  earth, sun, space, star, moon, stars, light, mass, surface, mission
  to_be, solar_system, named_after, away_from, due_to, larger_than
Topic 5:
  country, iraq, status, citizens, free, countries, official, issued, registered, passport
  saudi_arabia, according_to, united_arab_emirates, to_be, as_well_as, there_ar

  island, bay, islands, sea, colorado, point, beach, coast, part, peninsula
  new_mexico, pacific_ocean, atlantic_ocean, east_coast, an_area, west_coast
Topic 64:
  class, ship, ships, built, service, navy, uss, fleet, launched, commissioned
  royal_navy, united_states_navy, launched_on, during_world_war, world_war_ii, named_after
Topic 65:
  season, team, nfl, game, playoffs, record, games, nhl, franchise, played
  national_football_league, national_hockey_league, regular_season, super_bowl, he_played_college, former_american_football
Topic 66:
  john, david, michael, paul, robert, wilson, bell, mark, miller, thompson
  is_an_american, best_known_for, is_best_known, known_for, has_worked, also_worked
Topic 67:
  film, films, theatre, actor, director, movie, actress, television, producer, role
  directed_by, is_an_american, produced_by, based_on, was_nominated_for, same_name
Topic 68:
  club, played, season, manager, side, league, appearances, goals, liverpool, playing
  football_leagu

  species, food, found, family, animals, native, fish, tree, plant, plants
  such_as, they_are, have_been, years_ago, derived_from, ranging_from
Topic 126:
  tournament, held, final, competition, won, played, event, championship, cup, open
  ended_on, took_place, singles_title, defending_champions, took_place_at, played_on_outdoor
Topic 127:
  study, research, science, field, theory, philosophy, development, studies, fields, work
  concerned_with, deals_with, to_understand, focused_on, based_on, nobel_prize


In [83]:
lda = malletmodel2ldamodel(model, gamma_threshold=0.001, iterations=50)
lda.save(GENSIM_MODEL_PATH)
lda = LdaModel.load(GENSIM_MODEL_PATH)
show_topic_lda(lda, dictionary, use_phrasers=True)

Topic 0:
  including, include, includes, major, numerous, number, primarily, notable, addition, campbell
  as_well_as, a_variety, addition_to, there_are, many_other, several_other
Topic 1:
  form, similar, forms, structure, called, type, single, common, found, include
  they_are, such_as, there_are, each_other, also_called, rather_than
Topic 2:
  television, radio, show, network, channel, broadcast, bbc, tv, shows, live
  owned_by, broadcast_on, hosted_by, is_an_american, radio_station, radio_stations
Topic 3:
  greek, africa, turkey, country, egypt, greece, african, lebanon, turkish, syria
  middle_east, north_africa, ottoman_empire, as_well_as, west_africa, due_to
Topic 4:
  earth, sun, space, star, moon, stars, light, mass, surface, mission
  to_be, solar_system, named_after, away_from, due_to, larger_than
Topic 5:
  country, iraq, status, citizens, free, countries, official, issued, registered, passport
  saudi_arabia, according_to, united_arab_emirates, to_be, as_well_as, there_ar

  development, project, program, health, support, environment, provide, resources, policy, projects
  access_to, to_improve, to_promote, to_develop, to_create, health_care
Topic 51:
  typically, process, specific, person, generally, rules, individual, practice, provide, case
  such_as, may_be, can_be, for_example, or_more, they_are
Topic 52:
  small, long, large, body, size, smaller, length, wood, called, similar
  also_known_as, attached_to, also_called, can_be, could_be, to_create
Topic 53:
  house, members, council, parliament, elected, member, election, seat, assembly, representatives
  legislative_assembly, national_assembly, legislative_council, lower_house, upper_house, general_election
Topic 54:
  team, played, football, teams, players, international, rugby, sport, competition, play
  world_cup, national_team, rugby_union, they_have, is_controlled_by, national_football_team
Topic 55:
  march, january, october, early, april, september, june, august, december, november
  that_yea

Topic 100:
  played, play, game, test, ball, match, cricket, lead, players, runs
  west_indies, who_played, county_championship, to_take, cricket_club, he_scored
Topic 101:
  season, team, record, games, played, baseball, game, series, al, franchise
  national_league, world_series, major_league_baseball, american_league, new_york_yankees, major_league
Topic 102:
  division, fourth, place, arizona, total, ten, top, 2nd, seventh, sixth
  first_time, along_with, prior_to, tied_for, top_three, one_point
Topic 103:
  main, part, including, local, portugal, portuguese, made, hamilton, included, ring
  new_zealand, known_as, as_well_as, due_to, they_were, commonly_known_as
Topic 104:
  national, association, members, member, organization, founded, international, federation, society, organisation
  affiliated_with, a_founding_member, an_independent, scout_movement, formerly_known_as, governing_body
Topic 105:
  region, province, area, regions, areas, provinces, territory, capital, cities, muni

## 3. Build Similarity Index

- [Similarity Queries](https://radimrehurek.com/gensim/tut3.html)

In [84]:
%%time

vectors = lda[encoded_corpus]
index = similarities.MatrixSimilarity(vectors)

index.save(SIMILARITY_INDEX_PATH)

CPU times: user 10min 26s, sys: 2.92 s, total: 10min 29s
Wall time: 2min 38s


In [85]:
index = similarities.MatrixSimilarity.load(SIMILARITY_INDEX_PATH)

## 4 Build Entity Linker

In [100]:
class EntityLinker(object):
    def __init__(
        self, 
        directory=MODEL_DIRECTORY,
        similarity_threshold=0.6,
        n_suggestions=7

    ):
        path = os.path.join(directory, 'dictionary.pkl')
        self.dictionary = Dictionary.load(path)   
        
        path = os.path.join(directory, 'gensim.model')
        self.model = LdaMallet.load(path)
        
        path = os.path.join(directory, 'similarity.index')
        self.index = similarities.MatrixSimilarity.load(path)
        
        self.bigram_phraser, self.trigram_phraser = load_phrasers(directory)
        
        self.similarity_threshold = similarity_threshold
        self.n_suggestions = n_suggestions


    def tokenize(self, text):
        return [token.lower() for token in word_tokenize(text)]

    def __call__(self, text):
        text = clean(text)
        tokens = self.tokenize(text)
        tokens = self.bigram_phraser[tokens]
        tokens = self.trigram_phraser[tokens]
        bow = self.dictionary.doc2bow(tokens)
        vector = self.model[bow]
        sims = self.index[vector]
        sims = enumerate(sims)
        sims = sorted(sims, key=lambda item:-item[-1])
        suggestions = []
        cnt = 0
        seen = set([])
        for i, sim in sims:
            title, summary = records[i][0], records[i][1]
            #print("%d. [%0.2f] %s/%s" % (cnt + 1, sim, cat, subcat))
            if cnt > 0 and (cnt >= self.n_suggestions or sim < self.similarity_threshold): break
            if title in seen: continue
            cnt += 1
            seen.add(title)
            suggestions.append((sim, title, summary))
        return suggestions

linker = EntityLinker(similarity_threshold=0.6, n_suggestions=7)

In [103]:
N = 200
indices = range(len(records))

In [109]:
index = random.choice(indices)
title, context = records[index]
print("[%d] %s: %s" % (index, title, context[:N]))
for sim, title, summary in linker(context):
    print("[%0.2f] %s\n%s" % (sim, title, summary[:N]))
    

[102722] Peer Nielsen: Peer Nielson (sometimes shown as Peer Noorbohm, born June 25, 1942) is a Danish sprint canoer who competed in the early 1960s. He won the bronze medal in the C-2 1000 m event at the 1964 Summer Olympi
[1.00] Peer Nielsen
Peer Nielson (sometimes shown as Peer Noorbohm, born June 25, 1942) is a Danish sprint canoer who competed in the early 1960s. He won the bronze medal in the C-2 1000 m event at the 1964 Summer Olympi
[0.82] Hans Nilsson (canoeist)
Hans Nilsson (born September 5, 1946) is a Swedish sprint canoer who competed in the late 1960s and early 1970s. He won a bronze medal in the K-4 10000 m event at the 1970 ICF Canoe Sprint World Champ
[0.82] Petra Tierlich
Petra Tierlich, born 25 February 1945, is an East German luger who competed in the 1960s. She won three medals in the women's singles event at the FIL World Luge Championships with a gold in 1969 and 
[0.79] Ingela Ericsson
Ingela Ericsson (born September 27, 1968) is a Swedish sprint canoer who comp

In [108]:
context = "Before she joined the US Senate, Amy Klobuchar spent much of her career locking people up as the prosecutor for Hennepin County, Minnesota. But if she’s elected president, the Democrat has vowed to enact reforms within a month that could free thousands, if not tens of thousands, of people from federal prisons — and she won’t even need Congress to do it.".strip()
#context = "And now Klobuchar wants to use that power, much as President Barack Obama did toward the end of his term, to roll back mass incarceration and the war on drugs. By setting up a new system for clemency as soon as possible, she aims to release thousands of people with overly long prison sentences who’ve shown signs of rehabilitation."
for sim, title, summary in linker(context):
    print("[%0.2f] %s\n%s" % (sim, title, summary[:N]))

[0.74] Hurricane Diana
Hurricane Diana was the deadliest tropical cyclone during the 1990 Atlantic hurricane season, killing 139 people in Mexico. Forming out of a tropical wave on August 4, the system brushed Honduras befo
[0.74] Hurricane Karen (2007)
Hurricane Karen was the eleventh named storm and fourth hurricane of the 2007 Atlantic hurricane season. Karen was a Cape Verde-type hurricane that developed in the eastern tropical Atlantic out of a 
[0.72] Hurricane Iris (1995)
Hurricane Iris was the first of three tropical cyclones to affect the Lesser Antilles in a three-week period, preceding the more destructive hurricanes Luis and Marilyn. The ninth named storm and fift
[0.72] 2001 Atlantic hurricane season
The 2001 Atlantic hurricane season was a fairly active Atlantic hurricane season that produced 17 tropical cyclones, 15 named storms, nine hurricanes, and four major hurricanes. The season officially 
[0.72] 2004 Atlantic hurricane season
The 2004 Atlantic hurricane season was