Notebook based on code by Jonathon Mugan

https://www.udemy.com/complete-nlp-course/

In [53]:
import numpy as np

In [11]:
#import spacy
from spacy.lang.en import English

In [24]:
#cleaning words and return a list of tokens
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        elif '@' in token.orth_:
            lda_tokens.append('EMAIL')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [14]:
#demo
sent = '@bob said the #chicken was at the #junkyard. See http://www.jonathanmugan.com.'
out_tokens = tokenize(sent)
print(out_tokens)

['SCREEN_NAME', 'said', 'the', '#', 'chicken', 'was', 'at', 'the', '#', 'junkyard', '.', 'see', 'URL', '.']


In [15]:
# Lemmatization

from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else: 
        return lemma
    
# or can use this
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [17]:
import nltk
en_stop = set(nltk.corpus.stopwords.words('english'))

In [18]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [22]:
# The Newsgroup Data
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
from sklearn.datasets import fetch_20newsgroups
texts = fetch_20newsgroups(subset='train')
print(dir(texts))
print(len(texts.target)) # 11,314 posts
print(texts.target)
print(texts.target_names)
print(texts.data[0])

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']
11314
[7 4 4 ... 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name,

In [25]:
# get the data
import random
text_data = []
for text in  texts.data:
    tokens = prepare_text_for_lda(text)
    if random.random() > .99:
        print(tokens)
    text_data.append(tokens)


['EMAIL', 'annick', 'ansselin', 'subject', 'sensitivity', 'superstition', 'posting', 'organization', 'department', 'physiology', 'university', 'sydney', 'australia', 'line', 'EMAIL', 'EMAIL', 'steve', 'giammarco', 'write', 'flame', 'years', 'natural', 'extract', 'source', 'mention', 'cause', 'report', 'aftereffect', 'nasty', 'artificial', 'extract', 'whatever', 'cause', 'chinese', 'restaurant', 'syndrome', 'pretty', 'believe', 'anyone', 'hear', 'sodium', 'glutamate', 'fairly', 'straight', 'forward', 'compound', 'source', 'problem', 'comment', 'suggest', 'impurity', 'cause', 'experience', 'effects', 'double', 'blind', 'study', 'stuff', 'cause', 'rather', 'severe', 'effects', 'possibly', 'incorrect', 'assumption', 'food', 'processing', 'sugar', 'beet', 'source', 'ferment', 'cheese', 'mushroom', 'contain', 'react', 'sometimes', 'strongly', 'however', 'react', 'strongly', 'sodium', 'chloride', 'table', 'excess', 'cause', 'different', 'symptom', 'except', 'common', 'rapid', 'heartbeat', 'un

['EMAIL', 'eddie', 'tuggle', 'subject', 'svr4.0.3.6', 'forsale', 'keywords', 'organization', 'denver', 'operations', 'line', 'article', 'EMAIL', 'EMAIL', 'larry', 'snyder', 'write', 'EMAIL', 'daryl', 'mcdaniel', 'write', 'consider', 'conversation', 'would', 'consider', 'package', 'inflate', 'price', 'start', 'larry', 'snyder', 'EMAIL', 'problem', 'using', 'years', 'problem', 'eddie', 'tuggle', 'EMAIL', 'nothing', 'either', 'denver', 'operations', 'thinking', 'make', '16201', 'centretech', 'aurora', '80011', 'shakespeare', 'voice', '303.360.4001', '303.360.4133']
['EMAIL', 'roger', 'maynard', 'subject', 'names', 'organization', 'computer', 'science', 'laurentian', 'university', 'sudbury', 'distribution', 'line', 'EMAIL', 'EMAIL', 'deepak', 'chhabra', 'write', 'however', 'aside', 'question', 'whether', 'change', 'names', 'base', 'reason', 'given', 'making', 'easy', 'casual', 'whether', 'unique', 'divisional', 'names', 'base', 'individual', 'deserve', 'honour', 'latter', 'unique', 'touch'

['EMAIL', 'casper', 'davi./ppe', 'subject', 'american', 'jewish', 'congress', 'letter', 'clinton', 'software', 'vnews', 'organization', 'european', 'organization', 'nuclear', 'research', 'line', 'article', 'EMAIL', 'EMAIL', 'arromdee', 'write', 'article', 'EMAIL', 'EMAIL', 'casper', 'davi./ppe', 'write', 'complain', 'taking', 'point', 'bosnia', 'european', 'bosnians', 'morally', 'superior', 'serbian', 'imply', 'side', 'equal', 'sometimes', 'difficult', 'impossible', 'determine', 'victim', 'victim', 'exist', 'victim', 'exist', 'stagger', 'number', 'victim', 'world', 'think', 'balance', 'intervention', 'would', 'create', 'victim', 'include', 'american', 'since', 'first', 'responsibility', 'government', 'protect', 'american', 'think', 'serve', 'stay', 'bosnia', 'regional', 'conflict', 'would', 'atrocity', 'side', 'german', 'course', 'try', 'carve', 'territory', 'germany', 'either', 'except', 'small', 'scale', 'resistance', 'larger', 'uprising', 'government', 'people', 'butchering', 'centu

['EMAIL', 'lawnmowerman', 'subject', 'burns', 'dividian', 'ranch', 'survivor', 'keywords', 'thing', 'posting', 'reply', 'matthew', 'hamilton', 'organization', 'state', 'university', 'line', 'article', 'EMAIL', 'EMAIL', 'tavares', 'write', 'article', 'EMAIL', 'EMAIL', 'lawnmowerman', 'write', 'guess', 'shooting', 'baby', 'right', 'bastard', 'EMAIL', 'believe', 'speak', 'company', 'EMAIL', 'write', 'today', 'special', 'investor', 'packet', 'thanks', 'reply', 'never', 'never', 'right', 'shoot', 'baby', 'however', 'branch', 'davidian', 'people', 'insist', 'stay', 'savior', 'right', 'brain', 'wash', 'believing', 'truth', 'means', 'life', 'cause', 'therefore', 'david', 'fault', 'however', 'death', 'child', 'unlike', 'bastard', 'supposedly', '----------------------------------------------------------------------------+', 'matthew', 'hamilton', 'EMAIL', 'a.k.a', 'physics', 'major', 'EMAIL', 'lawnmowerman', 'state', 'university', 'EMAIL', '-------------------------------------------------------

['EMAIL', 'scott.g.crawford', 'subject', 'riding', 'mower', 'organization', 'distribution', 'keywords', 'ariens', 'riding', 'mower', 'line', 'ariens', 'riding', 'mower', 'mower', 'perfect', 'condition', 'contain', 'following', 'feature', 'electric', 'start', 'double', 'bagger', 'battery', 'engine', 'inflatable', 'tire', 'give', 'cushion', 'give', 'tune', 'blade', 'sharpen', 'month', 'move', 'house', 'small', 'grass', 'require', 'large', 'mower', 'engine', 'replace', 'rebuild', 'faulty', 'mower', 'repair', 'price', '600.00', 'phone', 'night', 'weekend']
['EMAIL', 'charlie', 'prael', 'subject', 'moonbase', 'organization', 'shakala', 'clanzen', 'radio', 'network', 'sunnyvale', 'line', 'EMAIL', 'mohney', 'write', 'allen', 'sometimes', 'think', 'sometimes', 'rashly', 'making', 'statement', 'without', 'thinking', 'wanna', 'guess', 'today', 'launch', 'large', 'amount', 'stuff', 'private', 'titan', 'doug--', 'actually', 'memory', 'serve', 'atlas', 'outgrowth', 'titan', 'probably', 'quite', 'al

['EMAIL', 'kellett', 'subject', 'education', 'organization', 'netcom', 'line', 'article', 'EMAIL', 'EMAIL', 'viveiros', 'write', 'seem', 'spend', 'significant', 'amount', 'correct', 'error', 'reliability', 'test', 'condom', 'abstinence', 'years', 'famous', 'study', 'show', 'failure', 'condom', 'study', 'show', 'failure', 'abstinence', 'adult', 'couple', 'rely', 'abstinence', 'prevent', 'pregnancy', 'pregnant', 'alarm', 'numbers', 'willpower', 'abstain', 'thinking', 'school', 'plan', 'parenthood', 'siecus', 'style', 'values', 'method', 'teach', 'contraceptive', 'technology', 'advise', 'choice', 'actually', 'increase', 'pregnancy', 'rates', 'post', 'article', 'happy', 'email', 'interest', 'article', 'include', 'source', 'contact', 'information', 'research', 'verify', 'statement', 'outstanding', 'source', 'acquiring', 'abstinence', 'relate', 'curriculum', 'single', 'quantity', 'research', 'produce', 'result', 'abstinence', 'relate', 'curriculum', 'found', 'decrease', 'pregnancy', 'rates',

['clinton', 'EMAIL', 'clinton', 'subject', 'clinton', 'president', 'radio', 'interview', 'pittsburgh', '4.17.93', 'organization', 'artificial', 'intelligence', 'line', 'posting', 'white', 'house', 'office', 'press', 'secretary', 'pittsburgh', 'pennsylvania', 'immediate', 'release', 'april', 'interview', 'president', 'michael', 'whitely', 'radio', 'pittsburgh', 'pittsburgh', 'international', 'airport', 'pittsburgh', 'pennsylvania', '10:40', 'everyone', 'listening', 'radio', 'whitely', 'radio', 'pittsburgh', 'international', 'airport', 'president', 'unite', 'state', 'clinton', 'welcome', 'president', 'thank', 'things', 'brief', 'amount', 'breaking', 'angeles', 'guess', 'entire', 'country', 'holding', 'breath', 'wonder', 'going', 'happen', 'trial', 'angeles', 'police', 'officer', 'hear', 'officer', 'sergeant', 'sergeant', 'officer', 'powell', 'found', 'guilty', 'officer', 'found', 'guilty', 'situation', 'building', 'since', 'first', 'trial', 'trial', 'verdict', 'wonder', 'thought', 'morni

['EMAIL', 'shafer', 'subject', 'crazy', 'imaginitive', 'reply', 'EMAIL', 'message', '04:54:03', 'organization', 'dryden', 'edwards', 'line', '04:54:03', 'EMAIL', 'nsmca', 'idea', 'nsmca', 'wilbur', 'orville', 'wright', 'quite', 'others', 'common', 'misconception', 'nothing', 'wright', 'brother', 'correspondance', 'number', 'experimenter', 'octave', 'chanute', 'lillienthal', 'model', 'tunnel', 'short', 'quite', 'mainstream', 'regard', 'eccentric', 'community', 'suggest', 'bishop', 'biography', 'harry', 'gates', 'comb', 'never', 'remember', 'own', 'learjet', 'print', 'easily', 'obtainable', 'bishop', 'trade', 'paperback', 'better', 'would', 'multi', 'volume', 'wright', 'writings', 'print', 'hideously', 'expensive', 'shafer', 'kotfr', 'dryden', 'flight', 'research', 'facility', 'edwards', 'EMAIL', 'course', 'speak', 'better', 'unknown', 'fighter', 'pilot']
['EMAIL', 'subject', 'russian', 'email', 'contact', 'organization', 'space', 'shuttle', 'program', 'office', 'newsreader', 'version', 

['EMAIL', 'subject', 'islam', 'frankenstien', 'line', 'posting', 'organization', 'mcgill', 'university', 'article', 'right', 'margin', 'answer', 'article', 'EMAIL', 'EMAIL', 'kaveh', 'smith', 'write', 'found', 'jewish', 'people', 'imagentative', 'creative', 'jewish', 'religion', 'foundation', 'christianity', 'islam', 'words', 'judaism', 'father', 'religion', 'islam', 'turn', 'father', 'ironic', 'communizem', 'threat', 'almost', 'religion', 'going', 'raise', 'thought', 'believing', 'unite', 'islam', 'believe', 'ebrahim', 'killing', 'frankenstien', 'story', 'going', 'happening', 'going', 'muslim', 'nuke', 'would', 'distroy', 'whole', 'world', 'would', 'since', 'kill', 'followers', 'believe', 'heaven', 'peacefull', 'ending', 'judaism', 'father', 'islam', 'prophets', 'judaism', 'ignore', 'prophets', 'later', 'prophets', 'include', 'jesus', 'christ', 'christian', 'muslim', 'believe', 'mohammed', 'believing', 'unite', 'peoples', 'however', 'christianity', 'islam', 'reflect', 'people', 'diffe

['EMAIL', 'bulent', 'murtezaoglu', 'subject', 'armenia', 'could', 'shoot', 'turkish', 'plane', 'reply', 'EMAIL', 'message', '16:45:17', 'posting', 'organization', 'computer', 'science', 'department', 'university', 'rochester', 'EMAIL', 'EMAIL', 'EMAIL', 'EMAIL', 'article', 'EMAIL', 'EMAIL', 'write', 'stuff', 'delete', 'country', 'turk', 'azeri', 'consistantly', 'armenia', 'karabakh', 'conflict', 'azerbaijan', 'gimme', 'break', 'capital', 'letters', 'nonsense', 'seem', 'short', 'sight', 'armenian', 'escalate', 'hostilities', 'hope', 'turkey', 'think', 'moment', 'armenia', 'anyone', 'conflict', 'karabakhi', 'armenian', 'live', 'homeland', 'years', 'armenia', 'given', 'azeri', 'stalin', 'directly', 'involve', 'conflict', 'defend', 'azeri', 'aggression', 'expect', 'azeri', 'friendly', 'force', 'fighting', 'within', 'border', 'insist', 'turkey', 'karabakh', 'crisis', 'repeat', 'cyprus', 'invasion', 'never', 'occur', 'playing', 'would', 'turkey', 'invade', 'throw', 'cyprus', 'buzzword', 'aro

['EMAIL', 'louis', 'gonzalez', 'subject', 'sphinx', 'satellite', 'image', 'processing', 'organization', 'universite', 'science', 'technologie', 'lille', 'france', 'line', 'posting', 'newsreader', 'version', 'sphinx', 'sphinx', 'friendly', 'state', 'image', 'processing', 'analysis', 'package', 'across', 'spectrum', 'performance', 'computer', 'platform', 'operate', 'window', 'system', 'create', 'daily', 'research', 'need', 'scientist', 'conducting', 'climate', 'investigation', 'using', 'satellite', 'remote', 'sensing', 'technique', 'intuitive', 'graphic', 'interface', 'sphinx', 'feature', 'interactive', 'interface', 'menu', 'point', 'click', 'dialog', 'box', 'make', 'image', 'processing', 'analysis', 'simple', 'accessible', 'menuing', 'enable', 'build', 'attractive', 'image', 'layout', 'quickly', 'provide', 'flexibility', 'return', 'conduct', 'image', 'analysis', 'processing', 'operations', 'image', 'format', 'compatibility', 'using', 'smart', 'write', 'function', 'sphinx', 'allow', 'eas

['EMAIL', 'sorenson', 'subject', 'nuclear', 'heavy', 'weapon', 'militia', 'organization', 'state', 'university', 'line', 'EMAIL', 'colorado', 'frank', 'crary', 'write', '170-some', 'line', 'making', 'progress', 'ironic', 'delete', 'line', 'emacs', 'continually', 'message', 'garbage', 'collecting', 'think', 'try', 'something', 'EMAIL', 'sorenson', 'write', 'thank', 'great', 'change', 'tire', 'debate', 'lurker', 'talk.politics.guns', 'kindler', 'gentle', 'newsgroup', 'would', 'thought', 'exactly', 'reason', 'restriction', 'possible', 'endangerment', 'others', 'poorly', 'maintain', 'rifle', 'dangerous', 'since', 'endanger', 'others', 'justification', 'restriction', 'remind', 'shrapnel', 'consider', 'reasonable', 'space', 'limitation', 'mistake', 'blowing', 'hazard', 'nearby', 'range', 'point', 'distance', 'level', 'threat', 'endangerment', 'others', 'restriction', 'possibility', 'hence', 'restriction', 'restriction', 'suffer', 'fundamental', 'restrict', 'base', 'instrument', 'rather', 'pl

['EMAIL', 'subject', 'biblical', 'backing', 'koresh', 'cite', 'enclose', 'organization', 'organization', 'line', 'posting', 'article', 'EMAIL', 'EMAIL', 'stephen', 'write', 'think', 'david', 'koresh', 'solid', 'structure', 'sound', 'biblical', 'backing', 'broadcast', 'think', 'anyone', 'really', 'care', 'solid', 'structure', 'sermon', 'death', 'responsible', 'concern', 'people', 'enclose', 'partial', 'source', 'cite', 'quote', 'exactly', 'christian', 'sermon', 'pretty', 'inspire', 'though', 'differ', 'conclusion', 'argument', 'present', 'back', 'take', 'still', 'ongoing', 'thought', 'ramble', 'koresh', 'definite', 'relationship', 'curious', 'first', 'revelation', 'entirety', 'psalms', 'revelation', 'demonstrate', 'major', 'point', 'writings', 'prophets', 'include', 'david', 'psalms', 'revelation', 'telling', 'story', 'understand', 'relate', 'largely', 'explain', 'charles', 'manson', 'revelation', 'pattern', 'wonder', 'koresh', 'like', 'beatles', 'sequence', 'following', 'key', 'koresh'

['EMAIL', 'thomas', 'schlatter', 'subject', 'window', 'organization', 'world', 'public', 'access', 'brookline', 'line', 'article', 'EMAIL', 'EMAIL', 'write', 'somebody', 'purchase', 'window', 'try', 'muck', 'around', 'setting', 'avail', 'wrong', 'problem', 'running', '386-enhanced', 'windows', 'take', 'memory']
['EMAIL', 'jason', 'subject', 'sensitivity', 'superstition', 'posting', 'reply', 'EMAIL', 'organization', 'communications', 'research', 'line', 'article', 'EMAIL', 'EMAIL', 'write', 'puke', 'throw', 'kid', 'pull', 'muscle', 'tongue', 'heaves', 'everything', 'allergic', 'anything', 'funny', 'thing', 'personaly', 'story', 'reaction', 'greatly', 'heart', 'speed', 'flush', 'claim', 'heart', 'skip', 'beats', 'react', 'headache', 'stomach', 'watery', 'running', 'itchy', 'rash', 'serious', 'accusation', 'include', 'respiration', 'difficulty', 'brain', 'damage', 'vomiting', 'guess', 'become', 'number', 'suspect', 'problem', 'might', 'poisoning', 'hear', 'things', 'think', 'jason']
['EMA

['EMAIL', 'edward', 'fischer', 'subject', 'sandberg', 'note', 'indian', 'series', 'organization', 'cornell', 'ithaca', '14853', 'distribution', 'line', 'article', 'EMAIL', 'EMAIL', 'roger', 'lustig', 'write', 'right', 'care', 'player', 'credit', 'player', 'help', 'score', 'care', 'amuse', 'seem', 'sdcns', 'realize', 'baseball', 'combining', 'effort', 'every', 'player', 'consider', 'night', 'bottom', '-clemens', 'pitch', 'strong', 'innings', 'allow', '-ryan', 'pitch', 'couple', 'shutout', 'innings', 'though', 'need', 'excellent', 'defensive', 'play', 'behind', '-quantrill', 'pitch', 'couple', 'innings', 'ahead', 'credit', 'score', 'bottom', 'inning', 'look', 'effort', 'quantrill', 'credit', 'offense', '-dawson', 'vaughn', 'think', 'early', 'without', 'either', 'would', '-quintana', 'solid', 'single', '-zupcic', 'pinch', 'quintana', 'provide', 'speed', 'first', 'third', '-cooper', 'rip', 'second', 'single', 'inning', '-melvin', 'avoid', 'getting', 'something', '-scrub', 'richardson', 'do

['EMAIL', 'tharp', 'subject', 'happy', 'easter', 'organization', 'tektronix', 'colorado', 'system', 'englewood', 'line', 'article', 'EMAIL', 'EMAIL', 'jonathan', 'quist', 'write', 'roll', 'royce', 'own', 'british', 'would', 'civilization', 'aston', 'martin', 'jaguar', 'general', 'motor', 'lotus', 'vauxhall', 'rover', 'own', 'honda', '-----------------------------------------------------------------------------', 'tharp', 'EMAIL', 'cds.tek.com', 'indian', 'scout', 'indian', 'chief', 'ahrma', 'brother', 'chief.|', 'r90s(#151', 'tr-2b(#751', '524737', 'build', 'rock', 'r50/2/velorex', 'knock', 'compulsive', 'harleys', 'employer', 'joiner', 'grief', '-----------------------------------------------------------------------------']
['EMAIL', 'peter', 'tryndoch', 'subject', 'advice', 'need', 'line', 'allmartin', 'emdedmm', 'advice', 'need', 'EMAIL', 'martin', 'organization', 'boeing', 'currely', 'market', 'recently', 'kelvin', 'anyone', 'brand', 'extremely', 'happy', 'small', 'brand', 'compare

['EMAIL', 'subject', 'note', 'indian', 'series', 'line', 'organization', 'university', 'distribution', 'article', 'EMAIL', 'EMAIL', 'steven', 'goldman', 'write', 'start', 'probably', 'alomar', 'great', 'player', 'help', 'recognition', 'likely', 'vote', 'alomar', 'differ', 'opinion', 'likely', 'vote', 'attend', 'game', 'balloting', 'great', 'number', 'attendee', 'ballot', 'soak', 'floor', 'stand', 'toronto', 'vote', 'unfortunately', 'thing', 'attend', 'game', 'season', 'happen', 'every', 'apathetic', 'attitude', 'ballot', 'really', 'offend']
['EMAIL', 'thomas', 'subject', 'andromeda', 'strain', 'summary', 'organization', 'telectronics', 'pacing', 'system', 'line', 'article', 'EMAIL', 'EMAIL', 'hartung', 'write', 'opportunity', 'watch', 'flick', 'years', 'since', 'channel', 'something', 'interest', 'technology', 'demonstrate', 'handling', 'infectious', 'disease', 'similar', 'toxic', 'substance', 'clearly', 'fake', 'computer', 'robotic', 'technology', 'certainly', 'science', 'fiction', 'a

['subject', 'rickey', 'rickey', 'whine', 'EMAIL', 'brian', 'smith', 'expire', '04:00:00', 'distribution', 'organization', 'university', 'kentucky', 'science', 'line', 'article', 'EMAIL', 'EMAIL', 'david', 'write', 'followup', 'thought', 'original', 'article', 'specific', 'comment', 'method', 'EMAIL', 'oswalt', 'obtain', 'record', 'computer', 'readable', 'every', 'major', 'league', 'baseball', 'several', 'years', 'devise', 'algorithm', 'situation', 'write', 'computer', 'program', 'calculate', 'every', 'major', 'league', 'player', 'contribution', 'using', 'works', 'every', 'situation', 'every', 'baseball', 'season', 'situation', 'determine', 'inning', 'score', 'baserunners', 'situation', 'count', 'times', 'eventually', 'situation', 'occur', 'divide', 'number', 'times', 'situation', 'value', 'situation', 'first', 'george', 'lindsey', 'early', 'report', 'article', 'investigation', 'strategy', 'baseball', 'author', 'george', 'lindsey', 'journal', 'operations', 'research', 'issue', 'volume',

['EMAIL', 'the_doge', 'subject', 'learn', 'wacko', 'keywords', 'prophet', 'profit', 'organization', 'public', 'access', 'SCREEN_NAME', 'denver', 'distribution', 'line', 'actually', 'important', 'things', 'glean', 'start', 'getting', 'desperate', 'answer', 'question', 'natural', 'pinkboys', 'anything', 'means', 'still', 'plenty', 'false', 'jesus', 'business', 'enterprising', 'subgenii', 'remember', 'separate', 'pink', 'green', 'automatic', 'weapon', 'mexico', 'swiss', 'account', 'smile', 'flunky', 'flower', 'airport', 'shrug', 'never', 'never', 'never', 'start', 'believe', 'bulldada', 'david', 'koresh', "hand't", 'start', 'swallow', 'apocalypso', 'working', 'crossword', 'puzzle', 'bahamas', 'today', 'instead', 'contribute', 'mulch', 'layer', 'start', 'shooting', 'likely', 'shoot', 'better', 'shot', 'short', 'barnum', 'right', 'stupidity', 'correct', 'endeth', 'lesson', 'the_doge', 'south', 'louis', 'dobbs', 'approve', 'medium', 'conspirator(tm', 'beyond', 'sunday', 'louis', 'community',

In [26]:
len(text_data)

11314

In [27]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [28]:
# converting to bag-of-words
corpus = [dictionary.doc2bow(text) for text in text_data]

In [29]:
len(corpus)

11314

In [30]:
# save the corpus and dictionary, we will use these in another video to visualize
import pickle
pickle.dump( corpus, open( "corpus.pkl", "wb" ) )
dictionary.save('dictionary.gensim')

In [32]:
import gensim
ldamodels = {}
for n_topics in [10,20,30]:
    ldamodels[n_topics] = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics,
                                               id2word= dictionary, passes = 15)
    ldamodels[n_topics].save('model'+str(n_topics)+'.gensim')
    
    print('Words for',n_topics,'topics')
    topics = ldamodels[n_topics].print_topics(num_words=4)
    for topic in topics:
        print(topic)

Words for 10 topics
(0, '0.011*"EMAIL" + 0.010*"christian" + 0.009*"people" + 0.009*"jesus"')
(1, '0.005*"organization" + 0.005*"subject" + 0.004*"line" + 0.004*"captain"')
(2, '0.073*"EMAIL" + 0.023*"subject" + 0.023*"line" + 0.023*"organization"')
(3, '0.007*"gordon" + 0.007*"ground" + 0.006*"banks" + 0.005*"EMAIL"')
(4, '0.012*"program" + 0.010*"EMAIL" + 0.008*"image" + 0.007*"system"')
(5, '0.011*"people" + 0.011*"would" + 0.008*"EMAIL" + 0.007*"right"')
(6, '0.008*"armenian" + 0.008*"government" + 0.007*"encryption" + 0.006*"people"')
(7, '0.058*"EMAIL" + 0.016*"line" + 0.015*"subject" + 0.015*"organization"')
(8, '0.039*"EMAIL" + 0.037*"max>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax" + 0.012*"player" + 0.009*"game"')
(9, '0.089*"EMAIL" + 0.018*"space" + 0.007*"launch" + 0.006*"orbit"')
Words for 20 topics
(0, '0.010*"EMAIL" + 0.009*"power" + 0.009*"would" + 0.008*"circuit"')
(1, '0.021*"space" + 0.008*"apple" + 0.007*"system" + 0.006*"EMAIL"')
(2, '0.01

In [46]:
#evaluation
yhat_train = []
for text in corpus:
    text_topics = ldamodels[20].get_document_topics(text)
    label = sorted(text_topics, key=lambda x: x[1], reverse=True)[0][0]
    yhat_train += [label]

In [54]:
from sklearn.metrics import adjusted_rand_score
train_ari = adjusted_rand_score(texts.target, np.array(yhat_train))
print(train_ari)

0.09632084166242705


In [56]:
yhat_train_10 = []
for text in corpus:
    text_topics = ldamodels[10].get_document_topics(text)
    label = sorted(text_topics, key=lambda x: x[1], reverse=True)[0][0]
    yhat_train_10 += [label]
print(adjusted_rand_score(texts.target, np.array(yhat_train_10)))

0.08896504732032838


In [57]:
yhat_train_30 = []
for text in corpus:
    text_topics = ldamodels[30].get_document_topics(text)
    label = sorted(text_topics, key=lambda x: x[1], reverse=True)[0][0]
    yhat_train_30 += [label]
print(adjusted_rand_score(texts.target, np.array(yhat_train_30)))

0.11065326361250165


test

In [58]:
texts_test = fetch_20newsgroups(subset='test')

In [66]:
text_data_test = []
for text in  texts_test.data:
    tokens = prepare_text_for_lda(text)
    if random.random() > .99:
        print(tokens)
    text_data_test.append(tokens)
corpus_test = [dictionary.doc2bow(text) for text in text_data_test]

['EMAIL', 'chuck', 'subject', 'daily', 'verse', 'line', 'dishonest', 'money', 'dwindle', 'gather', 'money', 'little', 'little', 'make', 'proverbs', '13:11']
['EMAIL', 'subject', 'adobe', 'photoshop', 'mailing', 'organization', 'texas', 'state', 'university', 'line', 'looking', "havn't", 'mailing', 'newsgroup', 'user', 'adobe', 'photoshop', 'assume', 'miss', 'ahead', 'enough', 'interest', 'start', 'mailing', 'and/or', 'newsgroup', 'might', 'interest', 'subscribe', 'thanks', '--bob', 'grateful', 'insert', 'usual', 'disclaimer', 'texas', 'state', 'commerce', 'texas', 'historic', 'image', 'processing', 'project', 'EMAIL', 'watch', 'address', 'change']
['EMAIL', 'subject', 'thanks', 'add', 'forward', 'space', 'digest', 'organization', 'international', 'space', 'university', 'original', 'sender', 'EMAIL', 'distribution', 'line', 'thank', 'everyone', 'anyone', 'information', 'project', '-----', '\\____|', 'report', 'request', 'keith', 'malinowski', 'EMAIL', 'stockton', 'state', 'college', 'po

['EMAIL', 'guido', 'klemans', 'subject', 'disk', 'protect', 'organization', 'eindhoven', 'university', 'technology', 'line', 'posting', 'article', 'EMAIL', 'EMAIL', 'sheng', 'kasey', 'chang', 'write', 'byteocide', 'believe', 'company', 'implement', 'special', 'patch', 'register', 'loudly', 'exclaim', 'bootup', 'register', 'xxxxx', 'address', 'state', 'disable', 'benefit', 'encourage', 'registration', 'pirate', 'make', 'traceable', 'patch', 'registration', 'message', 'encrypt', 'quite', 'change', 'therefore', 'provide', 'stable', 'trail', 'allow', 'hands', 'violate', 'license', 'agreement', 'copy', 'different', 'registration', 'compare', 'locate', 'usually', 'keep', 'register', 'company', 'public', 'domain', 'yellow', 'brick', 'tinseltown', 'something', '--kasey', 'chang', '-------------------------------------------------------------------------------', 'guido', 'klemans', 'internet', 'EMAIL', 'valid', '-------------------------------------------------------------------------------', '

['EMAIL', 'colorado', 'allen', 'koberg', 'subject', 'christianity', 'crisis', 'hanegraaff', 'organization', 'university', 'colorado', 'boulder', 'line', 'article', 'EMAIL', 'EMAIL', 'frank', 'decenso', 'write', 'anyone', 'important', 'feelings', 'though', 'intend', 'judging', 'promos', 'constantly', 'radio', 'sound', 'macarthur', 'charismatic', 'chaos', 'series', 'talks', 'things', 'health', 'wealth', 'prosperity', 'thing', 'prominent', 'religion', 'every', 'crouch', 'showing', 'building', 'talking', 'unhealthy']
['EMAIL', 'subject', 'chromium', 'dietary', 'suppliment', 'weight', 'organization', 'college', 'osteopathic', 'medicine', 'line', 'posting', 'article', 'EMAIL', 'EMAIL', 'henry', 'melton', 'write', 'request', 'sage', 'usenet', 'know', 'chromium', 'weight', 'control', 'suppliments', 'multiple', 'product', 'advertising', 'would', 'information', 'first', 'impulse', 'metal', 'henry', 'melton', 'chromium', 'things', 'chest', 'blast', 'kidney', 'stone', 'post', 'kidney', 'stone', 'p

['EMAIL', 'central', 'bernard', 'subject', 'denver', 'yank', 'assault', 'organization', 'microsystems', 'line', 'distribution', 'world', 'reply', 'EMAIL', 'central', 'posting', 'article', 'EMAIL', 'EMAIL', 'entropic', 'destroyer', 'write', 'denver', 'suppose', 'voice', 'suppose', 'rocky', 'mountain', 'empire', 'following', 'firearm', 'supply', 'classified', 'heading', 'friday', 'april', 'opinion', 'found', 'wisdom', 'person', 'speak', 'walter', '303)820', 'notice', 'denver', 'longer', 'knowingly', 'accept', 'advertise-', 'assault', 'weap-', 'denver', 'find', 'assault', 'weapon', 'pose', 'threat', 'health', 'safety', 'security', 'reader', 'think', '--dan', 'spooksmoke', 'revolution', 'assasination', 'thorium', 'cobalt-60', 'clintin', 'EMAIL', 'liberty', 'death', 'EMAIL', 'something', '-----begin', 'public', 'block-----', 'version', 'mqcnaitfksqaaaeeakceejwi9f5kmjykp0logc5dghrpbmy2xhoo8kpehmdyuf8a', '1bfdqsj53kostz6hroshsdzlvul1/40vpjmmntfr+vyz4jvd3rl4iuq2ummmex3', 'itf3ult8xn', 'qabsvhc

['EMAIL', 'mason', 'subject', 'comic', 'organization', 'university', 'florida', 'gainesville', 'line', 'posting', 'selling', 'following', 'comic_strip', 'list', 'guide', 'price', 'overstreet', 'comic_strip', 'values', 'monthly', 'wizard', 'could', 'overstreet', 'price', 'single', 'item', 'often', 'extra', 'copy', 'item', 'comic_strip', 'unless', 'otherwise', 'note', 'title', 'guide', 'price', '-------------------------------------------------------------------------------', 'adventure', 'comic_strip', 'unless', 'note', '130.00', '100.00', 'animal', '46.00', '30.00', 'animal', 'animal', 'reprint', 'wonder', 'woman', 'intro', 'animal', 'avenger', '21.00', 'avenger', 'avenger', 'annual', 'police', '22.50', 'police', 'green', 'arrow', '35.50', '15.00', 'green', 'arrow', 'infinity', 'gauntlet', '17.00', 'infinity', 'gauntlet', 'legion', 'super', 'hero', 'legion', 'super', 'hero', 'legion', 'super', 'hero', 'l.e.g.i.o.n.', '22.00', '15.00', 'omega', '23.00', '15.00', 'omega', 'omega', 'annua

['EMAIL', 'idler', 'subject', 'mormon', 'belief', 'bastard', 'organization', 'university', 'victoria', 'line', 'EMAIL', 'write', 'could', 'anyone', 'enlighten', 'mormon', 'church', 'view', 'child', 'wedlock', 'particular', 'interest', 'stigma', 'attach', 'child', 'oppose', 'parent', 'child', 'without', 'however', 'saint', 'would', 'pregnancy', 'outside', 'marriage', 'occasion', 'mourning', 'church', 'member', 'would', 'judgmental', 'problem', 'situation', 'welfare', 'assistance', 'provide', 'church', 'bishop', 'usually', 'require', 'family', 'making', 'effort', 'gospel', 'standard', 'provide', 'however', 'occasions', 'assistance', 'provide', 'child', 'former', 'bishop', 'child', 'always', 'worthy', 'especially', 'learn', 'prohibition', 'mormon', 'faith', 'bastard', 'entering', 'heaven', 'names', 'enter', 'genealogical', 'mormons', 'bastard', 'context', 'latter', 'saint', 'believe', 'temple', 'ordinance', 'family', 'preserve', 'eternity', 'genealogical', 'material', 'software', 'produce

['EMAIL', 'edward', 'fischer', 'subject', 'morris', 'organization', 'cornell', 'ithaca', '14853', 'line', 'article', 'EMAIL', 'EMAIL', 'roger', 'maynard', 'write', 'player', 'better', 'another', 'solutely', 'player', 'would', 'play', 'better', 'player', 'lineup', 'sheer', 'speculation', 'impossible', 'ascertain', 'discipline', 'certainty', 'necessary', 'state', 'something', 'baseball', 'therefore', 'clemens', 'better', 'morris', 'larkin', 'better', 'griffin', 'ascertain', 'prove', 'require', 'since', 'obviously', 'threads', 'meaningless', 'simply', '-valentine', 'going', 'cordial', 'roger', 'maynard', 'complete', 'total', 'dickhead', 'insist', 'details']
['EMAIL', 'michael', 'covington', 'subject', 'allergic', 'reaction', 'laser', 'printer', 'posting', 'organization', 'program', 'university', 'georgia', 'athens', 'line', 'laser', 'printer', 'often', 'ozone', 'smell', 'clorox', 'adequate', 'ventilation', 'recommend', 'michael', 'covington', 'associate', 'research', 'scientist', 'artific

['EMAIL', 'glauert', 'subject', 'challenge', 'microsoft', 'supporter', 'posting', 'organization', 'olivetti', 'research', 'cambridge', 'england', 'line', 'article', 'EMAIL', 'EMAIL', 'bollacker', 'write', 'glauert', 'EMAIL', 'write', 'words', 'opinion', 'msw3.1', 'inferior', 'competition', 'msw3.1', 'objectively', 'inferior', 'technically', 'inferior', 'list', 'starting', 'define', 'criterion', 'criterion', 'inferior', 'afraid', 'actually', 'prove', 'show', 'aspect', 'msw3.1', 'technically', 'inferior', 'system', 'earlier', 'try', 'trivially', 'aspect', 'msw3.1', 'actually', 'technically', 'superior', 'system', 'resource', 'requirement', 'performace', 'device', 'support', 'printer', 'support', 'study', 'learn', 'windows', 'might', 'agree', 'balance', 'msw3.1', 'technically', 'inferior', 'competition', 'still', 'prove', 'point', 'another', 'perhaps', 'incorrect', 'range', 'available', 'application', 'application', 'hardware', 'multi', 'platform', 'support', 'quality', 'sales', 'service'

['EMAIL', 'gerry', 'subject', 'christianity', 'repeat', 'life', 'line', 'article', 'EMAIL', 'EMAIL', 'write', 'gerry', 'write', 'nothing', 'christianity', 'preclude', 'repeat', 'life', 'earth', 'somewhere', 'appoint', 'judgement', 'concordance', 'memory', 'appear', 'somewhere', 'bible', 'given', 'fairly', 'specific', 'context', 'judgement', 'decide', 'issue', '--clh', 'indeed', 'immediate', 'context', 'otherwise', 'would', 'need', 'suffer', 'often', 'since', 'foundation', 'world', 'consummation', 'manifest', 'sacrifice', 'inasmuch', 'appoint', 'come', 'judgement', 'christ', 'offer', 'shall', 'appear', 'second', 'eagerly', 'await', 'first', 'point', 'verse', 'larger', 'context', 'subject', 'destiny', 'individual', 'human', 'rather', 'singular', 'nature', 'christ', 'sacrifice', 'fulfillment', 'fall', 'mankind', 'rudolf', 'frieling', 'elaborate', 'detail', 'christianity', 'reincarnation', 'thrust', 'passage', 'context', 'liken', 'incarnation', 'sacrifice', 'christ', 'mankind', 'individual

['EMAIL', 'ilyess', 'bdira', 'subject', 'saudia', 'control', 'despite', 'buying', 'posting', 'organization', 'concordia', 'university', 'montreal', 'canada', 'line', 'EMAIL', 'write', 'impotant', 'update', 'line', 'overhead', 'copyright', 'despite', 'muslim', 'around', 'world', 'things', 'shaping', 'killing', 'might', 'happen', 'future', 'though', 'bosnia', 'right', 'saying', 'martillo', 'stage', 'think', 'things', 'pessimistic', 'realistic', 'stage', 'impose', 'people', 'government', 'remote', 'control', 'fight', 'oppress', 'people', 'confident', 'venture', 'order', 'sudan', 'yemen', 'alliance', 'algeria', 'libya', 'years', 'tunisia', 'later', 'egypt', 'direct', 'colonization', 'prevent', 'islamic', 'government', 'might', 'egypt', 'attack', 'sudan', 'envolved', 'atrocity', 'fail', 'achieve', 'victory', 'meanwhile', 'saudia', 'face', 'civil', 'yemen', 'sudan', 'north', 'african', 'alliance', 'algeria', 'libya', 'morocco', 'attack', 'france', 'involve', 'everywhere', 'concentrate', 'pla

['EMAIL', 'anders', 'bjoernestad', 'subject', 'universal', 'phone', 'reply', 'EMAIL', 'anders', 'bjoernestad', 'organization', 'telematics', 'norwegian', 'institute', 'technology', 'line', 'article', 'EMAIL', 'EMAIL', 'joakim', 'gunnarsson', 'write', 'things', 'watch', 'germany', 'think', 'hold', 'sweden', 'connection', 'handle', 'dial', 'phone', 'pulse', 'dial', 'sweden', 'first', 'digit', 'digit', 'push', 'position', 'make', 'dial', 'process', 'convert', 'numbers', 'interest', 'think', 'connection', 'norway', 'handle', 'touchtone', 'dial', 'touchtone', 'norway', 'interest', 'system', 'different', 'phone', 'depend', 'country', 'region', 'system', 'country', 'another', 'system', 'different', 'phone', 'depend', 'country', 'almost', 'right', 'pulsdialing', 'phone', 'little', 'switch', 'inside', 'norway', 'renumber', 'numbers', 'phone', 'anders', 'bj{\\o}rnestad', 'division', 'computer', 'science', 'telematics', 'norwegian', 'technology', 'trondheim', 'norway', 'email', 'EMAIL', 'internet

In [61]:
print(len(texts_test.data))
print(len(corpus_test))

7532
7532


In [67]:
yhat_test = []
for text in corpus_test:
    text_topics = ldamodels[20].get_document_topics(text)
    label = sorted(text_topics, key=lambda x: x[1], reverse=True)[0][0]
    yhat_test += [label]
print(adjusted_rand_score(texts_test.target, np.array(yhat_test)))

0.06479343221459004


visualisation

In [63]:
import pyLDAvis.gensim

In [64]:
# set sort_topics to False so it keeps the same topic ids as in Gensim
lda_display = pyLDAvis.gensim.prepare(ldamodels[20],corpus,dictionary,sort_topics=False)
pyLDAvis.display(lda_display)

In [68]:
# set sort_topics to False so it keeps the same topic ids as in Gensim
lda_display_test = pyLDAvis.gensim.prepare(ldamodels[20],corpus_test,dictionary,sort_topics=False)
pyLDAvis.display(lda_display_test)