### 1) Preprocess text using Python's Natural Language Toolkit
### 2) Build a topic model with Latent Dirichlet Allocation
### 3) Visualize intertopic distance

In [233]:
import pickle

from pprint import pprint
import pandas as pd
import string
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [462]:
def get_wordnet_pos(treebank_tag):
    """Translate a part-of-speech tag in the nltk default
       naming scheme into the constant understood by the
       WordNet lemmatizer.

       Tags starting with J, V, N or R map to adjective, verb,
       noun and adverb; every other tag falls back to noun."""

    # Checked in the same order as the tag prefixes are listed.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )

    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos

    return wordnet.NOUN

    
def preprocess_series_text(data):
    """Run the whole text-cleaning pipeline over a Pandas series of strings.

       Steps, applied in this order to every element: strip and lowercase,
       delete words that contain a digit, replace punctuation with spaces,
       tokenize, filter stop words, part-of-speech tag, and lemmatize.
       Returns the series with each element turned into a list of lemmas."""

    digit_word_re = re.compile(r"""\w*\d\w*""")
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    # nltk's English stop words plus corpus-specific filler tokens
    stop_words = set(stopwords.words('english'))
    stop_words.update(['ve', 'thank', 'don', 'just', 'did', 'lot', 'let', 'doesn', '–', '—', 'chris', 'anderson', 'yeah', 'ca', 'oh', 'mike', 'also', 'ok', 'percent'])

    lemmatizer = WordNetLemmatizer()

    def strip_digit_words(text):
        # lowercase first, then remove any word containing a digit
        return digit_word_re.sub('', text.strip().lower())

    def blank_punctuation(text):
        return punctuation_re.sub(' ', text)

    def drop_stop_words(tokens):
        return [token for token in tokens if token not in stop_words]

    def to_wordnet_tags(tagged_tokens):
        # nltk.pos_tag emits its default tags; the lemmatizer needs WordNet ones
        return [(word, get_wordnet_pos(tag)) for word, tag in tagged_tokens]

    def lemmatize_tokens(tagged_tokens):
        return [lemmatizer.lemmatize(word, pos) for word, pos in tagged_tokens]

    pipeline = (
        strip_digit_words,
        blank_punctuation,
        word_tokenize,
        drop_stop_words,
        nltk.pos_tag,
        to_wordnet_tags,
        lemmatize_tokens,
    )

    for step in pipeline:
        data = data.map(step)

    return data

In [463]:
# Load the pickled object stored in database_2455_1.pkl into db_raw.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only open pickle files that come from a trusted source.
with open('database_2455_1.pkl', 'rb') as picklefile:
    db_raw = pickle.load(picklefile)

In [525]:
# Wrap the unpickled data in a DataFrame for the steps that follow.
df = pd.DataFrame(db_raw)


In [465]:
def join_fragments(row):
    """Return the row's 'text' entry as a single space-separated string.

    row: a mapping or Series whose 'text' value is an iterable of string
         fragments.

    If 'text' is already a string it is returned unchanged. Without this
    guard, applying the function a second time (for example by re-running
    the cell that overwrites df['text']) would put a space between every
    character of the already-joined text.
    """
    fragments = row['text']
    if isinstance(fragments, str):
        return fragments
    return ' '.join(fragments)

In [466]:
# Overwrite the 'text' column with the joined string for each row (row-wise apply).
df['text'] = df.apply(join_fragments, axis=1)

In [467]:
# Treat empty strings as missing values, then drop every row that has any missing value.
df = df.replace('', np.nan).dropna()

In [3]:
# df.sort_values(by='views', ascending=False)

In [469]:
# Store the preprocessed (tokenized and lemmatized) form of each text in a new 'tokens' column.
df['tokens'] = preprocess_series_text(df.text)

In [471]:
import datetime as dt
# Replace each 'date' entry with a datetime: x[1] is passed as the year,
# x[0] as the month, and the day is fixed at 15.
# NOTE(review): presumably each entry is a (month, year) pair -- confirm against the data.
# NOTE(review): this cell overwrites the column it reads, so it cannot be re-run
# without rebuilding df first.
df['date'] = df['date'].apply(lambda x: dt.datetime(x[1], x[0], 15))
                                

In [2]:
# corpus = []
# for item in df.tokens:
#     corpus.append(' '.join(item)) #final corpus list
    
# corpus

In [474]:
# Build the corpus in this cell so it exists on a fresh kernel:
# one space-joined string of tokens per document, in row order.
corpus = [' '.join(tokens) for tokens in df['tokens']]
df['corpus'] = corpus

### Final dataframe with original text and tokenized corpus in last column

In [475]:
df.head()

Unnamed: 0,date,info,seconds,speaker,text,title,url,views,tokens,corpus
0,2016-11-15,Sleep researcher,633.0,Wendy Troxel,"It's six o'clock in the morning, pitch black o...",Why school should start later for teens,/talks/wendy_troxel_why_school_should_start_la...,106879.0,"[six, clock, morning, pitch, black, outside, y...",six clock morning pitch black outside year old...
1,2017-04-15,"Policy entrepreneur, climate expert, author",787.0,Ted Halstead,I have a two-year-old daughter named Naya who ...,A climate solution where all sides can win,/talks/ted_halstead_a_climate_solution_where_a...,115611.0,"[two, year, old, daughter, name, naya, mistake...",two year old daughter name naya mistake impres...
2,2016-11-15,Caregiver,969.0,Lucy Kalanithi,A few days after my husband Paul was diagnosed...,What makes life worth living in the face of death,/talks/lucy_kalanithi_what_makes_life_worth_li...,385291.0,"[day, husband, paul, diagnose, stage, iv, lung...",day husband paul diagnose stage iv lung cancer...
3,2017-04-15,AI expert,1055.0,Stuart Russell,This is Lee Sedol. Lee Sedol is one of the wor...,3 principles for creating safer AI,/talks/stuart_russell_how_ai_might_make_us_bet...,304731.0,"[lee, sedol, lee, sedol, one, world, great, go...",lee sedol lee sedol one world great go player ...
4,2017-04-15,"Actor, producer, activist",1071.0,Shah Rukh Khan,"Namaskar. I'm a movie star, I'm 51 years of ag...","Thoughts on humanity, fame and love",/talks/shah_rukh_khan_thoughts_on_humanity_fam...,2232142.0,"[namaskar, movie, star, year, age, use, botox,...",namaskar movie star year age use botox yet cle...


In [476]:
# with open('corpus_final.pkl', 'wb') as picklefile:
#     pickle.dump(df, picklefile)

In [342]:
# Bag-of-words counts over single words (ngram_range=(1,1)).
# max_df=0.65 and min_df=.01 are document-frequency proportions: terms found in
# more than 65% or in fewer than 1% of the documents are left out of the vocabulary.
# token_pattern only matches tokens made of two or more lowercase a-z letters.
count_vectorizer = CountVectorizer(ngram_range=(1,1), max_df=0.65, min_df=.01, 
                                   token_pattern="\\b[a-z][a-z]+\\b")

# `corpus` must be an iterable with one string per document.
X = count_vectorizer.fit_transform(corpus)

In [343]:
X.shape

(2300, 5533)

In [478]:
n_topics = 25
n_iter = 10

# Settings shared by both constructor spellings below; random_state pins
# the result so the topics are reproducible across runs.
lda_params = dict(max_iter=n_iter, random_state=0)

# scikit-learn renamed the `n_topics` keyword to `n_components` and later
# removed the old name. Try the current keyword first and fall back to the
# old one on releases that do not know it yet.
try:
    lda = LatentDirichletAllocation(n_components=n_topics, **lda_params)
except TypeError:
    lda = LatentDirichletAllocation(n_topics=n_topics, **lda_params)

lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=25, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [460]:
# from sklearn.externals import joblib
# joblib.dump(lda, 'lda_model_2_20_topics_65_df.pkl') 

['lda_model_2_20_topics_65_df.pkl']

In [479]:
def print_topic_top_words(model, cv, n_top_words=10):
    """Print the highest-weighted words of every topic in a fitted model.

    model: object exposing ``components_``, an iterable with one sequence of
           per-word weights for each topic.
    cv: the vectorizer that supplies the vocabulary, through
        ``get_feature_names_out()`` when it has that method and through
        ``get_feature_names()`` otherwise.
    n_top_words: number of words printed for each topic.

    Returns None; each topic is written as a 'Topic N:' line followed by a
    line of space-separated words, with topics numbered from 1.
    """
    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); support both so the function runs on either.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    for topic_num, topic_words in enumerate(model.components_, start=1):
        print('Topic {}:'.format(topic_num))

        # Sort (weight, word) pairs in descending order so the heaviest
        # words come first, then keep the leading n_top_words.
        topic_values = sorted(zip(topic_words, feature_names),
                              reverse=True)[:n_top_words]

        print(' '.join(word for _, word in topic_values))

#### Top 20 words in each of the 25 topics:

In [480]:
# Print the 20 highest-weighted words for every topic of the fitted LDA model.
n_top_words = 20
print_topic_top_words(lda, count_vectorizer, n_top_words)

Topic 1:
ocean fish sea water ice place big foot particle animal long deep area earth coral last meter high mile whale
Topic 2:
woman child men girl story family mother man love young father school old never home community ask boy parent meet
Topic 3:
woman child disease never black able problem name family doctor mother allow health story love human government body still create
Topic 4:
war country state political power american democracy government group medium vote become attack united religion china election muslim kill middle
Topic 5:
cord spinal soap opera loop nervous initiate stimulate coordinate ego descend starbucks obstacle lesson robot segment stimulation big walk fight
Topic 6:
black hour image data girl hole become woman social help mother job ask often piece white together record human week
Topic 7:
kid play school music learn student game video teacher sound teach child education ask great idea help hear share old
Topic 8:
child feel never hour minute month four help ev

### Visualize intertopic distance

In [348]:
# Build the pyLDAvis view of the fitted model, save it to disk, and show it inline.
# NOTE(review): these imports sit mid-notebook rather than in the top import cell.
# NOTE(review): newer pyLDAvis releases appear to ship this module as
# `pyLDAvis.lda_model` instead of `pyLDAvis.sklearn` -- confirm against the installed version.
import pyLDAvis, pyLDAvis.sklearn
from IPython.display import display

# Setup to run in Jupyter notebook
pyLDAvis.enable_notebook()

# Create the visualization
vis = pyLDAvis.sklearn.prepare(lda, X, count_vectorizer)

# Export as a standalone HTML web page
pyLDAvis.save_html(vis, 'lda_2_65_df.html')

# Let's view it!
display(vis)

### Convert the count-vectorized matrix into a table of the 100 most frequent words

In [428]:
X.toarray()

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 2, 0, 0]], dtype=int64)

In [448]:
# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); use whichever the installed vectorizer provides.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Dense document-term counts, transposed so each row is a word and each column a document.
df_cv = pd.DataFrame(X.toarray(), columns=feature_names).T

In [450]:
# Total occurrences of each word across all documents. Any existing 'total'
# column is left out of the sum so re-running this cell does not inflate the totals.
df_cv['total'] = df_cv.drop('total', axis=1, errors='ignore').sum(axis=1)

In [523]:
# Rank words by their total count, most frequent first.
df_total = df_cv['total'].sort_values(ascending=False)
df_total.head(100)  # the 100 most frequent words

human          4360
idea           4281
problem        4251
great          4138
big            4095
woman          3984
ask            3862
place          3818
story          3776
country        3735
create         3702
become         3670
percent        3666
today          3662
system         3650
child          3561
question       3561
build          3475
fact           3454
learn          3400
last           3204
end            3178
another        3177
turn           3165
never          3139
technology     3092
old            3078
help           3077
long           3032
love           3023
               ... 
four           2233
person         2217
may            2212
space          2192
experience     2192
grow           2188
family         2183
bring          2175
run            2158
whole          2144
away           2137
home           2131
power          2126
everything     2117
body           2112
light          2095
bad            2082
five           2058
cell           2046
