# Topic Modeling

## Topic Modeling - Attempt #1 (All Text)

In [33]:
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop_ted.pkl')
data

Unnamed: 0,abdul,ability,able,abortion,abroad,absence,absolutely,abuse,abusive,academia,...,yearold,yes,york,youd,youll,young,youre,youth,youve,zero
andrew,0,0,3,0,1,0,0,0,1,0,...,0,2,1,0,2,0,7,0,2,0
brene,0,1,2,0,0,0,3,1,0,0,...,0,0,0,0,0,1,7,0,0,0
cameron,0,0,1,0,0,0,0,0,0,0,...,0,0,2,1,1,3,2,1,2,0
celeste,0,0,1,1,0,0,0,0,0,0,...,0,2,0,0,1,0,11,0,3,0
maze,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,8,0,0,0
robert,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,4,0,0,0
shaw,0,2,3,0,0,1,0,0,0,1,...,2,0,0,1,0,0,4,0,3,0
simon,0,1,3,0,0,0,0,0,0,0,...,0,1,2,0,0,0,4,0,0,1
thomas,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
tom,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
from gensim import matutils, models
import scipy.sparse

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [35]:
tdm = data.transpose()
tdm.head()

Unnamed: 0,andrew,brene,cameron,celeste,maze,robert,shaw,simon,thomas,tom
abdul,0,0,0,0,1,0,0,0,0,0
ability,0,1,0,0,0,0,2,1,0,1
able,3,2,1,1,0,0,3,3,1,0
abortion,0,0,0,1,0,0,0,0,0,0
abroad,1,0,0,0,0,0,0,0,0,0


In [36]:
# df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [37]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop_ted.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [38]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.009*"think" + 0.009*"depression" + 0.008*"life" + 0.008*"feel" + 0.007*"thats" + 0.006*"really" + 0.006*"work" + 0.006*"brain" + 0.006*"look" + 0.006*"talk"'),
 (1,
  '0.014*"dont" + 0.010*"believe" + 0.008*"talk" + 0.007*"youre" + 0.007*"right" + 0.007*"theyre" + 0.006*"buy" + 0.005*"conversation" + 0.005*"listen" + 0.005*"great"')]

In [39]:
# LDA for num_topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.012*"dont" + 0.009*"talk" + 0.007*"conversation" + 0.007*"app" + 0.007*"listen" + 0.006*"look" + 0.006*"theyre" + 0.006*"question" + 0.006*"id" + 0.006*"model"'),
 (1,
  '0.015*"depression" + 0.011*"life" + 0.010*"think" + 0.010*"feel" + 0.007*"relationship" + 0.007*"dont" + 0.006*"talk" + 0.006*"study" + 0.006*"youre" + 0.006*"good"'),
 (2,
  '0.011*"believe" + 0.010*"brain" + 0.009*"thats" + 0.007*"work" + 0.007*"dont" + 0.007*"think" + 0.006*"buy" + 0.005*"world" + 0.005*"talk" + 0.005*"theyre"')]

In [40]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.010*"vulnerability" + 0.009*"app" + 0.009*"thats" + 0.009*"think" + 0.008*"work" + 0.008*"really" + 0.008*"connection" + 0.007*"talk" + 0.007*"love" + 0.006*"research"'),
 (1,
  '0.012*"life" + 0.012*"dont" + 0.010*"relationship" + 0.010*"study" + 0.010*"talk" + 0.009*"conversation" + 0.008*"good" + 0.008*"listen" + 0.007*"really" + 0.007*"youre"'),
 (2,
  '0.018*"brain" + 0.010*"positive" + 0.009*"happiness" + 0.007*"school" + 0.007*"thats" + 0.007*"average" + 0.007*"sister" + 0.007*"success" + 0.006*"world" + 0.006*"change"'),
 (3,
  '0.012*"depression" + 0.011*"think" + 0.011*"dont" + 0.009*"feel" + 0.008*"believe" + 0.007*"right" + 0.006*"talk" + 0.006*"thats" + 0.006*"day" + 0.005*"look"')]

## Topic Modeling - Attempt #2 (Nouns Only)

In [41]:
# function to pull out nouns 
from nltk import word_tokenize, pos_tag

def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [42]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean_ted.pkl')
data_clean

Unnamed: 0,transcript
andrew,i felt a funeral in my brain and mourner to an...
brene,so ill start with this a couple year ago an ev...
cameron,hi my name be cameron russell and for the last...
celeste,all right i want to see a show of hand how man...
maze,hello doha hello salaam alaikum i love come to...
robert,what keep u healthy and happy a we go through ...
shaw,when i be seven year old and my sister be just...
simon,how do you explain when thing dont go a we ass...
thomas,ive always have a fascination for computer and...
tom,my name be tom and ive come here today to come...


In [43]:
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
andrew,i funeral brain mourner tread — till sense — s...
brene,start year event planner i speak event call fl...
cameron,hi name russell model year tension room i dres...
celeste,i show hand someone facebook something politic...
maze,hello doha hello salaam alaikum i place nation...
robert,life future self time energy survey millennial...
shaw,i year sister year top bunk bed year sister ti...
simon,thing dont others thing assumption example app...
thomas,fascination computer technology i apps iphone ...
tom,name come today i money mouth way exchange cas...


In [44]:
# document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add stop words 
add_stop_words = ['like', 'im', 'know', 'year', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'thing', 'say','way']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,abdul,ability,abortion,absence,abuse,academia,acceptance,access,accord,account,...,wuhahaha,yale,yarn,yeah,yes,york,youd,youll,youth,youve
andrew,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,1,0,0
brene,0,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
cameron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,2,1,1,1,2
celeste,0,0,1,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,3
maze,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
robert,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
shaw,0,2,0,1,0,1,0,0,0,1,...,0,2,0,0,0,0,0,0,0,3
simon,0,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,2,0,0,0,0
thomas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tom,0,1,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [45]:
#gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [46]:
# 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.027*"laughter" + 0.013*"life" + 0.011*"brain" + 0.008*"work" + 0.008*"relationship" + 0.007*"talk" + 0.007*"study" + 0.007*"conversation" + 0.007*"research" + 0.006*"school"'),
 (1,
  '0.023*"depression" + 0.011*"day" + 0.009*"laughter" + 0.007*"treatment" + 0.007*"fact" + 0.007*"brain" + 0.007*"life" + 0.006*"model" + 0.006*"world" + 0.005*"work"')]

In [47]:
# topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.028*"life" + 0.025*"laughter" + 0.022*"relationship" + 0.017*"study" + 0.011*"men" + 0.010*"friend" + 0.008*"family" + 0.006*"health" + 0.006*"guy" + 0.006*"east"'),
 (1,
  '0.020*"laughter" + 0.009*"conversation" + 0.009*"talk" + 0.009*"work" + 0.008*"vulnerability" + 0.008*"day" + 0.007*"story" + 0.007*"research" + 0.007*"theyre" + 0.007*"connection"'),
 (2,
  '0.025*"depression" + 0.017*"brain" + 0.015*"laughter" + 0.009*"day" + 0.008*"world" + 0.008*"school" + 0.008*"lot" + 0.008*"treatment" + 0.007*"experience" + 0.007*"happiness"')]

In [48]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.032*"laughter" + 0.026*"brain" + 0.012*"happiness" + 0.011*"school" + 0.010*"sister" + 0.010*"success" + 0.009*"world" + 0.007*"friend" + 0.007*"guy" + 0.007*"problem"'),
 (1,
  '0.020*"laughter" + 0.012*"conversation" + 0.010*"vulnerability" + 0.010*"work" + 0.009*"talk" + 0.009*"theyre" + 0.009*"connection" + 0.008*"research" + 0.007*"number" + 0.007*"world"'),
 (2,
  '0.033*"depression" + 0.014*"laughter" + 0.013*"day" + 0.010*"treatment" + 0.009*"model" + 0.009*"lot" + 0.008*"app" + 0.008*"life" + 0.008*"question" + 0.008*"experience"'),
 (3,
  '0.031*"life" + 0.026*"relationship" + 0.020*"study" + 0.013*"men" + 0.009*"laughter" + 0.008*"work" + 0.008*"health" + 0.008*"family" + 0.008*"loop" + 0.007*"id"')]

## Topic Modeling - Attempt #3 (Nouns and Adjectives)

In [49]:
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [50]:
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
andrew,i funeral brain mourner tread — tread — till s...
brene,ill start couple year event planner i speak ev...
cameron,hi name russell last little ive model year i u...
celeste,right i show hand many someone facebook someth...
maze,hello doha hello salaam alaikum i internationa...
robert,u healthy happy life future best self time ene...
shaw,i year old sister year old top bunk bed i year...
simon,thing dont others able thing assumption exampl...
thomas,ive fascination computer technology i few apps...
tom,name tom ive come today clean i money i mouth ...


In [51]:
# document-term matrix using only nouns and adjectives
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,abdul,ability,able,abortion,absence,abuse,abusive,academia,academic,acceptance,...,yeah,yearlong,yearold,yes,york,youd,youll,young,youth,youve
andrew,0,0,3,0,0,0,1,0,1,0,...,0,0,0,0,1,0,1,0,0,0
brene,0,1,2,0,0,1,0,0,2,0,...,0,1,0,0,0,0,0,1,0,0
cameron,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,2,1,1,3,1,2
celeste,0,0,1,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,3
maze,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
robert,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
shaw,0,2,3,0,1,0,0,1,0,0,...,0,0,2,0,0,1,0,0,0,3
simon,0,1,3,0,0,0,0,0,0,1,...,0,0,0,1,2,0,0,0,0,0
thomas,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
tom,0,1,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0


In [52]:
#gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

#vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [53]:
# 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.015*"depression" + 0.007*"talk" + 0.006*"life" + 0.006*"theyre" + 0.006*"day" + 0.006*"conversation" + 0.006*"vulnerability" + 0.005*"question" + 0.005*"research" + 0.005*"experience"'),
 (1,
  '0.012*"brain" + 0.009*"life" + 0.006*"relationship" + 0.006*"world" + 0.006*"study" + 0.005*"day" + 0.005*"school" + 0.005*"success" + 0.005*"positive" + 0.004*"app"')]

In [54]:
#3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.013*"app" + 0.010*"middle" + 0.009*"lot" + 0.008*"program" + 0.007*"east" + 0.007*"hello" + 0.006*"little" + 0.006*"ive" + 0.006*"guy" + 0.006*"sir"'),
 (1,
  '0.017*"brain" + 0.011*"life" + 0.009*"relationship" + 0.008*"study" + 0.008*"world" + 0.007*"success" + 0.006*"day" + 0.006*"positive" + 0.006*"happiness" + 0.005*"men"'),
 (2,
  '0.016*"depression" + 0.007*"talk" + 0.007*"life" + 0.006*"theyre" + 0.006*"day" + 0.006*"conversation" + 0.006*"vulnerability" + 0.005*"question" + 0.005*"experience" + 0.005*"little"')]

In [55]:
# 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.025*"depression" + 0.009*"vulnerability" + 0.008*"life" + 0.008*"treatment" + 0.007*"connection" + 0.007*"experience" + 0.006*"day" + 0.006*"research" + 0.005*"theyre" + 0.005*"story"'),
 (1,
  '0.020*"brain" + 0.011*"positive" + 0.010*"happiness" + 0.008*"sister" + 0.008*"school" + 0.008*"average" + 0.007*"middle" + 0.007*"success" + 0.007*"world" + 0.006*"negative"'),
 (2,
  '0.011*"loop" + 0.008*"great" + 0.008*"id" + 0.008*"applause" + 0.008*"lady" + 0.008*"music" + 0.008*"gentleman" + 0.007*"sound" + 0.007*"brisbane" + 0.007*"voice"'),
 (3,
  '0.012*"life" + 0.008*"relationship" + 0.008*"day" + 0.008*"theyre" + 0.007*"conversation" + 0.007*"study" + 0.006*"talk" + 0.006*"question" + 0.006*"app" + 0.006*"number"')]

## Identify Topics in Each Document

In [58]:
#final LDA
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=100)
ldana.print_topics()

[(0,
  '0.025*"depression" + 0.017*"life" + 0.012*"relationship" + 0.009*"study" + 0.008*"day" + 0.008*"treatment" + 0.006*"experience" + 0.005*"bad" + 0.005*"men" + 0.005*"medication"'),
 (1,
  '0.011*"vulnerability" + 0.008*"connection" + 0.007*"research" + 0.006*"middle" + 0.006*"shame" + 0.006*"lady" + 0.006*"story" + 0.006*"applause" + 0.006*"life" + 0.005*"theyre"'),
 (2,
  '0.013*"brain" + 0.007*"world" + 0.007*"day" + 0.006*"conversation" + 0.006*"school" + 0.006*"success" + 0.006*"question" + 0.006*"talk" + 0.006*"theyre" + 0.005*"positive"')]

four topics 
* Topic 0: life, relation
* Topic 1: research
* Topic 2: conversation

In [59]:
# topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(0, 'andrew'),
 (1, 'brene'),
 (2, 'cameron'),
 (2, 'celeste'),
 (1, 'maze'),
 (0, 'robert'),
 (2, 'shaw'),
 (2, 'simon'),
 (2, 'thomas'),
 (1, 'tom')]

  first pass of LDA
* Topic 0: life, relation [andrew,maze,robert]
* Topic 1: research [brene,simon]
* Topic 2: conversation [cameron,celeste,shaw,thomas,tom]