# Topic Modeling

## Attempt 1 - All Text

In [2]:
# Read document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,ab,abandoned,abbey,ability,able,aboutany,aboutim,abscess,absolute,absolutely,...,zacks,zane,ziggy,zip,zoo,ﬁesta,ﬁne,ﬁreworks,ﬂoor,ﬂy
Trainspotting,0,0,0,0,0,0,0,1,0,2,...,0,0,1,0,0,0,0,0,0,0
The_Wrestler,0,0,0,2,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Whiplash,0,0,0,0,3,0,0,0,2,1,...,0,1,0,0,0,0,0,0,0,0
Coco,0,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,1,1,1,1,1
Rocky,0,0,0,0,0,0,0,0,0,6,...,1,0,0,0,3,0,0,0,0,0
Oldboy,1,0,1,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Import LDA with gensim
from gensim import matutils, models
import scipy.sparse

In [4]:
# Transpose matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,Trainspotting,The_Wrestler,Whiplash,Coco,Rocky,Oldboy
ab,0,0,0,0,0,1
abandoned,0,0,0,1,0,0
abbey,0,0,0,0,0,1
ability,0,2,0,0,0,0
able,0,1,3,0,0,3


In [5]:
# Term-document matrix into a gensim format; df -> sparse matrix -> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [6]:
# Dictionary with all term and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [7]:
# Specify number of topics and number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.014*"yeah" + 0.008*"gonna" + 0.007*"lm" + 0.007*"fucking" + 0.006*"wanna" + 0.006*"look" + 0.006*"say" + 0.005*"hes" + 0.005*"did" + 0.005*"fuck"'),
 (1,
  '0.011*"yeah" + 0.011*"fucking" + 0.010*"oh" + 0.009*"ram" + 0.008*"want" + 0.006*"fuck" + 0.006*"going" + 0.005*"family" + 0.005*"miguel" + 0.005*"la"'),
 (2,
  '0.011*"daesu" + 0.008*"oh" + 0.006*"ill" + 0.006*"years" + 0.006*"did" + 0.006*"really" + 0.005*"want" + 0.004*"let" + 0.004*"lee" + 0.004*"mido"')]

In [8]:
# Specify number of topics and number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=10)
lda.print_topics()

[(0,
  '0.024*"yeah" + 0.016*"ram" + 0.013*"oh" + 0.011*"fuck" + 0.010*"fucking" + 0.009*"gonna" + 0.008*"want" + 0.008*"really" + 0.007*"uh" + 0.006*"ill"'),
 (1,
  '0.013*"lm" + 0.011*"yeah" + 0.009*"rocky" + 0.009*"aint" + 0.008*"fight" + 0.008*"wanna" + 0.007*"look" + 0.007*"hes" + 0.007*"yo" + 0.006*"say"'),
 (2,
  '0.001*"yeah" + 0.000*"oh" + 0.000*"want" + 0.000*"fucking" + 0.000*"did" + 0.000*"look" + 0.000*"say" + 0.000*"gonna" + 0.000*"lm" + 0.000*"hes"'),
 (3,
  '0.001*"yeah" + 0.000*"fucking" + 0.000*"want" + 0.000*"oh" + 0.000*"going" + 0.000*"look" + 0.000*"really" + 0.000*"fuck" + 0.000*"make" + 0.000*"need"'),
 (4,
  '0.011*"fucking" + 0.007*"going" + 0.007*"oh" + 0.007*"want" + 0.006*"family" + 0.006*"did" + 0.005*"miguel" + 0.005*"la" + 0.005*"really" + 0.004*"ill"')]

## Attempt 2 - Noun Only

In [9]:
# Let's creat a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Giben a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return ' '.join(all_nouns)

In [10]:
# Read in the cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

In [11]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
Trainspotting,career family fucking television choose machin...
The_Wrestler,peoples ram robinson ram haymakers pile driver...
Whiplash,stay name sir year year players ask i answer m...
Coco,something happenedbefore i time family papa mu...
Rocky,youre sucker action youre fightin bum advice w...
Oldboy,i i story hell way fuck name someone elses hol...


In [12]:
# Create a new document_term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words
add_stop_words = ['ive', 'like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                 'youre', 'got', 'gonna', 'time', 'think', 'want', 'yeah', 'say', 'hi', 'hello', 'ha'
                 , 'ok', 'uhoh', 'okay', 'lets', 'hey', 'ram', 'jam', 'ramjam', 'yo', 'gon', 'oh', 'mr']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names_out())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,abbey,ability,aboutany,abscess,academy,accident,accidenthow,account,act,actin,...,yourshow,yousay,youstay,youthe,youve,zoo,ﬁesta,ﬁne,ﬁreworks,ﬂoor
Trainspotting,0,0,0,1,1,1,0,1,1,0,...,0,0,0,0,10,0,0,0,0,0
The_Wrestler,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
Whiplash,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Coco,0,0,1,0,0,0,1,0,1,0,...,1,1,1,3,0,0,1,1,1,1
Rocky,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,3,0,0,0,0
Oldboy,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [13]:
# Create gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [14]:
# Try different amount of topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.009*"fucking" + 0.009*"man" + 0.009*"years" + 0.009*"daesu" + 0.008*"way" + 0.006*"life" + 0.006*"cunt" + 0.005*"day" + 0.005*"thing" + 0.005*"hit"'),
 (1,
  '0.001*"man" + 0.001*"family" + 0.001*"fuck" + 0.001*"hes" + 0.001*"home" + 0.001*"music" + 0.001*"way" + 0.001*"years" + 0.001*"look" + 0.000*"world"'),
 (2,
  '0.022*"man" + 0.010*"fuck" + 0.008*"look" + 0.007*"hes" + 0.006*"lot" + 0.005*"guys" + 0.005*"thanks" + 0.005*"whats" + 0.005*"night" + 0.005*"things"'),
 (3,
  '0.024*"family" + 0.014*"miguel" + 0.013*"music" + 0.009*"home" + 0.009*"héctor" + 0.008*"mama" + 0.008*"papa" + 0.008*"guitar" + 0.007*"dante" + 0.007*"photo"')]

## Attempt 3 - Nouns and Adjectives only

In [15]:
def nouns_adj(text):
    '''Giben a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)]
    return ' '.join(all_nouns)

In [16]:
# Apply the nouns_adjective function to the transcripts to filter only on nouns and adjectives
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
Trainspotting,career family fucking big television choose ma...
The_Wrestler,true american peoples ram robinson ram haymake...
Whiplash,stay name neiman sir year first year im player...
Coco,something happenedbefore i long time family pa...
Rocky,youre sucker action youre fightin bum advice w...
Oldboy,anniversary i i story hell way fuck name oh da...


In [17]:
# Recreate a document-term matrix with only nouns and adjectives also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names_out())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,ab,abbey,ability,able,aboutany,aboutim,abscess,absolute,abuelita,academy,...,youstop,youthe,youve,yoyo,ziggy,zoo,ﬁesta,ﬁne,ﬁreworks,ﬂoor
Trainspotting,0,0,0,0,0,0,1,0,0,1,...,0,0,11,0,1,0,0,0,0,0
The_Wrestler,0,0,2,1,0,0,0,0,0,0,...,0,0,7,0,0,0,0,0,0,0
Whiplash,0,0,0,3,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
Coco,0,0,0,0,1,1,0,0,5,0,...,1,3,0,0,0,0,1,1,1,1
Rocky,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,3,0,0,0,0
Oldboy,1,1,0,3,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [18]:
# Create gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [19]:
# Try different amount of topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=200)
ldana.print_topics()

[(0,
  '0.019*"man" + 0.016*"good" + 0.010*"uh" + 0.010*"fuck" + 0.007*"little" + 0.007*"randy" + 0.006*"leg" + 0.006*"ill" + 0.005*"big" + 0.005*"ah"'),
 (1,
  '0.011*"family" + 0.010*"good" + 0.008*"miguel" + 0.007*"music" + 0.006*"lm" + 0.006*"hes" + 0.005*"night" + 0.005*"papa" + 0.005*"mama" + 0.005*"home"'),
 (2,
  '0.011*"fucking" + 0.010*"good" + 0.008*"fuck" + 0.008*"man" + 0.006*"daesu" + 0.006*"sorry" + 0.006*"years" + 0.004*"way" + 0.004*"ill" + 0.004*"tommy"')]

In [20]:
# Which topics each episode contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a, b)] in corpus_transformed], data_dtmna.index))

[(2, 'Trainspotting'),
 (0, 'The_Wrestler'),
 (2, 'Whiplash'),
 (1, 'Coco'),
 (1, 'Rocky'),
 (2, 'Oldboy')]