# Topic Modeling

In [65]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD, NMF

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pickle

In [66]:
class SKTopics(BaseEstimator, TransformerMixin):
    def __init__(self, estimator = 'LDA', n_topics=20):
        self.n_topics = n_topics
        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            self.estimator = LatentDirichletAllocation(n_components=self.n_topics)
        self.model = make_pipeline(CountVectorizer(stop_words='english', max_features=200), 
                                   LatentDirichletAllocation(n_components=self.n_topics))
    
    def fit_transform(self, documents):
        self.model.fit_transform(documents)
        return self.model
    
    def get_topics(self, n = 25):
        vectorizer = self.model.named_steps['countvectorizer']
        model = self.model.steps[-1][1]
        names = vectorizer.get_feature_names()
        topics = dict()
        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n-1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens
        return topics

In [67]:
pwd

'/Users/alexander.fioto/personal_github/Seinfeld-Chatbot/iter-3-GPT2'

In [68]:
with open('../data/episode_dialogues.pkl', 'rb') as f:
    episode_dialogues = pickle.load(f)

In [69]:
dialogue_list = [values for key, values in episode_dialogues.items()]
X = dialogue_list

In [74]:
sk = SKTopics(estimator = 'LDA', n_topics=20)

In [75]:
sk.fit_transform(X_summarized)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=200, stop_words='english')),
                ('latentdirichletallocation',
                 LatentDirichletAllocation(n_components=20))])

In [76]:
sk.get_topics(n=10)

{0: ['yeah', 'll', 'like', 'mad', 'bucks', 'just', 'tell', 'oh'],
 1: ['know', 'hey', 'yeah', 'oh', 'don', 'right', 'like', 'jerry'],
 2: ['know', 'dont', 'youre', 'im', 'yeah', 'great', 'world', 'guy'],
 3: ['don', 'yeah', 'jerry', 'know', 'tell', 'guy', 'come', 'george'],
 4: ['ha', 'right', 'job', 'big', 'course', 'yeah', 'got', 'maybe'],
 5: ['don', 'hey', 'just', 'got', 'guy', 'know', 'yeah', 'want'],
 6: ['yeah', 'just', 'know', 'uh', 'gonna', 'hey', 'jerry', 'big'],
 7: ['just', 'think', 'good', 'know', 'right', 'uh', 'mean', 'dead'],
 8: ['like', 'want', 'know', 'mean', 'thank', 'peterman', 'did', 've'],
 9: ['oh', 'yeah', 'right', 'hey', 'jerry', 'like', 'hello', 'apartment'],
 10: ['know', 'got', 'yeah', 'don', 'oh', 'really', 'tomorrow', 'george'],
 11: ['like', 'yeah', 'hear', 'wait', 'come', 'little', 'hey', 'hes'],
 12: ['don', 'll', 'think', 'grace', 'hey', 'know', 'just', 'yeah'],
 13: ['got', 'right', 'know', 'baby', 'yeah', 'oh', 'uh', 'huh'],
 14: ['yeah', 'oh', 'kno

In [21]:
from gensim.summarization import summarize, keywords, mz_keywords

In [32]:
sample = X[0]

In [38]:
example_summary = summarize(sample, ratio=.05)

In [81]:
X_summarized = [summarize(x, ratio=.05) for x in X]

In [82]:
X_summarized[0]

'No one has any interest in seeing you on caffeine.How come youre not doin the second show tomorrow?Well, theres this uh, woman might be comin in.Wait a second, wait a second, what coming in, what woman is coming in?I told you about Laura, the girl I met in Michigan?No, you didnt!I thought I told you about it, yes, she teaches political science?\nYou wanna know why she called you?Yes!Youre a back-up, youre a second-line, a just-in-case, a B-plan, a contingency!Oh, I get it, this is about the button.Claire, Claire, youre a woman, right?What gave it away, George?Uhm...Id like to ask you...ask you to analyze a hypothetical phone call, you know, from a female point of view.\nNow, a woman calls me, all right?Uh huh.She says she has to come to New York on business...Oh you are beautiful!...and, and maybe shell see me when she gets there, does this woman intend to spend time with me?Id have to say, uuhh, no.NO.)To be polite.To be polite.\nI feel so cramped...And you didnt even hear how she so

In [79]:
print(keywords(X_summarized[0], ratio=.2))

thinking
think
yeah
thinks youre
world
didnt
birthday
ready


In [56]:
example_summary = example_summary.replace('\n', ' ')
type(sample)

str

In [61]:
mz_keywords(, scores=True,weighted=False, threshold=1.0)

[]

In [36]:
print(summarize(sample, ratio=.05))

No one has any interest in seeing you on caffeine.How come youre not doin the second show tomorrow?Well, theres this uh, woman might be comin in.Wait a second, wait a second, what coming in, what woman is coming in?I told you about Laura, the girl I met in Michigan?No, you didnt!I thought I told you about it, yes, she teaches political science?
You wanna know why she called you?Yes!Youre a back-up, youre a second-line, a just-in-case, a B-plan, a contingency!Oh, I get it, this is about the button.Claire, Claire, youre a woman, right?What gave it away, George?Uhm...Id like to ask you...ask you to analyze a hypothetical phone call, you know, from a female point of view.
Now, a woman calls me, all right?Uh huh.She says she has to come to New York on business...Oh you are beautiful!...and, and maybe shell see me when she gets there, does this woman intend to spend time with me?Id have to say, uuhh, no.NO.)To be polite.To be polite.
I feel so cramped...And you didnt even hear how she sounde