# Topic Modeling

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD, NMF

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pickle

  return f(*args, **kwds)


In [16]:
class SKTopics(BaseEstimator, TransformerMixin):
    def __init__(self, estimator = 'LDA', n_topics=20):
        self.n_topics = n_topics
        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            self.estimator = LatentDirichletAllocation(n_components=self.n_topics)
        self.model = make_pipeline(CountVectorizer(stop_words='english', max_features=200), 
                                   LatentDirichletAllocation(n_components=self.n_topics))
    
    def fit_transform(self, documents):
        self.model.fit_transform(documents)
        return self.model
    
    def get_topics(self, n = 25):
        vectorizer = self.model.named_steps['countvectorizer']
        model = self.model.steps[-1][1]
        names = vectorizer.get_feature_names()
        topics = dict()
        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n-1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens
        return topics

In [17]:
pwd

'/Users/alexander.fioto/personal_github/Seinfeld-Chatbot/iter-3-GPT2'

In [18]:
with open('../data/episode_dialogues.pkl', 'rb') as f:
    episode_dialogues = pickle.load(f)

In [21]:
dialogue_list = [values for key, values in episode_dialogues.items()]
X = dialogue_list

In [34]:
sk = SKTopics(estimator = 'LDA', n_topics=20)

In [35]:
sk.fit_transform(X)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=200, stop_words='english')),
                ('latentdirichletallocation',
                 LatentDirichletAllocation(n_components=20))])

In [36]:
sk.get_topics(n=10)

{0: ['baby', 'man', 'oh', 'look', 'll', 'kramer', 'just', 'like'],
 1: ['yeah', 'know', 'oh', 'don', 'hey', 'gonna', 'like', 'just'],
 2: ['im', 'yeah', 'hey', 'know', 'ah', 'ill', 'oh', 'just'],
 3: ['yeah', 'know', 'oh', 'don', 'did', 'really', 'hey', 'like'],
 4: ['car', 'know', 'hey', 'jerry', 'don', 'gonna', 'alright', 'kramer'],
 5: ['funny', 'know', 'jerry', 'said', 'man', 'oh', 'yeah', 'don'],
 6: ['jerry', 'apartment', 'alright', 'oh', 'newman', 'hello', 'want', 'time'],
 7: ['looking', 'gave', 'hi', 'hey', 'help', 'hello', 'hell', 'heard'],
 8: ['ha', 'newman', 'oh', 'know', 'mr', 'big', 'just', 'yeah'],
 9: ['doctor', 'hey', 'going', 'oh', 'know', 'ya', 'don', 'good'],
 10: ['looking', 'gave', 'hi', 'hey', 'help', 'hello', 'hell', 'heard'],
 11: ['know', 'don', 'like', 'oh', 'll', 'going', 'got', 'right'],
 12: ['oh', 'yeah', 'don', 'hey', 'know', 'just', 'like', 'right'],
 13: ['oh', 'know', 'yeah', 'uh', 'alright', 'just', 'don', 'll'],
 14: ['dont', 'know', 'im', 'just', 