# Topic clustering using LDA

### 1. Clean data and return tokens.

In [1]:
import spacy
# NOTE(review): the value returned by this call is not assigned to anything,
# so the loaded pipeline is discarded — presumably the call only checks that
# the 'en' model is installed; confirm before removing.
spacy.load('en')
from spacy.lang.en import English
# Module-level English parser; tokenize() below calls it on each input text.
parser = English()

def tokenize(text):
    """Turn *text* into a list of normalised token strings.

    The text is run through the module-level ``parser``. Tokens that are
    only whitespace are dropped; URL-like tokens become the placeholder
    ``'URL'``; tokens starting with ``'@'`` become ``'SCREEN_NAME'``; every
    other token is emitted in lower case.
    """
    def _normalise(tok):
        # Placeholder substitution takes priority over plain lower-casing.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]


### Find the meanings of words, synonyms, and antonyms using WordNet

In [2]:
import nltk
# Fetch the WordNet corpus data; the lemma helpers in the next cell rely on it.
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to
[nltk_data]     /home/quicksilver/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the result of ``wn.morphy(word)``, or *word* itself when that is ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize *word* using a newly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


In [4]:
# Fetch NLTK's stop-word lists, then keep the English list as a set;
# prepare_text_for_lda() below tests tokens for membership in it.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/quicksilver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Prepare text for topic clustering

In [5]:
def prepare_text_for_lda(text):
    """Tokenize *text* and return the filtered, lemmatized tokens.

    A token is kept only if it is longer than four characters and is not in
    ``en_stop``; both checks are made on the token before it is passed to
    ``get_lemma``. The result preserves the original token order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]


In [45]:
# NOTE(review): `random` is not used in any of the visible cells.
import random
# Collects one token list per line of the transcript file.
text_data = []
# NOTE(review): no encoding is passed, so the platform default is used to
# decode the file — confirm this matches how the transcript was written.
with open('bigtranscript.txt') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        # Echo each line's tokens; this produces the long output under the cell.
        print(tokens)
        text_data.append(tokens)

['00:00:00', 'welcome', 'resolution', 'bobby', 'ghosh', 'office', 'jar', 'around', 'speaking', 'masters', 'design', 'industry', 'still', 'still', 'life', 'thank', 'joining', 'speaking', 'today', 'speaking', 'scott', 'belsky', 'scott', 'founder', 'behance', 'going', 'maximize', 'creative', 'output', 'constraint', 'boost', 'creativity', 'pattern', 'design', 'company']
['00:00:30', 'watch', 'episode', 'around', 'right', 'partner', 'message', 'going', 'thanks', 'squarespace', 'support', 'domain', 'website', 'online', 'store', 'squarespace', 'visit', 'enter', 'resolution', 'first', 'purchase', 'scott', 'thanks', 'joining', 'thanks']
['00:01:00', 'thing', 'design', 'clear', 'state', 'people', 'wednesday', 'think', 'things', 'become', 'clear', 'first', 'design', 'little', 'things', 'difference', 'think', 'little', 'things', 'override', 'corporation', 'sometimes', 'prioritize', 'small', 'team', 'little', 'things', 'whether', 'aesthetic']
['00:01:30', 'experience', 'product', 'whether', 'decisi

['00:00:00', 'welcome', 'resolution', 'bobby', 'gould', 'jar', 'around', 'going', 'masters', 'design', 'industry', 'going', 'learn', 'going', 'learn', 'company', 'world', 'approach', 'communicate', 'deploy', 'design', 'business', 'every', 'single', 'episode', 'experience', 'story', '00:00:30', 'discussion', 'going', 'stick', 'around', 'right', 'partner', 'message', 'thanks', 'squarespace', 'support', 'whether', 'domain', 'website', 'online', 'store', 'squarespace', 'visit', 'enter', 'resolution', 'first', 'purchase', 'thanks', 'joining', 'thing', 'clear', 'feel00:01:00', 'people', 'getting', 'better', 'would', 'still', 'design', 'steve', 'look', 'works', 'paraphrase', 'course', 'believe', 'little', 'think', 'design', 'ralph', 'thinking', 'solving', 'problem', 'design', 'simple', 'intentional', 'something', 'try', '00:01:30', 'problem', 'thinking', 'hopefully', 'maybe', 'collaborative', 'fashion', 'hear', 'fashion', 'often', 'times', 'rule', 'design', 'tool', 'really', 'something', 'alm

* First, we create a dictionary from the data, then convert the data to a bag-of-words corpus, and save the dictionary and corpus for future use.
* We use Gensim for this task.

In [46]:
from gensim import corpora
# Build the Gensim dictionary from the per-line token lists in text_data.
dictionary = corpora.Dictionary(text_data)


In [47]:
# Convert every token list in text_data to its bag-of-words form via the dictionary.
corpus = [dictionary.doc2bow(text) for text in text_data]

### At present, we are limiting the number of topics in the mixture to 5

In [49]:
import gensim
# Number of topics the LDA model is asked to produce.
NUM_TOPICS = 5
# Train the LDA model on the bag-of-words corpus, using the dictionary as the
# id-to-word mapping and making 30 passes.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=30)
# Write the trained model to 'model21.gensim' so it can be reused later.
ldamodel.save('model21.gensim')

In [50]:


# Request four words per topic and print each entry; as the output below shows,
# an entry is a (topic index, weighted-word string) pair.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)



(0, '0.012*"going" + 0.011*"right" + 0.010*"something" + 0.010*"middle"')
(1, '0.024*"design" + 0.020*"think" + 0.018*"people" + 0.017*"going"')
(2, '0.011*"designer" + 0.011*"creative" + 0.010*"would" + 0.010*"interface"')
(3, '0.019*"think" + 0.019*"things" + 0.012*"product" + 0.012*"company"')
(4, '0.017*"designer" + 0.016*"think" + 0.016*"design" + 0.013*"people"')
