# Topic clustering using LDA

### 1. Clean data and return tokens.

In [1]:
import spacy
from spacy.lang.en import English

# Only tokenization is needed, so a blank English pipeline is sufficient.
# The earlier `spacy.load('en')` call was removed: its return value was
# discarded, and the 'en' shortcut name no longer resolves in spaCy v3+,
# which made this cell fail on a fresh environment.
parser = English()

def tokenize(text):
    """Split ``text`` into a list of normalised tokens for LDA.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is replaced by its lower-cased form.
    """
    def _normalise(token):
        # URL detection takes precedence over the '@' check.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        _normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]


### Find the meanings of words, synonyms, and antonyms using WordNet (used here to reduce words to their base forms)

In [2]:
import nltk
# Fetch the WordNet corpus that the lemma helpers in this notebook rely on.
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to
[nltk_data]     /home/quicksilver/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``, or ``word`` itself.

    Delegates to ``wn.morphy``; when that returns ``None`` the input is
    handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` using a newly created ``WordNetLemmatizer``.

    ``lemmatize`` is called with only the word, so the lemmatizer's own
    defaults apply for every other argument.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


In [4]:
# Fetch NLTK's stop-word lists, then keep the English list as a set;
# prepare_text_for_lda tests tokens for membership in it.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/quicksilver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Prepare text for topic clustering

In [5]:
def prepare_text_for_lda(text, min_length=5):
    """Turn one piece of raw text into a list of cleaned tokens for LDA.

    Each token produced by ``tokenize(text)`` is kept only if it has at
    least ``min_length`` characters and is not in the ``en_stop`` stop-word
    set; every kept token is then replaced by ``get_lemma(token)``.

    The length and stop-word checks are made on the token *before*
    lemmatization, so a returned lemma may be shorter than ``min_length``
    (e.g. a five-letter plural whose lemma has four letters).

    Parameters
    ----------
    text
        Raw text, passed straight to ``tokenize``.
    min_length : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 5 reproduces the previously hard-coded ``len(token) > 4``.

    Returns
    -------
    list
        The lemmas of the surviving tokens, in their original order.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_length and token not in en_stop
    ]


For starters, we have a transcript that contains a discussion about product design.

In [6]:
import random

# Build the document collection: each line of the transcript becomes one
# document, represented by its list of cleaned tokens.
text_data = []
with open('UIUX.txt') as transcript:
    for raw_line in transcript:
        doc_tokens = prepare_text_for_lda(raw_line)
        print(doc_tokens)
        text_data.append(doc_tokens)

['00:00:00', 'richard', 'fulcher', 'richard', 'designer', 'google', 'senior', 'designer', 'right', 'working', 'years', 'thank', 'joining', 'pleasure', 'today', 'describe', 'experience', 'design', 'umbrella', 'several', 'relate', 'discipline', 'focus', 'design', 'experience']
['00:00:30', 'individual', 'person', 'achieve', 'different', 'type', 'experience', 'commonly', 'think', 'software', 'design', 'things', 'environment', 'design', 'maybe', 'design', 'physical', 'product', 'things', 'event', 'coordination', 'anything', 'experience', 'construct', 'going', 'first', 'going', 'concept', 'study']
['00:01:00', 'contact', 'environment', 'operate', 'order', 'design', 'tool', 'achieve', 'task', 'complete', 'goal', 'user', 'contact', 'task', 'goal', 'goal', 'anyone', 'going', 'product', 'service', 'designing', 'really', 'important', 'remember', 'really']
['00:01:30', 'range', 'condemn', 'designing', 'think', 'designing', 'super', 'idealize', 'reality', 'going', 'building', 'people', 'different'

* First, we create a dictionary from the data and then convert the tokenized documents to a bag-of-words corpus; the dictionary and corpus can be saved for future use.
* We are using Gensim for this particular task.

In [7]:
from gensim import corpora
# Build the gensim dictionary from the tokenized documents in text_data.
dictionary = corpora.Dictionary(text_data)


In [8]:
# Bag-of-words corpus: one doc2bow conversion per tokenized document.
corpus = list(map(dictionary.doc2bow, text_data))

### At present, we are limiting the number of topics to 5

In [9]:
import gensim

NUM_TOPICS = 5
# LDA training is randomized; fixing the seed makes the printed topics
# reproducible when the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=30,
    random_state=RANDOM_STATE,
)
ldamodel.save('model5.gensim')

In [10]:


# One line per topic, each limited to its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic_summary in topics:
    print(topic_summary)



(0, '0.069*"design" + 0.056*"experience" + 0.029*"going" + 0.029*"things"')
(1, '0.068*"different" + 0.047*"designing" + 0.025*"people" + 0.025*"context"')
(2, '0.047*"research" + 0.025*"working" + 0.025*"think" + 0.025*"user"')
(3, '0.061*"goal" + 0.032*"product" + 0.032*"going" + 0.032*"task"')
(4, '0.009*"going" + 0.009*"product" + 0.009*"things" + 0.009*"think"')


* We have been able to generate the top 5 topic mixtures from our collection of transcripts.
* We can safely say that all of these topics refer to aspects of product design in one way or another.


### Running the model after combining all transcripts

In [11]:
# Append the combined transcript's documents to the existing text_data,
# which is not reset here and so still holds the UIUX.txt documents.
# NOTE: re-running this cell appends the same documents a second time.
with open('bigtranscript.txt') as transcript:
    text_data.extend(prepare_text_for_lda(raw_line) for raw_line in transcript)

In [12]:
# Rebuild the dictionary so it also covers the newly appended documents.
dictionary = corpora.Dictionary(text_data)

In [13]:
# Re-encode every document as bag-of-words against the rebuilt dictionary.
corpus = list(map(dictionary.doc2bow, text_data))

In [14]:
NUM_TOPICS = 5
# LDA training is randomized; fixing the seed makes the printed topics
# reproducible when the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=30,
    random_state=RANDOM_STATE,
)
ldamodel.save('model0.gensim')

In [15]:
# One line per topic, each limited to its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic_summary in topics:
    print(topic_summary)

(0, '0.017*"think" + 0.011*"people" + 0.011*"things" + 0.010*"would"')
(1, '0.023*"think" + 0.021*"things" + 0.016*"really" + 0.013*"going"')
(2, '0.027*"really" + 0.020*"image" + 0.015*"people" + 0.012*"great"')
(3, '0.023*"design" + 0.015*"going" + 0.014*"think" + 0.014*"people"')
(4, '0.022*"think" + 0.018*"people" + 0.015*"really" + 0.012*"designer"')


#### This is what the summary of all topics present in the transcripts looks like. It is enough to give us a general idea of what these interviews tend to discuss.

### Further Work

* Try lda2vec and compare the results with LDA.
* Assign labels to topics generated by the unsupervised learning model.