# Topic modelling using LDA

In [47]:
import gensim

In [50]:
# Raw input passage, kept as one string; it is split into sentences further down.
# NOTE: the name `corpus` is reused later for the bag-of-words list passed to LdaModel.
corpus = "In terms of unforgettable looks, and enduring desire from enthusiasts who may have grown up gluing together the AMT 3-in-1 model kit that it inspired, the 1940 models stand today as some of the most iconic, instantly recognizable automobiles that the Ford Motor Company ever produced. That year, Fords were produced in two series: Standard and Deluxe. The easiest way to tell them apart is to look for a cleaner one-piece grille on Standard models, while the Deluxe version has a three-piece grille assembly. Both cars also had slightly different pieces of hood trim. This 1940 Ford Standard Tudor sedan was a very popular model that year–around 151,000 of them were built and sold. This Standard has been under the same California ownership since 1994, after the seller bought it from an owner in Texas. The seller describes the car as being entirely original, though the age of the finish and status of any restoration or refresh are unknown."

In [51]:
from nltk import sent_tokenize

In [52]:
# Break the passage into sentences; each sentence is handled as one document downstream.
list_of_sentence = sent_tokenize(corpus)

In [53]:
list_of_sentence

['In terms of unforgettable looks, and enduring desire from enthusiasts who may have grown up gluing together the AMT 3-in-1 model kit that it inspired, the 1940 models stand today as some of the most iconic, instantly recognizable automobiles that the Ford Motor Company ever produced.',
 'That year, Fords were produced in two series: Standard and Deluxe.',
 'The easiest way to tell them apart is to look for a cleaner one-piece grille on Standard models, while the Deluxe version has a three-piece grille assembly.',
 'Both cars also had slightly different pieces of hood trim.',
 'This 1940 Ford Standard Tudor sedan was a very popular model that year–around 151,000 of them were built and sold.',
 'This Standard has been under the same California ownership since 1994, after the seller bought it from an owner in Texas.',
 'The seller describes the car as being entirely original, though the age of the finish and status of any restoration or refresh are unknown.']

In [54]:
# Tokenise every sentence with gensim's simple_preprocess, stripping accents
# (deacc=True) and dropping tokens shorter than 3 characters (min_len=3).
# A comprehension replaces the append loop and avoids leaving a stray loop
# variable behind in the notebook namespace.
list_of_simple_preprocess_data = [
    gensim.utils.simple_preprocess(sentence, deacc=True, min_len=3)
    for sentence in list_of_sentence
]

In [55]:
# Shorter alias for the tokenised sentences (same list object, not a copy).
texts = list_of_simple_preprocess_data

In [56]:
texts

[['terms',
  'unforgettable',
  'looks',
  'and',
  'enduring',
  'desire',
  'from',
  'enthusiasts',
  'who',
  'may',
  'have',
  'grown',
  'gluing',
  'together',
  'the',
  'amt',
  'model',
  'kit',
  'that',
  'inspired',
  'the',
  'models',
  'stand',
  'today',
  'some',
  'the',
  'most',
  'iconic',
  'instantly',
  'recognizable',
  'automobiles',
  'that',
  'the',
  'ford',
  'motor',
  'company',
  'ever',
  'produced'],
 ['that',
  'year',
  'fords',
  'were',
  'produced',
  'two',
  'series',
  'standard',
  'and',
  'deluxe'],
 ['the',
  'easiest',
  'way',
  'tell',
  'them',
  'apart',
  'look',
  'for',
  'cleaner',
  'one',
  'piece',
  'grille',
  'standard',
  'models',
  'while',
  'the',
  'deluxe',
  'version',
  'has',
  'three',
  'piece',
  'grille',
  'assembly'],
 ['both',
  'cars',
  'also',
  'had',
  'slightly',
  'different',
  'pieces',
  'hood',
  'trim'],
 ['this',
  'ford',
  'standard',
  'tudor',
  'sedan',
  'was',
  'very',
  'popular',
  

In [57]:
# Fit a gensim Phrases model on the tokenised sentences; process_texts applies it
# as bigram[line] to each document.
# NOTE(review): the train_texts output shown in this notebook contains no joined
# bigram tokens, so with default settings on seven short sentences the model may
# detect nothing - confirm whether min_count/threshold should be lowered.
bigram = gensim.models.Phrases(list_of_simple_preprocess_data) 

In [9]:
bigram

<gensim.models.phrases.Phrases at 0x7ffb13d92990>

In [58]:
from gensim.utils import lemmatize
from nltk.corpus import stopwords

In [59]:
# English stop words from NLTK, stored as a set because process_texts tests
# membership (`word not in stops`) for every token.
# NOTE(review): presumably the NLTK 'stopwords' corpus has already been downloaded - confirm.
stops = set(stopwords.words('english')) 

In [60]:
def process_texts(texts):
    """Clean tokenised documents for topic modelling.

    Steps applied to every document, in order:
      1. drop tokens that are in the module-level ``stops`` set,
      2. pass the document through the module-level ``bigram`` Phrases model,
      3. join the tokens and run ``lemmatize`` with ``allowed_tags`` set to the
         pattern ``(NN)`` and ``min_length=5``, then keep only the part of each
         result before the first ``/``.

    Args:
        texts: iterable of documents, each an iterable of string tokens.

    Returns:
        list[list[str]]: one list of lemmas per input document.
    """
    # Local import: the function no longer relies on some other cell having
    # imported `re` before it is called.
    import re

    # Compile the POS-tag filter once instead of once per document.
    noun_tags = re.compile('(NN)')

    texts = [[word for word in line if word not in stops] for line in texts]
    texts = [bigram[line] for line in texts]
    texts = [
        [
            # lemmatize yields bytes of the form b'lemma/TAG'; keep the lemma only.
            word.decode("utf-8").split('/')[0]
            for word in lemmatize(' '.join(line), allowed_tags=noun_tags, min_length=5)
        ]
        for line in texts
    ]
    return texts

In [61]:
import re
# Run the stop-word / bigram / lemmatisation pipeline over the tokenised sentences.
train_texts = process_texts(list_of_simple_preprocess_data)

In [62]:
from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

In [63]:
train_texts

[['desire',
  'enthusiast',
  'model',
  'model',
  'today',
  'iconic',
  'automobile',
  'motor',
  'company'],
 ['series', 'standard'],
 ['piece',
  'grille',
  'standard',
  'model',
  'version',
  'piece',
  'grille',
  'assembly'],
 ['piece'],
 ['standard', 'tudor', 'sedan', 'model'],
 ['standard', 'california', 'ownership', 'seller', 'owner', 'texas'],
 ['seller', 'though', 'status', 'restoration']]

In [67]:
# Build the token -> integer id mapping from the cleaned documents. This must
# actually run: with the line commented out, `dictionary` only exists if it was
# left over from an earlier kernel session, and a fresh Restart & Run All fails
# with NameError.
dictionary = Dictionary(train_texts)
# Convert each document to bag-of-words form, one doc2bow result per document.
# NOTE: this rebinds `corpus`, which held the raw text string until now.
corpus = [dictionary.doc2bow(text) for text in train_texts]

In [66]:
print(dictionary)

Dictionary(24 unique tokens: ['automobile', 'company', 'desire', 'enthusiast', 'iconic']...)


In [46]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1)], [(8, 1), (9, 1)], [(5, 1), (9, 1), (10, 1), (11, 2), (12, 2), (13, 1)], [(12, 1)], [(5, 1), (9, 1), (14, 1), (15, 1)], [(9, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(19, 1), (21, 1), (22, 1), (23, 1)]]


In [68]:
# Fit a 2-topic LDA model on the bag-of-words corpus.
# random_state pins the model's random number generator so the learned topics
# are reproducible across kernel restarts; without it each run can produce
# different topics and weights.
ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary, random_state=42)

In [69]:
ldamodel.show_topics()

[(0,
  '0.120*"model" + 0.094*"standard" + 0.086*"piece" + 0.070*"grille" + 0.042*"iconic" + 0.041*"version" + 0.041*"company" + 0.041*"assembly" + 0.041*"today" + 0.041*"automobile"'),
 (1,
  '0.098*"seller" + 0.074*"standard" + 0.061*"texas" + 0.061*"california" + 0.061*"ownership" + 0.060*"owner" + 0.058*"restoration" + 0.057*"though" + 0.056*"status" + 0.043*"piece"')]

In [70]:
import pyLDAvis.gensim
# Turn on pyLDAvis' notebook mode so prepared visualisations render in cell output.
pyLDAvis.enable_notebook()

In [71]:
# Prepare the topic visualisation from the fitted model, the bag-of-words corpus
# and the dictionary; as the cell's last expression, the result is displayed.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
