# Topic Modeling using Latent Dirichlet Allocation (LDA)

In [20]:
import re
import numpy as np
import pandas as pd
import gensim
import string
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

## Document corpus

In [30]:
# Toy corpus: five short raw-text documents, one string per document.
documents = [
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
]

## Data Preprocessing

In [25]:
def preprocessing(documents):
    """Tokenize, filter and lemmatize every document in a corpus.

    Each document is split on runs of word characters (``\\w+``) and
    lower-cased. A token is kept only if it is alphanumeric, longer than one
    character and not in ``stop_words``; kept tokens are then passed through
    ``lemma.lemmatize``.

    Relies on two module-level names that must exist before this function is
    called: ``stop_words`` (a container of words to drop) and ``lemma`` (an
    object exposing ``lemmatize(word)``).

    Args:
        documents: Iterable of raw document strings. It is NOT modified, so
            the raw corpus stays available and the call can be repeated.

    Returns:
        A new list holding one list of lemmatized tokens per input document,
        in the same order as ``documents``.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    cleaned_documents = []
    for document in documents:
        lowered = (token.lower() for token in tokenizer.tokenize(document))

        # isalnum() rejects tokens containing "_", which \w+ would let through.
        kept = [
            token for token in lowered
            if token.isalnum() and token not in stop_words and len(token) > 1
        ]

        cleaned_documents.append([lemma.lemmatize(token) for token in kept])

    return cleaned_documents

In [31]:
# Globals read by preprocessing(): the WordNet lemmatizer and the English
# stop-word set (a set, so membership tests are cheap).
lemma = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Module-level word tokenizer; preprocessing() builds its own instance.
tokenizer = RegexpTokenizer(r'\w+')

# Clean the raw corpus into one token list per document.
preprocessed_documents = preprocessing(documents)

In [32]:
preprocessed_documents

[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'],
 ['father',
  'spends',
  'lot',
  'time',
  'driving',
  'sister',
  'around',
  'dance',
  'practice'],
 ['doctor',
  'suggest',
  'driving',
  'may',
  'cause',
  'increased',
  'stress',
  'blood',
  'pressure'],
 ['sometimes',
  'feel',
  'pressure',
  'perform',
  'well',
  'school',
  'father',
  'never',
  'seems',
  'drive',
  'sister',
  'better'],
 ['health', 'expert', 'say', 'sugar', 'good', 'lifestyle']]

## Building the Document-Term Matrix

In [34]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
# The cleaned corpus is bound to `preprocessed_documents`; the previously used
# name `doc_clean` is never defined and raised NameError.
dictionary = corpora.Dictionary(preprocessed_documents)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
# Each document becomes a list of (term_id, count) pairs.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [35]:
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
 [(2, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(2, 1),
  (4, 1),
  (18, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(5, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]]