# Topic Modeling 
Using SPACY and scikit learn 

In [24]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np
from scipy.spatial import distance
#load the medium model as it has word2vec the small model doesn't 
nlp = spacy.load('en_core_web_md')

N = 100

RS = 42

In [11]:
file_path = 'Data/IMDB Dataset.csv'

df = pd.read_csv(file_path)

df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [12]:
# extract first 100 text entries
text = df['review'][0:N].values

In [31]:
# pass text into the statistical model 
Docs = nlp.pipe(text)


In [14]:
vocab = nlp.vocab

In [15]:
#generate vectors
Docs_vector = [doc.vector for doc in Docs]


AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'orth_'

In [30]:
Docs_vector2

<generator object Language.pipe at 0x000001EB08B37AE0>

In [16]:
np.shape(Docs_vector)

(100, 300)

# Function for vocab similarity

In [17]:

def vector_sim(vector,vocab):
    '''
    Outputs the word most similar to the input vector
    '''
    p = vector
    
    # Format the vocabulary for use in the distance function
    ids = [x for x in vocab.vectors.keys()]
    vectors = [vocab.vectors[x] for x in ids]
    vectors = np.array(vectors)
    
    # *** Find the closest word below ***
    closest_index = distance.cdist(p, vectors).argmin()
    word_id = ids[closest_index]
    output_word = vocab[word_id].text

    return output_word

In [22]:
vector = Docs_vector[0]
vector = np.reshape(vector,[1,300])
vector_sim(vector,vocab)

'presumably'

# Now apply LDA

In [23]:
from sklearn.decomposition import LatentDirichletAllocation

In [26]:
lda = LatentDirichletAllocation(n_components=4,random_state=RS)

In [44]:
# can't use Docs_vector straigh as it contains negative components -essentially I need to use a different embedding
max_Docs_vector = np.max(Docs_vector)
shape = np.shape(Docs_vector)
Docs_vector_pos = Docs_vector + np.ones(shape)*max_Docs_vector

In [43]:
lda_matrix = lda.fit_transform(Docs_vector_pos) 

In [50]:
topics_vectors = lda.components_
topics_vectors_shape = np.shape(topics_vectors)
topics_vectors = topics_vectors - np.ones(topics_vectors_shape)*max_Docs_vector

In [54]:
#find vector dims
VDim = np.shape(topics_vectors[0])[0]


300

In [57]:
topic = []
for I in range(len(topics_vectors)):
    v = topics_vectors[I]
    v = np.reshape(v,[1,VDim])
    topic.append(vector_sim(v,vocab))

topic

['of', 'of', 'of', 'an']

# Re do using NLTK + scikit-learn

In [59]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [61]:
# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range = (1,1),
                        tokenizer = tokenizer.tokenize)

# Fit and Transform the documents
train_data = tfidf.fit_transform(text)   



In [64]:
# Define the number of topics or components
num_components=5

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform SVD model on data
lda_matrix = model.fit_transform(train_data)

# Get Components 
lda_components=model.components_

In [78]:
# Print the topics with their terms
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    top_terms_comp = list(dict(top_terms_key).values())

    print("Topic %d:" % (index))
    [print((top_terms_list[i], top_terms_comp[i])) for i in range(len(top_terms_list))]
     


Topic 0:
('br', 2.7925339140475405)
('s', 1.6395355774599403)
('movie', 1.5981602832151338)
('like', 1.1901579014875656)
('t', 1.0560177948354081)
('war', 1.0398044651678267)
('film', 0.8875702634769103)
Topic 1:
('br', 1.5704411012680493)
('film', 1.1010562764207426)
('movie', 1.100690734614989)
('s', 0.9238731565134715)
('story', 0.7519701305496012)
('lot', 0.7361693950857513)
('goldie', 0.7026728215531821)
Topic 2:
('br', 3.3391667334271573)
('movie', 1.813446399631298)
('s', 1.5199195181955698)
('film', 1.4350005619134318)
('t', 1.2154273253728212)
('bad', 1.0669913622951264)
('just', 1.0623681502038043)
Topic 3:
('br', 3.4236165316395253)
('s', 1.6445600576228787)
('movie', 1.5064319345853383)
('film', 1.3885580840416798)
('funny', 0.9957584189514619)
('movies', 0.8921163220874821)
('b', 0.8723364944151497)
Topic 4:
('br', 1.877835505737501)
('movie', 1.178730068353227)
('s', 1.0856513278401425)
('jimmy', 0.9137528365362999)
('just', 0.8674513260737451)
('t', 0.8629769920970198)
(