<a href="https://colab.research.google.com/github/Ashwin1999/NLP-project---LDA/blob/main/LDA_on_Wikipedia_Movie_Plot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/Ashwin1999/NLP-project---LDA/main/Datasets/Movie%20Plots.csv')

# The full dataset has roughly 34,890 rows, which makes fitting the LDA model
# slow, so only a random subset of (at most) 5000 rows is used.
# DataFrame.sample draws WITHOUT replacement, so no plot appears twice; it takes
# its bounds from the frame itself rather than a hard-coded row count (which
# could index out of range), and the fixed random_state makes the subset
# reproducible across kernel restarts.
# The last column is dropped, as before.
df = df.iloc[:, :-1].sample(n=min(5000, len(df)), random_state=42)
df.head()

Unnamed: 0,Title,Plot
21970,My Dog Vincent,The film is centered on O'Brien (played by Chu...
15260,The Last Sin Eater,The Last Sin Eater is a story that takes place...
29523,Naalai Namadhe,"Separated by Ranjith (M. N. Nambiar), a killer..."
8033,The Third Day,Steve Mallory has been involved in a car crash...
31446,Pa Paandi,"Power Paandi (Rajkiran), an ex-stuntmaster in ..."


In [None]:
# Count the missing values in each column of the sampled frame.
df.isnull().sum()

Title    0
Plot     0
dtype: int64

In [None]:
# Remove rows with missing values. Rebinding the result avoids in-place
# mutation of a frame that was itself derived by indexing, which pandas
# discourages; re-running the cell gives the same result.
df = df.dropna()
# Confirm that no missing values remain.
df.isna().sum()

Title    0
Plot     0
dtype: int64

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer for the plot texts.
cv = CountVectorizer(
    stop_words='english',  # use scikit-learn's built-in English stop-word list
    min_df=3,              # ignore terms that appear in fewer than 3 documents
    max_df=0.92,           # ignore terms that appear in more than 92% of documents
)

In [None]:
# Fit the vectorizer on the 'Plot' column and build the document-term matrix
# from it in a single step.
dtm = cv.fit_transform(df['Plot'])

In [None]:
# Show the matrix summary (shape, dtype and number of stored elements).
dtm

<5000x21503 sparse matrix of type '<class 'numpy.int64'>'
	with 636223 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Number of latent topics to extract.
number_of_topics = 6
# LDA fitting is stochastic; without a fixed random_state the discovered topics
# (and therefore every downstream prediction) change on each run.
LDA = LatentDirichletAllocation(n_components=number_of_topics, random_state=42)

In [None]:
# Fit the LDA model on the document-term matrix.
# Note: this takes roughly 2 minutes to run, so be patient.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=6, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [None]:
# Peek at one vocabulary entry to sanity-check the fitted vectorizer.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists and fall back on older releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
feature_names[2000]

'belated'

In [None]:
# Number of rows in the fitted topic-word matrix.
LDA.components_.shape[0]

6

In [None]:
# Shape of the fitted topic-word matrix.
LDA.components_.shape

(6, 21503)

In [None]:
def findTopic(i, topic):
    """Format one topic as a printable line listing its words.

    Parameters
    ----------
    i : int
        Topic number shown in the label.
    topic : iterable of int
        Indices into the fitted vectorizer's vocabulary, in the order the
        words should be listed.

    Returns
    -------
    str
        The label "Topic-<i>:", a tab, the list of words, then two newlines.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() when it exists and fall back on older releases.
    get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    # Fetch the vocabulary once instead of rebuilding it for every word.
    vocabulary = get_names()
    # str() keeps the printed list as plain quoted words even when the
    # vocabulary is returned as a NumPy array.
    topic_name = [str(vocabulary[t]) for t in topic]
    return f"Topic-{i}:\t{topic_name}\n\n"

# Print the 15 highest-weighted words of every topic, numbering topics from 1.
# enumerate replaces the manual counter, and separate names are used for the
# weight row and the selected indices instead of rebinding the loop variable.
for i, word_weights in enumerate(LDA.components_, start=1):
    # argsort is ascending, so the last 15 indices are the top 15 words,
    # listed from lowest to highest weight.
    top_word_indices = word_weights.argsort()[-15:]
    print(findTopic(i, top_word_indices))

Topic-1:	['father', 'away', 'jim', 'home', 'finds', 'gang', 'town', 'killed', 'later', 'new', 'tells', 'men', 'man', 'john', 'police']


Topic-2:	['comes', 'wife', 'tom', 'married', 'mother', 'day', 'marriage', 'house', 'gets', 'life', 'daughter', 'son', 'family', 'father', 'love']


Topic-3:	['help', 'later', 'son', 'away', 'death', 'night', 'finds', 'home', 'young', 'tells', 'house', 'mother', 'father', 'man', 'king']


Topic-4:	['woman', 'wife', 'tells', 'later', 'police', 'finds', 'home', 'love', 'house', 'man', 'time', 'paul', 'life', 'new', 'film']


Topic-5:	['goes', 'old', 'mother', 'later', 'school', 'life', 'money', 'day', 'family', 'house', 'time', 'home', 'father', 'tells', 'new']


Topic-6:	['death', 'group', 'kills', 'help', 'tells', 'dr', 'escape', 'time', 'jack', 'kill', 'new', 'later', 'police', 'killed', 'ship']




In [None]:
# Topic distribution for every document in the document-term matrix.
topic_results = LDA.transform(dtm)
# Show the shape of the result.
np.shape(topic_results)

(5000, 6)

In [None]:
# Placeholder labels "Genre-1" .. "Genre-N", one per topic. Deriving them from
# number_of_topics keeps the list in sync with the model; a hard-coded list of
# six labels would raise IndexError further down if the topic count grew.
topics_pred = [f"Genre-{k}" for k in range(1, number_of_topics + 1)]

In [None]:
# For each document, take the index of its highest-scoring topic.
topic_index = np.argmax(topic_results, axis=1)
# Preview the first ten assignments.
topic_index[:10]

array([3, 2, 1, 0, 1, 1, 5, 0, 0, 1])

In [None]:
# Map every topic index to its label in one vectorized step (NumPy fancy
# indexing) instead of a per-element Python loop; the result stays a string
# array even when there are no documents.
pred = np.asarray(topics_pred)[topic_index]
pred

array(['Genre-4', 'Genre-3', 'Genre-2', ..., 'Genre-1', 'Genre-2',
       'Genre-5'], dtype='<U7')

In [None]:
# Store the predicted label for each row as a new column.
df["Predicted Topics"] = pred

In [None]:
# Preview the first rows, now including the predicted topic label.
df.head()

Unnamed: 0,Title,Plot,Predicted Topics
21970,My Dog Vincent,The film is centered on O'Brien (played by Chu...,Genre-4
15260,The Last Sin Eater,The Last Sin Eater is a story that takes place...,Genre-3
29523,Naalai Namadhe,"Separated by Ranjith (M. N. Nambiar), a killer...",Genre-2
8033,The Third Day,Steve Mallory has been involved in a car crash...,Genre-1
31446,Pa Paandi,"Power Paandi (Rajkiran), an ex-stuntmaster in ...",Genre-2
