# Inferring Topics from IMDB Reviews

In [23]:
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import matplotlib.pyplot as plt

## Exploring the Dataset: [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)

In [24]:
# Directory holding the positive training reviews of the Large Movie Review Dataset.
# Every backslash is escaped explicitly: the previous literal relied on the
# unrecognised escape sequences "\p" and "\d", which Python keeps verbatim but
# reports with a warning (and documents as a future syntax error).
# The resulting path string is unchanged.
ROOT = 'C:\\technology\\pythonlearning\\semantic_processing\\Topic Modelling\\data\\aclImdb\\train\\pos\\'

  ROOT = 'C:\\technology\pythonlearning\\semantic_processing\\Topic Modelling\data\\aclImdb\\train\\pos\\'


In [25]:
# Read every review file under ROOT into memory, one string per file.
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted to keep positional lookups such as reviews[i] (and the row
# numbers of anything derived from them) stable across runs and machines.
reviews = []
for file in sorted(os.listdir(ROOT)):
    path = os.path.join(ROOT, file)
    if os.path.isfile(path):  # ignore anything that is not a regular file
        with open(path, 'r', encoding='utf-8') as fin:
            reviews.append(fin.read())

In [26]:
# Number of review texts collected in `reviews`.
len(reviews)

12500

In [27]:
# Preview the first three reviews, each followed by a 150-character rule.
separator = '=' * 150
for idx in range(3):
    print(reviews[idx], separator, sep='\n')

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!
Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to

## Feature Extraction

In [28]:
# Turn each review into a TF-IDF weighted term vector, dropping English stop words.
vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(reviews)  # sparse document-term matrix: one row per review

# Inspect the document-term matrix without densifying it: X.toarray() would
# allocate a full (n_reviews x n_terms) float array that is almost entirely
# zeros. A sparse-backed DataFrame has the same shape and column labels while
# storing only the non-zero weights.
pd.DataFrame.sparse.from_spmatrix(X, columns=vect.get_feature_names_out())

Unnamed: 0,00,000,000s,003830,006,007,0079,0080,0083,0093638,...,élan,émigré,émigrés,était,état,étc,êxtase,ís,østbye,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF Decomposition

In [29]:
N_TOPICS = 15
RANDOM_STATE = 42  # fixed seed so the factorisation is repeatable between runs

# Factorise the TF-IDF matrix into non-negative factors, X ~ W @ H.
# The seed matters because topics are later labelled by their number
# (topic_mapping): if the factorisation changes between runs, those labels can
# silently end up on the wrong topics.
# NOTE(review): after seeding, re-check topic_mapping against the top words per topic.
nmf = NMF(n_components=N_TOPICS, random_state=RANDOM_STATE)
W = nmf.fit_transform(X)  # Document-topic matrix
H = nmf.components_       # Topic-term matrix



In [30]:
# Top words per topic

N_TOP_WORDS = 10  # how many of the highest-weighted terms to list for each topic

words = np.array(vect.get_feature_names_out())

# For every topic (row of H): term indices ordered by descending weight,
# truncated to the first N_TOP_WORDS. Doing this for all rows at once avoids
# pre-filling a placeholder frame and overwriting it row by row.
top_ix = np.argsort(H, axis=1)[:, ::-1][:, :N_TOP_WORDS]

topic_words = pd.DataFrame(words[top_ix],
                           index=[f'Topic {i + 1}' for i in range(N_TOPICS)],
                           columns=[f'Word {i + 1}' for i in range(N_TOP_WORDS)])

topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,ll,spoilers,end,simply,yes,plot,just,spoiler
Topic 2,movie,movies,watch,recommend,saw,10,definitely,enjoyed,watching,makes
Topic 3,film,films,director,scenes,characters,cinema,plot,festival,work,art
Topic 4,series,episode,episodes,season,tv,characters,trek,seasons,shows,television
Topic 5,man,role,character,performance,plays,does,john,played,best,scene
Topic 6,good,action,pretty,bad,story,acting,really,plot,scenes,nice
Topic 7,war,world,documentary,people,american,history,soldiers,men,women,hitler
Topic 8,funny,comedy,laugh,hilarious,fun,eddie,jokes,humor,funniest,murphy
Topic 9,like,think,just,really,don,people,know,say,didn,watch
Topic 10,family,kids,old,children,disney,years,little,kid,child,time


In [31]:
# Hand-assigned labels for a subset of the topics, keyed by topic name.
# Topics that do not appear here have no label.

topic_mapping = dict([
    ('Topic 4', 'TV'),
    ('Topic 7', 'War'),
    ('Topic 8', 'Comedy'),
    ('Topic 12', 'Book Adaptation'),
    ('Topic 13', 'Horror'),
    ('Topic 15', 'Martial Arts / Action'),
])

In [32]:
# Recall the document-topic matrix, W
# This cell rebinds W from the raw NMF output to a labelled DataFrame with one
# column per topic; the name is kept because later cells may rely on it.

topic_columns = [f'Topic {i + 1}' for i in range(N_TOPICS)]
W = pd.DataFrame(W, columns=topic_columns)

# Strongest topic per row, computed for the whole frame at once instead of a
# Python-level lambda per row. Mapping through .get leaves None for topics
# that have no entry in topic_mapping.
W['max_topic'] = W[topic_columns].idxmax(axis=1).map(topic_mapping.get)

# Show the first rows whose strongest topic has a label.
W[W['max_topic'].notna()].head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
18,0.032773,0.001603,0.029962,0.000498,0.011455,0.014085,0.0,0.0,0.008262,0.0,0.003557,0.045433,0.006759,0.0,0.0,Book Adaptation
23,0.0,0.0,0.0,0.0,0.018628,0.002257,0.0,0.0,0.00116,0.0,0.00536,0.0,0.042922,0.001772,0.0,Horror
24,0.026136,0.003008,0.00861,0.000755,0.007506,0.0,0.0,0.000435,0.0,0.032899,0.005128,0.0,0.064689,0.0,0.000194,Horror
33,0.024801,0.0,0.020627,0.00289,0.002435,0.002199,0.0,1.6e-05,0.011949,0.004041,0.0,0.00756,0.064298,0.001011,0.004255,Horror
44,0.000274,0.0,0.0,0.0,0.02054,0.003393,0.013007,0.0,0.002863,0.022494,0.021253,0.0,0.037005,0.0,0.001563,Horror
55,0.037779,0.003526,0.030206,0.014999,0.025738,0.01758,0.003009,0.0,0.002606,0.007403,0.007357,0.0,0.070059,0.002533,0.0,Horror
66,0.034312,0.014863,0.000253,2.4e-05,0.002483,0.013258,0.003768,0.000841,0.003278,0.0,0.0,0.061405,0.0,0.0,0.006574,Book Adaptation
70,6.6e-05,0.0,0.013509,0.0,0.008197,0.001264,0.005441,0.0,0.0,0.0,0.015628,0.001632,0.0,0.017974,0.022355,Martial Arts / Action
72,0.047969,0.006551,0.026579,0.0,0.0,0.0,0.06875,0.0,0.013993,0.017714,0.004155,0.0,0.0,0.0,0.001773,War
77,0.014719,0.0,0.035619,0.101538,0.0,0.00845,0.0,0.0,0.00685,0.0,0.0,0.08391,0.0,0.0,0.0,TV


In [33]:
# Display the raw text of the review stored at position 58 of `reviews`.
reviews[58]

'It\'s heart-warming to see a movie that doesn\'t bash males. In this one the wife/mother leaves her family to "get in touch" with herself - or pursue her libido. The father stays with and nurtures the kids, letting neither his work nor his love life interfere with his love of and responsibility to them.'