### Webscraping

In [None]:
# Make soup
import requests
from bs4 import BeautifulSoup
request = requests.get('https://films.criterionchannel.com/')
soup = BeautifulSoup(request.content, 'html.parser')

In [None]:
# Scrape titles, get rid of tabs and new lines
titles = []
for title in soup.findAll(class_ = "criterion-channel__td criterion-channel__td--title"):
    nt = title.get_text()
    no_t = nt.replace('\t', '')
    no_nt = no_t.replace('\n', '')
    titles.append(no_nt)
print(len(titles))

In [None]:
# Scrape urls
urls = []
for url in soup.findAll('a', href = True):
    urls.append(url.get('href'))
# Only keep urls that correspond to films
urls = urls[3:]
urls = urls[1:-21]
print(len(urls))

In [None]:
# Scrape directors
directors = []
for director in soup.findAll(class_ = 'criterion-channel__td criterion-channel__td--director'):
    nt = director.get_text()
    no_t = nt.replace('\t', '')
    no_nt = no_t.replace('\n', '')
    directors.append(no_nt)
print(len(directors))

In [None]:
# Scrape countries
countries = []
for country in soup.findAll(class_ = 'criterion-channel__td criterion-channel__td--country'):
    nt = country.get_text()
    no_t = nt.replace('\t', '')
    no_nt = no_t.replace('\n', '')
    no_comma = no_nt[:-1]
    countries.append(no_comma)
print(len(countries))

In [None]:
# Scrape years
years = []
for year in soup.findAll(class_ = 'criterion-channel__td criterion-channel__td--year'):
    nt = year.get_text()
    no_t = nt.replace('\t', '')
    no_nt = no_t.replace('\n', '')
    years.append(no_nt)
print(len(years))

In [None]:
# Create dataframe
import pandas as pd
data = pd.DataFrame({'Title': titles, 'Director': directors, 'Country': countries, 'Year': years, 'Url': urls})
# Remove rows without durations (parts > 1 of a film)
data = data[~data['Url'].str.contains('/videos/')]
# Remove two rows with urls that don't work
# ....
data = data.reset_index(drop = True)
print(len(data))

In [None]:
# # Check for broken links, do not run this, it takes a long time
# fourohfour = []
# for url in data['Url']:
#     # 200 = working, 404 = broken
#     fourohfour.append(requests.get(url))
#     print(url)
# print(len(fourohfour))
# # Save as text file (Excel often incorrectly reformats csv files upon opening)
# with open('data/Fourohfour.txt', 'w') as file:
#     for line in fourohfour:
#         file.write("%s\n" % line)
# print(len(fourohfour))

In [None]:
# Open pre-scraped 404 file
with open('data\Fourohfour.txt') as file:
    fourohfour = file.read().splitlines()
# Insert 404 column
data.insert(5, '404', fourohfour)
# Convert from BeautifulSoup type to string
data['404'] = data['404'].astype(str)
# Remove 404 rows from data
data = data[~data['404'].str.contains('404')]
print(len(data)) # Removed 52 broken links

In [None]:
# Reset index after filtering out rows
data = data.reset_index(drop = True)

In [None]:
data.head()

In [None]:
# # Scrape durations, do not run this, it takes a long time
# durations = []
# for url in data['Url']:
#     request = requests.get(url)
#     soup = BeautifulSoup(request.content, 'html.parser')
#     for duration in soup.findAll(class_ = 'duration-container')[:1]:
#         durations.append(duration.get_text())
#     print(url)
# # Save as text file
# with open('data/Durations.txt', 'w') as file:
#     for line in durations:
#         file.write("%s\n" % line)
# print(len(durations))

In [None]:
# Open pre-scraped duration file
with open('data\Durations.txt') as file:
    durations = file.read().splitlines()

In [None]:
# Clean durations
durations = durations[1:]
durations = durations[::3]
durations = [x.strip(' ') for x in durations]

In [None]:
# Insert duration column
try:
    data.insert(4, 'Duration', durations)
except:
    pass

In [None]:
# Remove seconds, keep only hours and minutes
data['Duration'] = data['Duration'].str[:-3]

In [None]:
# Append '0:' to beginning of duration to indicate 0 hours for all films < 1 hour
# that are not formatted consistently with the rest of the data
for i, duration in enumerate(data['Duration']):
    if ':' not in duration:
        data.loc[i, 'Duration'] = '0:' + duration

In [None]:
# Split duration by colon
hours_minutes = data['Duration'].str.split(':', expand = True)

In [None]:
# Insert hours and minutes columns
data.insert(5, 'Hours', hours_minutes[0])
data['Hours'] = data['Hours'].astype(int)
data.insert(6, 'Minutes', hours_minutes[1])
data['Minutes'] = data['Minutes'].astype(int)

In [None]:
# Calculate and insert total hours
total_hours = []
for i in range(len(data)):
    hours = (data.loc[i]['Hours'].astype(int) + data.loc[i]['Minutes'].astype(int)/60).round(2)
    total_hours.append(hours)
try:
    data.insert(7, 'Total Hours', total_hours)
except:
    pass
# Drop old columns
try:
    data = data.drop(['Minutes', 'Hours', '404'], axis = 1)
except:
    pass

In [None]:
# # Scrape descriptions, do not run this, it takes a long time
# descriptions = []
# for url in data['Url']:
#     request = requests.get(url)
#     soup = BeautifulSoup(request.content, 'html.parser')
#     paragraphs = soup.findAll('p')
#     # Select paragraph containing the description
#     paragraphs = paragraphs[1]
#     string = []
#     for x in paragraphs:
#         string.append(str(x))
#     descriptions.append(string[0])
#     print(url)
# # Save to csv (list is incorrectly loaded as text file)
# descriptions = pd.DataFrame({'Description': descriptions})
# descriptions.to_csv('data/Descriptions.csv', index = False)

In [None]:
# Open pre-scraped description file
descriptions = pd.read_csv('data\Descriptions.csv')

In [None]:
# Insert description column
data.insert(5, 'Description', descriptions)

In [None]:
# Remove films < 1 hour, as these are mostly shorts, not films
data = data[data['Total Hours'] > 1]

In [None]:
# Create decade column
import numpy as np
try:
    data.insert(4, 'Decade', (data['Year'].astype(int)/10).apply(np.floor))
except:
    pass
data['Decade'] = data['Decade'].astype(str)
data['Decade'] = data['Decade'].str.replace('.', '')
data['Decade'] = data['Decade'].astype(str) + 's'

In [None]:
# Replace NaN with 'None'
data = data.replace(np.nan, 'None', regex = True)

In [None]:
# Save to csv
data.to_csv('data\Criterion.csv', index = False)

### Merge with IMDB ratings dataset

In [1]:
# Read csv
import pandas as pd
data = pd.read_csv('data\Criterion.csv')

In [2]:
data['Year'] = data['Year'].astype(str)

In [3]:
data['titleYear'] = data['Title'] + ' ' + data['Year']

In [4]:
len(data)

1620

In [5]:
data.head()

Unnamed: 0,Title,Director,Country,Year,Decade,Duration,Description,Total Hours,Url,titleYear
0,2 or 3 Things I Know About Her,Jean-Luc Godard,France,1967,1960s,1:27,In 2 OR 3 THINGS I KNOW ABOUT HER (2 OU 3 CHOS...,1.45,https://www.criterionchannel.com/2-or-3-things...,2 or 3 Things I Know About Her 1967
1,3 Faces,Jafar Panahi,Iran,2018,2010s,1:40,Iranian master Jafar Panahi’s fourth feature s...,1.67,https://www.criterionchannel.com/3-faces,3 Faces 2018
2,"4 Months, 3 Weeks and 2 Days",Cristian Mungiu,Romania,2007,2000s,1:53,Romanian filmmaker Cristian Mungiu shot to int...,1.88,https://www.criterionchannel.com/4-months-3-we...,"4 Months, 3 Weeks and 2 Days 2007"
3,"The VI Olympic Winter Games, Oslo 1952",Tankred Ibsen,Norway,1952,1950s,1:43,Director Tancred Ibsen's penchant for depictin...,1.72,https://www.criterionchannel.com/the-vi-olympi...,"The VI Olympic Winter Games, Oslo 1952 1952"
4,8½,Federico Fellini,Italy,1963,1960s,2:19,"Marcello Mastroianni plays Guido Anselmi, a di...",2.32,https://www.criterionchannel.com/81-2,8½ 1963


In [6]:
ratings = pd.read_csv('title.ratings.tsv', sep = '\t')

In [7]:
basics = pd.read_csv('title.basics.tsv', sep = '\t', low_memory = False)

In [8]:
basics = basics.drop(['titleType', 'isAdult', 'endYear', 'runtimeMinutes', 'genres'], axis = 1)

In [9]:
imdb_merged = ratings.merge(basics, left_on = 'tconst', right_on = 'tconst')

In [10]:
imdb_merged['titleYear'] = imdb_merged['primaryTitle'] + ' ' + imdb_merged['startYear']

In [11]:
len(imdb_merged)

1134796

In [12]:
imdb_merged = imdb_merged.sort_values(['titleYear', 'numVotes'], ascending = False)

In [14]:
imdb_merged['isDuplicated'] = imdb_merged.duplicated('titleYear', keep = 'first')

In [15]:
# With duplicate titleYear, keep the one with the most numVotes
imdb_merged = imdb_merged[imdb_merged['isDuplicated'] == False]

In [16]:
len(imdb_merged)

1061173

In [17]:
imdb_merged.head()

Unnamed: 0,tconst,averageRating,numVotes,primaryTitle,originalTitle,startYear,titleYear,isDuplicated
596512,tt1381887,7.1,83,Солодкі мрії,Sweet Dreams,2008,Солодкі мрії 2008,False
735985,tt2234575,6.8,32,ö,ö,2012,ö 2012,False
577900,tt13273532,8.2,8,êmîcêtôcêt: Many Bloodlines,êmîcêtôcêt: Many Bloodlines,2020,êmîcêtôcêt: Many Bloodlines 2020,False
184529,tt0317414,5.6,15,él,él,2001,él 2001,False
809206,tt3207532,5.7,17,éX-Driver the Movie,éX-Driver the Movie,2002,éX-Driver the Movie 2002,False


In [18]:
imdb_merged[imdb_merged['titleYear'] == 'Weekend 2011']

Unnamed: 0,tconst,averageRating,numVotes,primaryTitle,originalTitle,startYear,titleYear,isDuplicated
660678,tt1714210,7.6,28314,Weekend,Weekend,2011,Weekend 2011,False


### TD-IDF String Matching
https://github.com/Bergvca/string_grouper

In [19]:
from string_grouper import match_strings, \
match_most_similar, group_similar_strings, \
compute_pairwise_similarities, StringGrouper

To do: Find a way to match on both primaryTitle and originalTitle

In [20]:
matches = match_most_similar(imdb_merged['titleYear'], data['titleYear'])

In [21]:
matches

Unnamed: 0,most_similar_index,most_similar_titleYear
0,38571.0,2 or 3 Things I Know About Her 1967
1,1079513.0,3 Faces 2018
2,454250.0,"4 Months, 3 Weeks and 2 Days 2007"
3,137372.0,"The VI Olympic Winter Games, Oslo 1952 1952"
4,35517.0,8½ 1963
...,...,...
1615,149722.0,Y Tu Mamá También 2001
1616,42766.0,Z 1969
1617,33489.0,Zazie dans le Métro 1960
1618,141852.0,Zero Focus 1961


In [22]:
nas = matches[matches['most_similar_index'].isna()]

In [23]:
nas = nas['most_similar_titleYear']

In [24]:
nas

10                                 21 Days 1940
15                    THE 47 RONIN: Part 1 1941
19                                  Abouna 2002
22                      An Actor’s Revenge 1963
24                 Adventures of a Dentist 1965
                         ...                   
1593               WORLD ON A WIRE: Part 1 1973
1598                               Xiao Wu 1997
1600                                Yeelen 1987
1611                         Youth in Fury 1960
1613    You Were Like a Wild Chrysanthemum 1955
Name: most_similar_titleYear, Length: 186, dtype: object

In [25]:
matches = matches.dropna(axis = 0)

In [26]:
len(matches)

1434

In [27]:
matches

Unnamed: 0,most_similar_index,most_similar_titleYear
0,38571.0,2 or 3 Things I Know About Her 1967
1,1079513.0,3 Faces 2018
2,454250.0,"4 Months, 3 Weeks and 2 Days 2007"
3,137372.0,"The VI Olympic Winter Games, Oslo 1952 1952"
4,35517.0,8½ 1963
...,...,...
1615,149722.0,Y Tu Mamá También 2001
1616,42766.0,Z 1969
1617,33489.0,Zazie dans le Métro 1960
1618,141852.0,Zero Focus 1961


In [28]:
data['newTitleYear'] = ''

In [29]:
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')
data = data[~data['titleYear'].str.contains('|'.join(nas))]

In [30]:
len(data)

1437

In [31]:
pd.options.mode.chained_assignment = None
for i in range(0, data.index[-1] + 1):
    try:
        index = int(matches.loc[i, 'most_similar_index'])
    except:
        pass
    data.loc[i, 'newTitleYear'] = imdb_merged.loc[index, 'titleYear']

In [32]:
data = data.dropna(axis = 0)

In [33]:
len(data)

1436

In [34]:
data

Unnamed: 0,Title,Director,Country,Year,Decade,Duration,Description,Total Hours,Url,titleYear,newTitleYear
0,2 or 3 Things I Know About Her,Jean-Luc Godard,France,1967,1960s,1:27,In 2 OR 3 THINGS I KNOW ABOUT HER (2 OU 3 CHOS...,1.45,https://www.criterionchannel.com/2-or-3-things...,2 or 3 Things I Know About Her 1967,2 or 3 Things I Know About Her 1967
1,3 Faces,Jafar Panahi,Iran,2018,2010s,1:40,Iranian master Jafar Panahi’s fourth feature s...,1.67,https://www.criterionchannel.com/3-faces,3 Faces 2018,3 Faces 2018
2,"4 Months, 3 Weeks and 2 Days",Cristian Mungiu,Romania,2007,2000s,1:53,Romanian filmmaker Cristian Mungiu shot to int...,1.88,https://www.criterionchannel.com/4-months-3-we...,"4 Months, 3 Weeks and 2 Days 2007","4 Months, 3 Weeks and 2 Days 2007"
3,"The VI Olympic Winter Games, Oslo 1952",Tankred Ibsen,Norway,1952,1950s,1:43,Director Tancred Ibsen's penchant for depictin...,1.72,https://www.criterionchannel.com/the-vi-olympi...,"The VI Olympic Winter Games, Oslo 1952 1952","The VI Olympic Winter Games, Oslo 1952 1952"
4,8½,Federico Fellini,Italy,1963,1960s,2:19,"Marcello Mastroianni plays Guido Anselmi, a di...",2.32,https://www.criterionchannel.com/81-2,8½ 1963,8½ 1963
...,...,...,...,...,...,...,...,...,...,...,...
1615,Y tu mamá también,Alfonso Cuarón,Mexico,2001,2000s,1:45,This smash road comedy from Oscar-winning dire...,1.75,https://www.criterionchannel.com/y-tu-mama-tam...,Y tu mamá también 2001,Y Tu Mamá También 2001
1616,Z,Costa-Gavras,Greece,1969,1960s,2:07,"A pulse-pounding political thriller, Greek exp...",2.12,https://www.criterionchannel.com/z-1,Z 1969,Z 1969
1617,Zazie dans le métro,Louis Malle,France,1960,1960s,1:32,A brash and precocious ten-year-old (Catherine...,1.53,https://www.criterionchannel.com/zazie-dans-le...,Zazie dans le métro 1960,Zazie dans le Métro 1960
1618,Zero Focus,Yoshitaro Nomura,Japan,1961,1960s,1:35,After her husband disappears on a business tri...,1.58,https://www.criterionchannel.com/zero-focus,Zero Focus 1961,Zero Focus 1961


In [35]:
criterion_merged = data.merge(imdb_merged, left_on = 'newTitleYear', right_on = 'titleYear')

In [38]:
len(criterion_merged)

1436

In [36]:
criterion_merged.sort_values('averageRating', ascending = False)

Unnamed: 0,Title,Director,Country,Year,Decade,Duration,Description,Total Hours,Url,titleYear_x,newTitleYear,tconst,averageRating,numVotes,primaryTitle,originalTitle,startYear,titleYear_y,isDuplicated
722,Look Back in Anger,Tony Richardson,United Kingdom,1958,1950s,1:40,Jimmy Porter (Richard Burton) is a university ...,1.67,https://www.criterionchannel.com/look-back-in-...,Look Back in Anger 1958,Look Back in Anger 1958,tt0272009,9.5,8,Look Back in Anger,Blick zurück im Zorn,1958,Look Back in Anger 1958,False
942,Pather Panchali,Satyajit Ray,India,1955,1950s,2:05,With the release in 1955 of Satyajit Ray’s deb...,2.08,https://www.criterionchannel.com/pather-panchali,Pather Panchali 1955,Pather Panchali 1955,tt0048473,8.6,23962,Pather Panchali,Pather Panchali,1955,Pather Panchali 1955,False
520,Harakiri,Masaki Kobayashi,Japan,1962,1960s,2:12,"Following the collapse of his clan, an unemplo...",2.20,https://www.criterionchannel.com/harakiri,Harakiri 1962,Hara-Kiri 1962,tt0056058,8.6,43601,Hara-Kiri,Seppuku,1962,Hara-Kiri 1962,False
1110,Seven Samurai,Akira Kurosawa,Japan,1954,1950s,3:27,One of the most thrilling movie epics of all t...,3.45,https://www.criterionchannel.com/seven-samurai,Seven Samurai 1954,Seven Samurai 1954,tt0047478,8.6,318830,Seven Samurai,Shichinin no samurai,1954,Seven Samurai 1954,False
1304,Le trou,Jacques Becker,France,1960,1960s,2:11,"In a Paris prison cell, five inmates use every...",2.18,https://www.criterionchannel.com/le-trou-1,Le trou 1960,Le Trou 1960,tt0054407,8.5,15675,Le Trou,Le trou,1960,Le Trou 1960,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,I Was a Teenage Zombie,John Elias Michalakis,United States,1987,1980s,1:31,A group of teens looking to score some weed un...,1.52,https://www.criterionchannel.com/i-was-a-teena...,I Was a Teenage Zombie 1987,I Was a Teenage Zombie 1987,tt0093238,4.5,729,I Was a Teenage Zombie,I Was a Teenage Zombie,1987,I Was a Teenage Zombie 1987,False
333,Dont Look Back,D. A. Pennebaker,United States,1967,1960s,1:36,Bob Dylan is captured on-screen as he never wo...,1.60,https://www.criterionchannel.com/dont-look-back,Dont Look Back 1967,Dont Look Back 2018,tt9095892,4.2,24,Dont Look Back,Dont Look Back,2018,Dont Look Back 2018,False
98,Beware! The Blob,Larry Hagman,United States,1972,1970s,1:27,The Blob returns—and is more outrageous than e...,1.45,https://www.criterionchannel.com/beware-the-blob,Beware! The Blob 1972,Beware! The Blob 1972,tt0068271,4.1,1841,Beware! The Blob,Beware! The Blob,1972,Beware! The Blob 1972,False
25,All Monsters Attack,Ishiro Honda,Japan,1969,1960s,1:09,Director Ishiro Honda returned again for the f...,1.15,https://www.criterionchannel.com/all-monsters-...,All Monsters Attack 1969,All Monsters Attack 1969,tt0064373,3.9,4047,All Monsters Attack,Gojira-Minira-Gabara: Oru kaijû daishingeki,1969,All Monsters Attack 1969,False


In [37]:
# Save to csv
criterion_merged.to_csv('data\Merged.csv', index = False)

### LDA Topic Modeling

In [None]:
corpus = data['Description']

In [None]:
len(corpus)

In [None]:
import re
f = open('C:/Users/HP/Documents/NLP/MySQL_stopwords.txt', 'r', encoding = 'utf-8')
stop_words = f.read()
stop_words = re.split(' \t|\n', stop_words)

In [None]:
import nltk
from nltk import TweetTokenizer
# stop_words = nltk.corpus.stopwords.words('english')
tokenizer = TweetTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def normalize_corpus(corpus):
    normalized_corpus = []
    for i, document in enumerate(corpus):
        # Lowercase
        document = document.lower()
        # Replace 
        document = document.replace("/", " ")
        document = document.replace("’", "'")
        document = document.replace("'s", "")
        # Remove numbers
        document = re.sub('[^A-Za-z0-9\']+', ' ', document)
        # Strip spaces
        document_tokens = tokenizer.tokenize(document)
        # Remove stopwords
        document_tokens = [token for token in document_tokens if token not in stop_words]
        # Lemmatize
        document_tokens = [lemmatizer.lemmatize(token) for token in document_tokens if not token.isnumeric()]
        # Remove single characters
        document_tokens = [token for token in document_tokens if len(token) > 1]
        # Remove empty corpus
        if document_tokens:
            normalized_corpus.append(document_tokens)
    return normalized_corpus
normalized_corpus = normalize_corpus(corpus)

In [None]:
import gensim
bigram = gensim.models.Phrases(normalized_corpus, min_count = 5, threshold = 5, delimiter = b'_')
bigram_model = gensim.models.phrases.Phraser(bigram)

In [None]:
normalized_corpus_bigrams = [bigram_model[post] for post in normalized_corpus]
# Create a dictionary representation of the documents.
dictionary = gensim.corpora.Dictionary(normalized_corpus_bigrams)
print('Total Vocabulary Size:', len(dictionary))

In [None]:
# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below = 5, no_above = 0.5)
print('Total Vocabulary Size:', len(dictionary))

In [None]:
# Transforming corpus into bag of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in normalized_corpus_bigrams]

In [None]:
MALLET_PATH = 'C:/mallet-2.0.8/bin/mallet'
import os
from gensim.models.wrappers import LdaMallet
from tqdm import tqdm
os.environ['MALLET_HOME'] = 'C:/mallet-2.0.8'

def topic_model_coherence_generator(corpus, texts, dictionary, 
                                    start_topic_count = 1, end_topic_count = 10, step = 1,
                                    cpus = 8):
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count + 1, step)):
        mallet_lda_model = gensim.models.wrappers.LdaMallet(mallet_path = MALLET_PATH, corpus = corpus,
                                                            num_topics = topic_nums, id2word = dictionary,
                                                            iterations = 100, workers = cpus, random_seed = 20210224)
        cv_coherence_model_mallet_lda = gensim.models.CoherenceModel(model = mallet_lda_model, corpus = corpus, 
                                                                     texts = texts, dictionary = dictionary, 
                                                                     coherence = 'c_v')
        coherence_score = cv_coherence_model_mallet_lda.get_coherence()
        coherence_scores.append(coherence_score)
        models.append(mallet_lda_model)
    return models, coherence_scores

### 40 Topics

In [None]:
end_topic_count = 40
lda_models, coherence_scores = topic_model_coherence_generator(corpus = bow_corpus, texts = normalized_corpus_bigrams,
                                                               dictionary = dictionary, start_topic_count = 1,
                                                               end_topic_count = end_topic_count, step = 1, cpus = 8)

In [None]:
import pickle
pickle.dump(lda_models, open('40_topics_lda_models.pkl', 'wb'))
pickle.dump(coherence_scores, open('40_topics_coherence_scores.pkl', 'wb'))

In [None]:
import pickle
lda_models = pickle.load(open('40_topics_lda_models.pkl','rb'))
coherence_scores = pickle.load(open('40_topics_coherence_scores.pkl','rb'))

### 80 Topics

In [None]:
end_topic_count = 80
lda_models, coherence_scores = topic_model_coherence_generator(corpus = bow_corpus, texts = normalized_corpus_bigrams,
                                                               dictionary = dictionary, start_topic_count = 1,
                                                               end_topic_count = end_topic_count, step = 1, cpus = 8)

In [None]:
import pickle
pickle.dump(lda_models, open('80_topics_lda_models.pkl', 'wb'))
pickle.dump(coherence_scores, open('80_topics_coherence_scores.pkl', 'wb'))

In [None]:
import pickle
lda_models = pickle.load(open('80_topics_lda_models.pkl','rb'))
coherence_scores = pickle.load(open('80_topics_coherence_scores.pkl','rb'))

### 160 Topics

In [None]:
end_topic_count = 160
lda_models, coherence_scores = topic_model_coherence_generator(corpus = bow_corpus, texts = normalized_corpus_bigrams,
                                                               dictionary = dictionary, start_topic_count = 1,
                                                               end_topic_count = end_topic_count, step = 1, cpus = 8)

In [None]:
import pickle
pickle.dump(lda_models, open('160_topics_lda_models.pkl', 'wb'))
pickle.dump(coherence_scores, open('160_topics_coherence_scores.pkl', 'wb'))

In [None]:
import pickle
lda_models = pickle.load(open('160_topics_lda_models.pkl','rb'))
coherence_scores = pickle.load(open('160_topics_coherence_scores.pkl','rb'))

### Analyze Model

In [None]:
import numpy as np
import pandas as pd
end_topic_count = 160
coherence_df = pd.DataFrame({'Number of Topics': range(1, end_topic_count + 1, 1),
                             'Coherence Score': np.round(coherence_scores, 4)})
coherence_df = coherence_df.sort_values(by = 'Coherence Score', ascending = False).head(10)

In [None]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

x_ax = range(1, end_topic_count + 1, 1)
y_ax = coherence_scores
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_ax, c = 'r')
plt.rcParams['figure.facecolor'] = 'white'
xl = plt.xlabel('Number of Topics')
yl = plt.ylabel('Coherence Score')

In [None]:
best_model_idx = coherence_df['Number of Topics'].index[0]
best_lda_model = lda_models[best_model_idx]
best_lda_model.num_topics

In [None]:
topics = [[(term, round(wt, 3)) 
               for term, wt in best_lda_model.show_topic(n, topn=20)] 
                   for n in range(0, best_lda_model.num_topics)]

In [None]:
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])  
                              for topic in topics],
                         columns = ['Topic Desc'],
                         index = range(1, best_lda_model.num_topics + 1)
                         )
topics_df.head()

In [None]:
tm_results = best_lda_model[bow_corpus]

In [None]:
corpus_topics = [sorted(topics, key = lambda record: -record[1])[0] for topics in tm_results]

In [None]:
corpus_topic_df = pd.DataFrame()
corpus_topic_df['Document'] = range(0, len(corpus))
corpus_topic_df['Dominant Topic'] = [item[0] + 1 for item in corpus_topics]
corpus_topic_df['Contribution %'] = [round(item[1] * 100, 2) for item in corpus_topics]
corpus_topic_df['Topic Desc'] = [topics_df.iloc[t[0]]['Topic Desc'] for t in corpus_topics]
corpus_topic_df['Description'] = corpus
corpus_topic_df.head()

In [None]:
corpus_topic_df[corpus_topic_df['Dominant Topic'] == 36].head()

In [None]:
topic_stats_df = corpus_topic_df.groupby('Dominant Topic').count()
topic_stats_df = topic_stats_df.drop(['Contribution %', 'Topic Desc', 'Description'], axis = 1)
topic_stats_df.columns = ['# of Docs']
topic_stats_df['% Total Docs'] = round(100 * topic_stats_df['# of Docs'] / sum(topic_stats_df['# of Docs']), 2)
topic_stats_df['Topic Desc'] = topics_df['Topic Desc']
topic_stats_df.sort_values('% Total Docs', ascending = False).head()

In [None]:
topic_stats_df.sort_values('% Total Docs', ascending = False).tail()

In [None]:
relevant_posts = corpus_topic_df.groupby('Dominant Topic') \
.apply(lambda topic_set: (topic_set.sort_values(by=['Contribution %'], ascending=False).iloc[0]))
relevant_posts.sort_values('Contribution %', ascending = False).head()