# Subtitle analysis with Python

---
    Title: subtitle analysis with python
    Author: Bart Deijkers (bartdeijkers@gmail.com)
    Date: 2022-10-19
---


# STEP 1: Load essentials

In [None]:
import json
import os
import re
import sys

import matplotlib.pyplot as plt
import pandas as pd
import spacy
import textacy
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from wordcloud import WordCloud


# Detect the runtime: the google.colab module is only loaded on Google Colab.
ON_COLAB = 'google.colab' in sys.modules

# Base directory depends on where the notebook runs.
BASE_DIR = "/content" if ON_COLAB else ".."

if not ON_COLAB:
    print("You are working on a local system.")
    print(f'Files will be searched relative to "{BASE_DIR}".')
else:
    print("You are working on Google Colab.")
    print(f'Files will be downloaded to "{BASE_DIR}".')


# Ask spaCy to use a GPU when one is available and report the outcome.
print("Working on GPU." if spacy.prefer_gpu() else "No GPU found, working on CPU.")

# Dutch stop words from NLTK (downloaded above via nltk.download).
stopword_list = stopwords.words('dutch')

# Load the Dutch spaCy pipeline.
# Download and install the model first with:
# $ python -m spacy download nl_core_news_lg
nlp = spacy.load("nl_core_news_lg")

# Raise the maximum text length the pipeline accepts (spaCy's default is 1M
# characters); the corpus length is checked against this limit in step 3.
nlp.max_length = 2000000

print("done")


# STEP 2: Load dataset

In [None]:

# Build one large text corpus from all subtitle JSON files in the data directory.

# Skip subtitle entries whose text contains any of these literal strings.
# They are matched as plain substrings, not as regular expressions, so the
# dots in "service.npo.nl" only match real dots.
skip = [
    "888",
    "LIVEPROGRAMMA,\nONDERTITELING KAN ACHTERLOPEN",
    "NPO ONDERTITELING TT888, 2022\ninformatie: service.npo.nl",
    "DIT PROGRAMMA WERD LIVE ONDERTITELD",
]

# read all json files in folder (change to your own folder if needed)
dataset = "./data/json/"

# Collect the pieces in a list and join once at the end; repeated string
# concatenation would copy the growing corpus over and over.
pieces = []

# Loop over the files in sorted order so the corpus is built reproducibly
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(dataset)):
    # Ignore anything that is not a JSON file (hidden files, READMEs, ...),
    # which would otherwise make json.load fail.
    if not filename.lower().endswith(".json"):
        continue

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(os.path.join(dataset, filename), "r", encoding="utf-8") as raw_data:
        data = json.load(raw_data)

    # Each item is expected to carry its subtitle text under the 'text' key.
    for item in data:
        if not any(marker in item['text'] for marker in skip):
            pieces.append(item['text'] + ' ')

text = ''.join(pieces)

# remove sentence continuation markers (Dutch TT888 convention)
text = text.replace('...', ' ')

# remove newlines
text = text.replace('\n', ' ')

# collapse runs of spaces into a single space
text = re.sub(' +', ' ', text)

print("done loading json data")

# STEP 3: Check dataset boundaries (text length should not exceed 2,000,000 characters)

In [None]:
# Report the corpus size and compare it with the limit configured on the
# pipeline, so this check stays correct when nlp.max_length is changed.
print("Text length:", len(text))

if len(text) > nlp.max_length:
    print("this text is too long for spacy, please split it up in smaller chunks, or increase the maximum length of the corpus")
else:
    print("No boundaries reached, processing text")

print('done')

# STEP 4: Process text

In [None]:
# Run the spaCy pipeline over the whole corpus.
doc = nlp(text)

# Extra stop words that are common in Dutch subtitles. They are consulted
# directly through nlp.Defaults.stop_words when counting frequencies.
nlp.Defaults.stop_words.update([
    'bedankt',
    '888',
    'hee',
    'gelach',
    'applaus',
    'applaus en gejuich',
    'gejuich',
    'mens',
    'een beetje',
    'dank',
    'dank u',
    'dank u wel',
    'dank u wel allemaal',
])


def _entities_labelled(label):
    """Return the named-entity spans of ``doc`` whose label equals ``label``."""
    return [span for span in doc.ents if span.label_ == label]


# Named entities grouped by entity label.
events = _entities_labelled('EVENT')
persons = _entities_labelled('PERSON')
locations = _entities_labelled('LOC')
organizations = _entities_labelled('ORG')

# Individual tokens tagged as common or proper nouns.
nouns = [token for token in doc if token.pos_ in ('NOUN', 'PROPN')]

print('done')


# STEP 5: Get top 25 mentioned events, persons, organizations, locations and nouns

In [None]:
def get_word_freq(doc):
    """Count how often each lemma occurs in ``doc``, ignoring stop words.

    A lemma is ignored when it appears in ``nlp.Defaults.stop_words`` or in
    the NLTK ``stopword_list``.

    Args:
        doc: iterable of objects exposing a ``lemma_`` attribute (spaCy
            tokens or entity spans).

    Returns:
        A list of ``(lemma, count)`` tuples sorted by descending count;
        lemmas with equal counts keep the order in which they first appeared.
    """
    from collections import Counter

    # Merge both stop word sources into one set so each membership test is
    # a single hash lookup instead of a scan through the NLTK list.
    excluded = set(nlp.Defaults.stop_words).union(stopword_list)

    counts = Counter(
        item.lemma_ for item in doc if item.lemma_ not in excluded
    )

    # most_common() without an argument returns every (lemma, count) pair,
    # sorted by count from high to low with a stable sort.
    return counts.most_common()
    
from itertools import zip_longest

# Frequency tables per category: lists of (lemma, count), most frequent first.
sorted_event_freq = get_word_freq(events)
sorted_person_freq = get_word_freq(persons)
sorted_organization_freq = get_word_freq(organizations)
sorted_location_freq = get_word_freq(locations)
sorted_noun_freq = get_word_freq(nouns)

# Top 25 of every category, in the same order as the column names below.
data = [
    sorted_event_freq[:25],
    sorted_person_freq[:25],
    sorted_organization_freq[:25],
    sorted_location_freq[:25],
    sorted_noun_freq[:25],
]

# Create the dataframe: one column per category, one row per rank.
# zip_longest (instead of zip) keeps all rows when a category has fewer than
# 25 entries; plain zip would cut every column down to the shortest category.
# Missing cells are filled with None.
df = pd.DataFrame(
    list(zip_longest(*data)),
    columns=['Events', 'Persons', 'Organizations', 'Locations', 'Nouns'],
)
print(df)

print('done')

# STEP 6: Wordclouds

In [None]:
# create wordcloud from events (top 50)
event_frequencies = dict(sorted_event_freq[:50])

if event_frequencies:
    wordcloud = WordCloud(width=1024, height=1024,
                          background_color='white',
                          min_font_size=10).generate_from_frequencies(event_frequencies)

    # plot the WordCloud image
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()
else:
    # generate_from_frequencies raises ValueError when there are no words,
    # so report the empty category instead of crashing the cell.
    print("No events found, skipping word cloud.")

print('done')

In [None]:
# create wordcloud from persons (top 50)
person_frequencies = dict(sorted_person_freq[:50])

if person_frequencies:
    wordcloud = WordCloud(width=1024, height=1024,
                          background_color='white',
                          min_font_size=10).generate_from_frequencies(person_frequencies)

    # plot the WordCloud image
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()
else:
    # generate_from_frequencies raises ValueError when there are no words,
    # so report the empty category instead of crashing the cell.
    print("No persons found, skipping word cloud.")

print('done')

In [None]:
# create wordcloud from organizations (top 50)
organization_frequencies = dict(sorted_organization_freq[:50])

if organization_frequencies:
    wordcloud = WordCloud(width=1024, height=1024,
                          background_color='white',
                          min_font_size=10).generate_from_frequencies(organization_frequencies)

    # plot the WordCloud image
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()
else:
    # generate_from_frequencies raises ValueError when there are no words,
    # so report the empty category instead of crashing the cell.
    print("No organizations found, skipping word cloud.")

print('done')

In [None]:
# create wordcloud from locations (top 50)
location_frequencies = dict(sorted_location_freq[:50])

if location_frequencies:
    wordcloud = WordCloud(width=1024, height=1024,
                          background_color='white',
                          min_font_size=10).generate_from_frequencies(location_frequencies)

    # plot the WordCloud image
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()
else:
    # generate_from_frequencies raises ValueError when there are no words,
    # so report the empty category instead of crashing the cell.
    print("No locations found, skipping word cloud.")

print('done')

In [None]:
# create wordcloud from nouns and proper nouns (top 50)
# Reuse the frequency table computed in step 5 instead of counting again.
noun_frequencies = dict(sorted_noun_freq[:50])

if noun_frequencies:
    wordcloud = WordCloud(width=1024, height=1024,
                          background_color='white',
                          min_font_size=10).generate_from_frequencies(noun_frequencies)

    # plot the WordCloud image
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()
else:
    # generate_from_frequencies raises ValueError when there are no words,
    # so report the empty category instead of crashing the cell.
    print("No nouns found, skipping word cloud.")

print('done')

