In [47]:
import pandas as pd
import re
import spacy
import gensim
import pyLDAvis
import pyLDAvis.gensim

In [48]:
python -m spacy download en_core_web_lg

## import data

In [49]:
love_is_blind = []
for i in range(1,14):
    if i < 10:
        i = pd.read_csv("loveisblind4/Love.Is.Blind.S04E0"+str(i)+".WEBRip.NF.en.srt.csv").text
    else:
        i = pd.read_csv("loveisblind4/Love.Is.Blind.S04E"+str(i)+".WEBRip.NF.en.srt.csv").text
    love_is_blind.append(i)

In [50]:
lib = pd.DataFrame(pd.concat(love_is_blind, axis=0, ignore_index=True))
lib.shape

(15938, 1)

In [51]:
love_island = []
for i in range(1,30):
    if i < 10:
        i = pd.read_csv("loveislandaus4/Love.Island.Australia.S04E0"+str(i)+".WEBRip.NF.en.srt.csv").text
    else:
        i = pd.read_csv("loveislandaus4/Love.Island.Australia.S04E"+str(i)+".WEBRip.NF.en.srt.csv").text
    love_island.append(i)

In [52]:
lisl = pd.DataFrame(pd.concat(love_island, axis=0, ignore_index=True))
lisl.shape

(30984, 1)

In [53]:
too_hot_to_handle = []
for i in range(1,9):
    i = pd.read_csv("toohot1/Too.Hot.to.Handle.S01E0"+str(i)+".NF.WEBRip-HI.srt.csv").text
    too_hot_to_handle.append(i)

In [54]:
hot = pd.DataFrame(pd.concat(too_hot_to_handle, axis=0, ignore_index=True))
hot.shape

(6681, 1)

## function | clean symbols

In [55]:
# helper function
def clean_text(subtitle_lst):
    
  cleaned_text = []

  for subtitle_str in subtitle_lst:

    # clean symbols
    cleanhtml = re.compile('<.*?>')
    clean1 = re.sub(cleanhtml, '', subtitle_str)

    cleanbrackets = re.compile('\[.*?\]')
    clean2 = re.sub(cleanbrackets, '', clean1)

    cleancurly = re.compile('{.*?}')
    clean3 = re.sub(cleancurly, '', clean2)

    cleandash = re.compile('-')
    clean4 = re.sub(cleandash, '', clean3)

    cleanellipse = re.compile('\…')
    clean5 = re.sub(cleanellipse, '', clean4)

    if "\n" in clean5:
      clean5 = clean5.replace("\n"," ")

    if "\xa0" in clean5:
      clean5 = clean5.replace("\xa0"," ")

    clean5 = clean5.lower() # lowercase 

    clean5 = clean5.lstrip().strip() # strip leading and ending spaces
    
    cleaned_text.append(clean5)

  cleaned_text = list(filter(None, cleaned_text)) # remove empty strings
  cleaned_text = [string for string in cleaned_text if string[0] != "♪"] # remove music subtitles


  return cleaned_text

## function | tokenization

In [56]:
nlp = spacy.load("en_core_web_lg")

In [73]:
def tokenize_text(sub_lst):
    
    cleaned_lst = []

    for subtitle in sub_lst:
        
        # replace names
        for token in nlp(subtitle).ents:
             if token.label_ == "PERSON":
                subtitle = token.text.replace(token.text, "PERSON")
        
        doc = nlp(subtitle)

        # remove punctuation, stop words
        doc_cleaned_one = [token for token in doc if token.is_punct == False and token.text != ' ' and token.is_stop == False]
        # remove additional words
        remove_words = ["gon", "na", "um", "wanna", "mmhmm", "oh", "uh", "ooh", "y'", "'em", "ta", "wo", "ya", # filler sounds
                        "yes", "yeah", "okay", "hi",  "hello", "right", "hey", "guys", "lot", "like", "bit", "'cause", # filler words
                        "know", "feel", "let", "think", "love", "good", "want", "going", "got", "mean", "said", # too common
                        # "fuck", "shit", "girl", "girls", "fuck", "guy", "man", "bro", #too common, don't use for love is blind
                        "micah", "amber", "tiffany", "PERSON", "zack", "stella", "irina", "chelsea", "irina", "shelby","marshall", # names
                        "rhonda", "francesca", "yoni", "sharron", "lana", "fran", "connor", "austen", "jordan", "maddy", "callum", # names
                        "mitch", "phoebes", "majorca", "god", "browns", "phoebe", "yonis" # names
                        "arizona", "mexico" # places
                        ]
        
        doc_cleaned_two = [token for token in doc_cleaned_one if str(token) not in remove_words]

        cleaned_lst.append(doc_cleaned_two)

    cleaned_lst = [x for x in cleaned_lst if x]

    return cleaned_lst

In [58]:
def make_docs(show_subtitles):
    docs = tokenize_text(clean_text(show_subtitles.text))
    docs_str = []
    for doc in docs:
        docs_str.append([str(token) for token in doc])
    docs = docs_str
    return docs

# clean subtitles

In [74]:
lib_docs = make_docs(lib)

In [60]:
len(lib_docs)

11061

In [61]:
lisl_docs = make_docs(lisl)

In [62]:
len(lisl_docs)

22593

In [63]:
hot_docs = make_docs(hot)

In [64]:
len(hot_docs)

4555

# function | combine docs

In [65]:
def combine(docs, group_size):
    combined_docs = []
    num_groups = len(docs) // group_size

    for i in range(num_groups):
        docs_big = docs[i*group_size : (i+1)*group_size]
        combined_docs.append([word for doc in docs_big for word in doc])

    remaining_docs = docs[num_groups*group_size:]
    if remaining_docs:
        combined_docs.append([word for doc in remaining_docs for word in doc])
    
    return combined_docs

# love is blind

In [67]:
import warnings
warnings.filterwarnings("ignore")

In [75]:
group_size = 5
num_topics = 4

lib_combined = combine(lib_docs, group_size)

dictionary = gensim.corpora.dictionary.Dictionary(lib_combined)
dictionary.filter_extremes(no_below=3)
corpus = [dictionary.doc2bow(doc) for doc in lib_combined] 
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, random_state = 1)

%matplotlib inline
vis = pyLDAvis.gensim.prepare(topic_model=ldamodel, corpus=corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.save_html(vis, 'love_is_blind.html')

# love island 

In [70]:
group_size = 5
num_topics = 3

lisl_combined = combine(lisl_docs, group_size)

dictionary = gensim.corpora.dictionary.Dictionary(lisl_combined)
dictionary.filter_extremes(no_below=3)
corpus = [dictionary.doc2bow(doc) for doc in lisl_combined] 
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, random_state = 1)

%matplotlib inline
vis = pyLDAvis.gensim.prepare(topic_model=ldamodel, corpus=corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.save_html(vis, 'love_island.html')

# too hot to handle

In [71]:
group_size = 5
num_topics = 3

hot_combined = combine(hot_docs, group_size)

dictionary = gensim.corpora.dictionary.Dictionary(hot_combined)
dictionary.filter_extremes(no_below=2)
corpus = [dictionary.doc2bow(doc) for doc in hot_combined] 
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, random_state = 1)

%matplotlib inline
vis = pyLDAvis.gensim.prepare(topic_model=ldamodel, corpus=corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.save_html(vis, 'too_hot_to_handle.html')