In [24]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch
from glob import glob
import pandas as pd
import numpy as np

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yonatanamaru/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
model_name ='facebook/bart-large-mnli'

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [7]:
def load_model(model_name, device):
    model = pipeline('zero-shot-classification', model=model_name, device=device)
    return model

In [8]:
theme_classifier = load_model(model_name, device)



In [40]:
theme_list = ["friendship","hope","sacrifice","battle","self development","betrayal","love","dialogue"]
for x in theme_list:
    print(x, end=', ')

friendship, hope, sacrifice, battle, self development, betrayal, love, dialogue, 

In [10]:
theme_classifier(
    "i punched him in the face",
    theme_list,
    multi_label=True
)

{'sequence': 'i punched him in the face',
 'labels': ['fight', 'hate', 'anticipation', 'love', 'friendship'],
 'scores': [0.9951337575912476,
  0.468103289604187,
  0.0042537059634923935,
  0.000228555261855945,
  0.00012069544027326629]}

In [15]:
files = glob('../data/subtitle/*.ass')

In [16]:
with open(files[0], 'r') as file:
    lines = file.readlines()
    lines = lines[27:]
    lines = [",".join(line.split(',')[9:]) for line in lines]

In [21]:
lines = [line.replace('\\N', ' ') for line in lines]

In [25]:
def load_subtitles_dataset(dataset_path):
    subtitles_paths = glob(dataset_path+'/*.ass')

    scripts=[]
    episode_num=[]

    for path in subtitles_paths:

        #Read Lines
        with open(path,'r') as file:
            lines = file.readlines()
            lines = lines[27:]
            lines =  [ ",".join(line.split(',')[9:])  for line in lines ]
        
        lines = [ line.replace('\\N',' ') for line in lines]
        script = " ".join(lines)

        episode = int(path.split('-')[-1].split('.')[0].strip())

        scripts.append(script)
        episode_num.append(episode)

    df = pd.DataFrame.from_dict({"episode":episode_num, "script":scripts })
    return df

In [26]:
dataset_path = '../data/subtitle'
df = load_subtitles_dataset(dataset_path)

In [27]:
df.head()

Unnamed: 0,episode,script
0,94,We are Fighting Dreamers aiming high\n Fightin...
1,80,We are Fighting Dreamers aiming high\n Fightin...
2,32,"Press down hard on the gas\n That’s right, the..."
3,185,"Rock away your existence,\n Shouting that you ..."
4,191,"Rock away your existence,\n Shouting that you ..."


In [28]:
script = df.iloc[0]['script']
script

'We are Fighting Dreamers aiming high\n Fighting Dreamers don\'t care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else\'s map?\n An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!)\n There\'s nothing to lose, so let\'s GO!!!\n We are Fighting Dreamers aiming high\n Fighting Dreamers don\'t care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn) We\'re gonna do it and do our best!\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn) We\'re gonna do it and do our best! BANG!\n My bod

In [29]:
sentences = sent_tokenize(script)
sentences[:3]

["We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh!",
 'Just go my way\n Right here right now (Bang) Hit it straight like a line drive!',
 "Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else's map?"]

In [30]:
sentence_batch_size=20
script_batches = []
for index in range(0, len(sentences), sentence_batch_size):
    sent = " ".join(sentences[index:index+sentence_batch_size])
    script_batches.append(sent)

In [31]:
sent

'Wha--?! I-It can’t be. Rasengan! I believe your promise, its a true heart give me courage\n Always I feel it the precious time with you\n On your still shoulder\n I feel the wind that blows towards tomorrow\n The city lights are like pieces of stars\n That hold us together\n “Each one of us glows differently,”\n you say laughingly\n but you look most dazzling\n Like a comet,\n like the rainbow after the storm\n You bring light into my heart\n Being lost can be the answer\n Let’s promise we won’t cheat\n Uh... like a shooting star, we do go ahead. I never forget my ideal…\n to develop the Village Hidden in the Leaf and protect its people. The Hokages have sacrificed their lives for that dream. And now, I also will put my life on the line. From this moment, I am the Fifth Hokage! Next time: "The Fifth Hokage! A Life on the Line!"'

In [32]:
theme_output =  theme_classifier(
    script_batches[:2],
    theme_list,
    multi_label=True
)

In [33]:
theme_output

[{'sequence': "We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else's map? An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!) There's nothing to lose, so let's GO!!! We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn) We're gonna do it and do our best! Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn) We're gonna do it and do our best! BANG! My body movemen

In [35]:
def get_theme_inference(script):
    sentences = sent_tokenize(script)
    sentence_batch_size=20
    script_batches = []
    for index in range(0, len(sentences), sentence_batch_size):
        sent = " ".join(sentences[index:index+sentence_batch_size])
        script_batches.append(sent)

    theme_output =  theme_classifier(
        script_batches[:2],
        theme_list,
        multi_label=True
    )

    # wrangle output
    themes = {}
    for output in theme_output:
        for label, score in zip(output['labels'], output['scores']):
            if label not in themes:
                themes[label] = []
            themes[label].append(score)

    themes = {key:np.mean(value) for key,value in themes.items()}
    return themes

In [36]:
df.head(2)

Unnamed: 0,episode,script
0,94,We are Fighting Dreamers aiming high\n Fightin...
1,80,We are Fighting Dreamers aiming high\n Fightin...


In [38]:
output_themes = df['script'].apply(get_theme_inference)


KeyboardInterrupt: 

In [None]:
theme_df = pd.DataFrame(output_themes.tolist())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.barplot(data = theme_output ,x="theme",y="score" )
plt.xticks(rotation=45)
plt.show()