In [3]:
from transformers import pipeline
import nltk
import torch
from glob import glob
import tf_keras as keras
import pandas as pd
from nltk import sent_tokenize
import numpy as np

In [4]:
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Yuvraj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yuvraj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
model_name = "facebook/bart-large-mnli"

def load_model(device):
    theme_classifier = pipeline(
        "zero-shot-classification",
        model=model_name,
        device=device
    )

    return theme_classifier

In [7]:
theme_classifier = load_model(device)



In [8]:
theme_list = ["Friendship","Perseverance","Betrayal","Revenge","Redemption","Leadership","Hard Work","Fate vs. Free Will","Sacrifice","Loneliness","War","Peace","Power","Legacy","Family","Identity","Discrimination","Love","Brotherhood","Mentorship","Destiny","Hope","Courage","Honor","Duty","Growth","Pain","Forgiveness","Loyalty","Darkness vs. Light"]

In [9]:
theme_classifier(
    "first i gave him left punch then right punch",
    theme_list,
    multi_label = True
)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'sequence': 'first i gave him left punch then right punch',
 'labels': ['Power',
  'Revenge',
  'Family',
  'Hard Work',
  'Duty',
  'Leadership',
  'Destiny',
  'War',
  'Pain',
  'Perseverance',
  'Growth',
  'Brotherhood',
  'Redemption',
  'Legacy',
  'Honor',
  'Identity',
  'Discrimination',
  'Betrayal',
  'Fate vs. Free Will',
  'Courage',
  'Darkness vs. Light',
  'Loyalty',
  'Sacrifice',
  'Hope',
  'Mentorship',
  'Friendship',
  'Loneliness',
  'Love',
  'Peace',
  'Forgiveness'],
 'scores': [0.8209035992622375,
  0.7312631011009216,
  0.5647425055503845,
  0.5496121048927307,
  0.47235631942749023,
  0.4217102527618408,
  0.3963577449321747,
  0.38336506485939026,
  0.3623467683792114,
  0.33490651845932007,
  0.2953099012374878,
  0.24557647109031677,
  0.23947873711585999,
  0.23876646161079407,
  0.21402662992477417,
  0.21078836917877197,
  0.20708662271499634,
  0.1629658192396164,
  0.15669062733650208,
  0.13681310415267944,
  0.07609722018241882,
  0.060043428093

In [10]:
files = glob('../data/Subtitles/*.ass')

In [11]:
def load_subtitle_dataset(dataset_path):
    subtitle_paths = glob(dataset_path + '/*.ass')

    scripts = []
    episode_num = []

    for path in subtitle_paths:

        with open(path, 'r', encoding='utf8') as file:
            lines = file.readlines()
            lines = lines[27:]
            lines = [",".join(line.split(',')[9:]) for line in lines]
        
        lines = [line.replace('\\N', ' ') for line in lines]
        script = " ".join(lines)

        episode = int(path.split('-')[-1].split('.')[0].strip())

        scripts.append(script)
        episode_num.append(episode)

    data = pd.DataFrame.from_dict({"episode" : episode_num, "scripts" : scripts})
    return data

In [12]:
dataset_path = '../data/Subtitles'
data = load_subtitle_dataset(dataset_path)

In [13]:
data.head()

Unnamed: 0,episode,scripts
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."


In [14]:
script = data.iloc[0]['scripts']
script

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [15]:
# Tokenize the script into sentences
script_sentences = sent_tokenize(script)
script_sentences[:3]

['A long time ago, a powerful demon fox appeared with nine tails.',
 'With its powerful tails,\n it could smash mountains and create tidal waves.',
 'A band of Ninjas rose to defend their village from attack.']

In [16]:
sentence_batch_size = 20
script_batches = []
for index in range(0, len(script_sentences), sentence_batch_size):
    sent = " ".join(script_sentences[index : index + sentence_batch_size])
    script_batches.append(sent)


In [17]:
script_batches[:2]

["A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
 'Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them! Wait! Ha ha…\n Why should I? Hey, Naruto! How did you suddenly get here, lruka Sensei? The question is what are you doing here when you should be in class now? Now listen, Naruto. You failed t

In [29]:
theme_output = theme_classifier(
    script_batches[:4],
    theme_list,
    multi_label = True

)

In [26]:
theme_output

[{'sequence': "A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
  'labels': ['Leadership',
   'Betrayal',
   'Mentorship',
   'Power',
   'Loyalty',
   'Hard Work',
   'Courage',
   'War',
   'Discrimination',
   'Perseverance',
   'Revenge',
   'Duty',
   'Honor',
   'Identity',
   'Destiny',
   'Family',
   'Redemption',
   'Legacy',
   

In [30]:
themes = {}
for output in theme_output:
    for label, score in zip(output['labels'], output['scores']):
        if label not in themes:
            themes[label] = []
            themes[label].append(score)

In [31]:
themes

{'Leadership': [0.9002184867858887],
 'Betrayal': [0.8034593462944031],
 'Mentorship': [0.7910366654396057],
 'Power': [0.7896084189414978],
 'Loyalty': [0.6325691342353821],
 'Hard Work': [0.6186954975128174],
 'Courage': [0.609144926071167],
 'War': [0.5611845850944519],
 'Discrimination': [0.5497225522994995],
 'Perseverance': [0.5229366421699524],
 'Revenge': [0.5099170804023743],
 'Duty': [0.4737992584705353],
 'Honor': [0.4579600393772125],
 'Identity': [0.4276641011238098],
 'Destiny': [0.4188220202922821],
 'Family': [0.4138220548629761],
 'Redemption': [0.3999367356300354],
 'Legacy': [0.39962470531463623],
 'Pain': [0.3709356188774109],
 'Sacrifice': [0.36569178104400635],
 'Growth': [0.3496747612953186],
 'Darkness vs. Light': [0.3145155608654022],
 'Fate vs. Free Will': [0.24192823469638824],
 'Brotherhood': [0.2285873144865036],
 'Loneliness': [0.12809304893016815],
 'Hope': [0.09071825444698334],
 'Forgiveness': [0.06819117069244385],
 'Friendship': [0.06264957785606384],

In [35]:
def get_themes_inference(script):
    script_sentences = sent_tokenize(script)

    sentence_batch_size = 20
    script_batches = []
    for index in range(0, len(script_sentences), sentence_batch_size):
        sent = " ".join(script_sentences[index : index + sentence_batch_size])
        script_batches.append(sent)

    
    theme_output = theme_classifier(
        script_batches,
        theme_list, 
        multi_label = True
    )

    themes = {}
    for output in theme_output:
        for label, score in zip(output['labels'], output['scores']):
            if label not in themes:
                themes[label] = []
                themes[label].append(score)
    
    themes = {key : np.mean(np.array(value)) for key, value in themes.items()}

    return themes