In [43]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch
from glob import glob
import pandas as pd
import numpy as np

In [7]:
# Fetch the Punkt sentence-tokenizer data; per the log below this is a no-op
# when the package is already installed.
# NOTE(review): presumably required by the `sent_tokenize` import above - confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/burii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Model configuration: checkpoint name and the device inference will run on.
model_name = "facebook/bart-large-mnli"

# Use GPU index 0 when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = 'cpu'

In [9]:
def load_model(device):
    """Build the zero-shot theme classifier.

    Creates a ``zero-shot-classification`` pipeline for the checkpoint named
    by the notebook-level ``model_name`` variable.

    Args:
        device: Device argument forwarded unchanged to ``pipeline`` (this
            notebook passes ``0`` or ``'cpu'``).

    Returns:
        The object returned by ``pipeline``.
    """
    pipeline_kwargs = {"model": model_name, "device": device}
    return pipeline("zero-shot-classification", **pipeline_kwargs)

In [10]:
# Instantiate the classifier once so the cells below can reuse it.
theme_classifier = load_model(device)



In [11]:
# Candidate theme labels handed to the zero-shot classifier.
theme_list = [
    "friendship",
    "hope",
    "sacrifice",
    "battle",
    "self development",
    "betrayal",
    "love",
    "dialogue",
]

In [12]:
# Smoke-test the classifier on a single sentence. With multi_label=True each
# theme gets its own score, so the scores in the output below do not sum to 1.
theme_classifier(
    "I gave him a right hook then a left jab",
    theme_list,
    multi_label=True
)

{'sequence': 'I gave him a right hook then a left jab',
 'labels': ['battle',
  'self development',
  'hope',
  'sacrifice',
  'dialogue',
  'betrayal',
  'love',
  'friendship'],
 'scores': [0.9121253490447998,
  0.47500181198120117,
  0.08781816065311432,
  0.04500045254826546,
  0.020132921636104584,
  0.012040478177368641,
  0.0042923795990645885,
  0.002817221451550722]}

In [29]:
import os
from pathlib import Path

# Collect every subtitle file and order it by (season, episode).
# glob() gives no ordering guarantee, so the sort must be explicit. The key
# parses only the file name (Path(x).stem, e.g. "Naruto Season 1 - 01") rather
# than the full path, so a '-' or the word 'Season' in a parent directory
# cannot corrupt the season/episode extraction.
files = sorted(
    glob('../data/Subtitles/*.ass'),
    key=lambda x: (
        int(Path(x).stem.split('Season')[1].split('-')[0]),  # season number
        int(Path(x).stem.split('-')[-1].strip()),  # episode number (text after the last '-')
    ),
)

# Verify first and last episodes
print("First episode:", files[0])
print("Last episode:", files[-1])
print("Total episodes:", len(files))

First episode: ../data/Subtitles/Naruto Season 1 - 01.ass
Last episode: ../data/Subtitles/Naruto Season 9 - 220.ass
Total episodes: 218


In [30]:
print(len(files))  # Sanity check: number of subtitle files in `files`

218


In [31]:
# Peek at the first five paths to confirm the sort order.
files[:5]

['../data/Subtitles/Naruto Season 1 - 01.ass',
 '../data/Subtitles/Naruto Season 1 - 02.ass',
 '../data/Subtitles/Naruto Season 1 - 03.ass',
 '../data/Subtitles/Naruto Season 1 - 04.ass',
 '../data/Subtitles/Naruto Season 1 - 05.ass']

In [32]:
# Read the first episode. The encoding is given explicitly so decoding does
# not depend on the platform's locale default (the dialogue contains
# non-ASCII characters such as the ellipsis).
with open(files[0], 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Drop the first 27 lines, which precede the dialogue in this file
# (NOTE(review): assumed to be the fixed-length .ass header - confirm for other files).
lines = lines[27:]

# Keep only the text that follows the ninth comma of each line; re-joining with
# "," restores commas that belong to the dialogue text itself.
lines = [",".join(line.split(',')[9:]) for line in lines]

In [33]:
# Inspect the first two extracted lines; the `\N` markers are still present here.
lines[:2]

['A long time ago, a powerful demon fox\\Nappeared with nine tails.\n',
 'With its powerful tails,\n']

In [34]:
# Swap each literal `\N` marker for a space so every cue reads as one line.
lines = [text.replace('\\N', ' ') for text in lines]

In [35]:
# Confirm the `\N` markers were replaced by spaces.
lines[:3]

['A long time ago, a powerful demon fox appeared with nine tails.\n',
 'With its powerful tails,\n',
 'it could smash mountains and create tidal waves.\n']

In [36]:
# Preview the first ten lines joined into one string; each line keeps its
# trailing newline, so the result contains "\n " between cues.
" ".join(lines[:10])

"A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can't let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n"

In [41]:
# Check the episode-number parsing on the first path: take the text after the
# last '-', drop everything from the first '.', strip spaces, convert to int.
int(files[0].split('-')[-1].split('.')[0].strip())

1

In [44]:
def load_subtitles_dataset(dataset_path, header_lines=27, encoding="utf-8"):
    """Load every ``.ass`` subtitle file in a directory into a DataFrame.

    For each file, the first ``header_lines`` lines are skipped, the text after
    the ninth comma of every remaining line is kept, literal ``\\N`` markers
    are replaced by spaces, and the lines are joined with single spaces into
    one script string. Lines keep their trailing newline.

    Rows are returned in ascending episode order. ``glob`` gives no ordering
    guarantee, so the result is sorted explicitly (ties broken by path) to be
    deterministic across platforms and filesystems.

    Args:
        dataset_path: Directory containing the ``*.ass`` files.
        header_lines: Number of leading lines to skip in each file. Defaults
            to 27, the value this notebook uses for its subtitle files.
        encoding: Text encoding used to read the files. Given explicitly so
            decoding does not depend on the platform's locale default.

    Returns:
        pandas.DataFrame with columns ``episode`` (int) and ``script`` (str),
        one row per file. Empty (but with both columns) if no file matches.

    Raises:
        ValueError: If the text between the last ``-`` of a path and the
            following ``.`` is not an integer.
    """
    records = []

    for path in glob(dataset_path + '/*.ass'):
        # Episode number is the token after the last '-' in the path,
        # e.g. ".../Naruto Season 1 - 01.ass" -> 1.
        episode = int(path.split('-')[-1].split('.')[0].strip())

        # Read lines, dropping the header that precedes the dialogue.
        with open(path, 'r', encoding=encoding) as file:
            raw_lines = file.readlines()[header_lines:]

        # Keep the text after the ninth comma (re-joining with "," restores
        # commas inside the dialogue) and replace the `\N` markers with spaces.
        dialogue = [
            ",".join(line.split(',')[9:]).replace('\\N', ' ')
            for line in raw_lines
        ]

        records.append((episode, path, " ".join(dialogue)))

    # Deterministic episode order regardless of what glob() returned.
    records.sort(key=lambda record: (record[0], record[1]))

    df = pd.DataFrame.from_dict({
        "episode": [record[0] for record in records],
        "script": [record[2] for record in records],
    })
    return df