# Setup: imports and episode file discovery

In [66]:
import difflib
import glob
import os
import re
import shlex
import shutil
import subprocess

import moviepy
import numpy as np
import pandas as pd
from moviepy.editor import *

# Directory that is globbed for the video (*.mkv, *.avi) and subtitle (*.srt) files.
MOVIES_PATH = "./Data/synced"


In [67]:
# Patterns tried, in order, to find the season/episode numbers in a file name.
# They are raw strings so that the regex escapes (\d, \s, \[) are not parsed as
# (invalid) Python string escapes.
SEASON_EPISODE_REGEXES = [
    r'S(?P<season>\d\d)\s?E(?P<episode>\d\d)',  # "S26E01" or "S26 E01"
    r'\[(?P<season>\d+)x(?P<episode>\d+)\]',    # "[26x01]"
    r'(?P<season>\d+)x(?P<episode>\d+)\s',      # "26x01" followed by whitespace
]


def get_season_episode(s):
    """Extract the season and episode numbers from a file name or path.

    The patterns in SEASON_EPISODE_REGEXES are searched anywhere in ``s`` and
    the first one that matches wins.

    Parameters
    ----------
    s : str
        File name or path to inspect.

    Returns
    -------
    tuple of (str, str) or None
        ``(season, episode)`` exactly as the digits appear in ``s`` (leading
        zeros are kept, e.g. ``("26", "01")``), or None if no pattern matches.
    """
    for regex in SEASON_EPISODE_REGEXES:
        m = re.search(regex, s)
        if m:
            return (m.group("season"), m.group("episode"))
    return None
        

def _episode_files_df(files, filename_column):
    """Tabulate the files whose name yields a season/episode.

    Files for which get_season_episode returns None are left out.

    Parameters
    ----------
    files : iterable of str
        Candidate file paths.
    filename_column : str
        Name of the column that holds the file path.

    Returns
    -------
    pandas.DataFrame
        Columns ``[filename_column, "season", "episode"]``, one row per parsed file.
    """
    rows = []
    for path in files:
        season_episode = get_season_episode(path)
        if season_episode is not None:
            rows.append([path, *season_episode])
    return pd.DataFrame(rows, columns=[filename_column, "season", "episode"])


# glob returns files in arbitrary order; sorting keeps the tables (and everything
# derived from their row order) reproducible between runs.
movie_files = sorted(file for suffix in ["mkv", "avi"] for file in glob.glob(MOVIES_PATH+"/*."+suffix))
srt_files = sorted(file for suffix in ["srt"] for file in glob.glob(MOVIES_PATH+"/*."+suffix))

# Each (season, episode) may appear only once per file kind, otherwise the
# merge below would pair files ambiguously.
videos_df = _episode_files_df(movie_files, "video_filename")
if videos_df.duplicated(["season", "episode"]).any():
    raise Exception("Double season+episode in video files")

srts_df = _episode_files_df(srt_files, "srt_filename")
if srts_df.duplicated(["season", "episode"]).any():
    raise Exception("Double season+episode in srt files")

# videos_df / srts_df keep season and episode as ordinary columns so they can be
# merged on them; only the merged table is indexed by (season, episode).
# The inner merge keeps only episodes that have both a video and a subtitle file.
episodes = pd.merge(videos_df, srts_df, on=["season", "episode"]).set_index(['season', 'episode'])


In [68]:
# Display the merged video/subtitle table, indexed by (season, episode).
episodes

Unnamed: 0_level_0,Unnamed: 1_level_0,video_filename,srt_filename
season,episode,Unnamed: 2_level_1,Unnamed: 3_level_1
26,1,./Data/synced/S26E01- Clown in the Dumps.mkv,./Data/synced/S26E01- Clown in the Dumps.srt
26,2,./Data/synced/S26E02- The Wreck of the Relatio...,./Data/synced/S26E02- The Wreck of the Relatio...
26,3,./Data/synced/S26E03- Super Franchise Me.mkv,./Data/synced/S26E03- Super Franchise Me.srt
26,4,./Data/synced/S26E04- Treehouse of Horror XXV.mkv,./Data/synced/S26E04- Treehouse of Horror XXV.srt
26,5,./Data/synced/S26E05- Opposites A-Frack.mkv,./Data/synced/S26E05- Opposites A-Frack.srt
26,6,./Data/synced/S26E06- Simpsorama.mkv,./Data/synced/S26E06- Simpsorama.srt
26,7,./Data/synced/S26E07- Blazed and Confused.mkv,./Data/synced/S26E07- Blazed and Confused.srt
26,8,./Data/synced/S26E08- Covercraft.mkv,./Data/synced/S26E08- Covercraft.srt
26,9,./Data/synced/S26E09- I Wont be Home for Chris...,./Data/synced/S26E09- I Wont be Home for Chris...
26,10,./Data/synced/S26E10- The Man Who Came to be D...,./Data/synced/S26E10- The Man Who Came to be D...


# Concatenating Segments

In [77]:
# NOTE(review): not referenced by any code visible in this notebook; create_movie
# builds its own working directory under the current directory instead.
TMP_DIR = "/tmp"

def create_movie(output_name, segments, movie_files_df):
    """Cut the requested segments out of their episodes and write an M3U playlist.

    A fresh working directory ``./<output_name>`` is created (an existing one is
    deleted first). Every segment is re-encoded with ffmpeg into ``<i>.mp4``
    inside that directory, and ``movie.m3u`` in the same directory lists
    ``opening.mp4`` (resolved against the current working directory) followed
    by all extracted clips. The clips are not concatenated into a single file;
    the playlist is the only artifact tying them together.

    Parameters
    ----------
    output_name : str
        Name of the working directory; also used as the title of the first
        playlist entry.
    segments : iterable of (season, episode, from_sec, to_sec, text)
        Clips to extract. ``from_sec`` / ``to_sec`` are passed to ffmpeg's
        ``-ss`` / ``-to`` options; ``text`` becomes the playlist entry title.
    movie_files_df : pandas.DataFrame
        Table indexed by ``(season, episode)`` with a ``video_filename`` column.

    Raises
    ------
    Exception
        If ``movie_files_df`` has no video file for a segment's season/episode,
        or if ffmpeg exits with a non-zero status.
    """
    tmp_dir = "./" + output_name
    # Start from an empty directory. No shell is involved, so unusual characters
    # in output_name cannot be interpreted as shell syntax.
    shutil.rmtree(tmp_dir, ignore_errors=True)
    os.makedirs(tmp_dir)

    out_segments = []

    for i, segment in enumerate(segments):
        season, episode, from_sec, to_sec, text = segment
        try:
            input_file = movie_files_df.loc[(season, episode), "video_filename"]
        except KeyError:
            # Unknown (season, episode): report it with the message below
            # instead of a bare KeyError.
            input_file = None
        if not input_file:
            raise Exception("no video file for season {} episode {}".format(season, episode))

        print("Extracting segment #{} ({}-{} in file #{})...".format(i, from_sec, to_sec, input_file))

        interim_output_file = "{}/{}.mp4".format(tmp_dir, i)

        # Argument list (no shell): file names with spaces, brackets or quotes
        # reach ffmpeg unchanged.
        ffmpeg_command = [
            "ffmpeg",
            "-i", input_file,
            "-ss", str(from_sec),
            "-to", str(to_sec),
            "-c:v", "libx264",
            "-c:a", "aac",
            "-strict", "experimental",
            "-b:a", "128k",
            "-pix_fmt", "yuv420p",
            "-vf", "scale=480:320",
            interim_output_file,
        ]
        return_code = subprocess.call(ffmpeg_command)
        if return_code != 0:
            # Fail here rather than emit a playlist that points at a missing clip.
            raise Exception("ffmpeg failed with exit code {} for segment #{}".format(return_code, i))

        out_segments.append([text, interim_output_file])

    # create M3U file
    print("Creating M3U file...")
    with open(tmp_dir + "/movie.m3u", "w") as f:
        f.write("#EXTM3U\n")
        f.write("#EXTINF:0,{}\n".format(output_name))
        # The trailing newline keeps the next #EXTINF entry on its own line.
        f.write(os.path.abspath("opening.mp4") + "\n")
        for text, filename in out_segments:
            # Titles must fit on one line, and a comma separates duration from
            # title in #EXTINF, so whitespace and commas are replaced.
            title = re.sub(r"\s", " ", text)
            title = re.sub(r",", ";", title)

            f.write("#EXTINF:0,{}\n".format(title))
            f.write(os.path.abspath(filename) + "\n")

# Analyzing SRTs - Building Segments

In [70]:
%run subtitles.py

srt_pds = []
for index, row in episodes.iterrows():
    episode_segments = pd.DataFrame(produce_dict(row.srt_filename))
    episode_segments['season'] = index[0]
    episode_segments['episode'] = index[1]
    
    srt_pds.append(episode_segments)
    
segments = pd.concat(srt_pds)
segments

Unnamed: 0,end,start,text,season,episode
0,12.410,10.377,(exclaiming),26,01
1,18.783,16.617,(school bell ringing),26,01
2,22.387,21.321,(barney belches),26,01
3,24.356,22.389,(whistle blows),26,01
4,26.591,24.358,(yells),26,01
5,32.030,30.531,(beeping),26,01
6,41.473,39.239,(playing the harp),26,01
7,45.377,43.877,(tires screeching),26,01
8,48.013,46.713,D'oh!,26,01
9,49.147,48.015,(tires screeching),26,01


In [71]:
def normalize_srt_text(text):
    """Reduce a subtitle line to a canonical form for fuzzy comparison.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is dropped, each run of whitespace becomes a single space, and
    leading/trailing whitespace is stripped so that lines such as "- Hello."
    and "hello" normalize to the same string.

    Parameters
    ----------
    text : str
        Raw subtitle text.

    Returns
    -------
    str
        The normalized text (possibly empty).
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)  # remove non alpha-numeric
    text = re.sub(r"\s+", " ", text)  # collapse sequences of whitespace
    # Removed punctuation can leave spaces at either end; strip them so they do
    # not count as mismatches when texts are compared.
    return text.strip()

# Add the normalized form of every subtitle line as a new column; this mutates
# the segments table in place, and search_closest_segment reads this column.
segments['norm_text'] = segments.text.apply(normalize_srt_text)


In [74]:
# Display the segments table, which now includes the norm_text column.
segments

Unnamed: 0,end,start,text,season,episode,norm_text
0,12.410,10.377,(exclaiming),26,01,exclaiming
1,18.783,16.617,(school bell ringing),26,01,school bell ringing
2,22.387,21.321,(barney belches),26,01,barney belches
3,24.356,22.389,(whistle blows),26,01,whistle blows
4,26.591,24.358,(yells),26,01,yells
5,32.030,30.531,(beeping),26,01,beeping
6,41.473,39.239,(playing the harp),26,01,playing the harp
7,45.377,43.877,(tires screeching),26,01,tires screeching
8,48.013,46.713,D'oh!,26,01,doh
9,49.147,48.015,(tires screeching),26,01,tires screeching


In [8]:
def search_closest_segment(segments, sentence):
    """Return the segment whose normalized text is most similar to ``sentence``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` between each row's
    ``norm_text`` and the normalized sentence. When several rows share the best
    score, the first one in table order is returned.

    Parameters
    ----------
    segments : pandas.DataFrame
        Table with a ``norm_text`` column. It is not modified.
    sentence : str
        Sentence to look for; it is normalized with normalize_srt_text first.

    Returns
    -------
    pandas.Series
        A copy of the best-matching row with an added ``score`` entry.

    Raises
    ------
    IndexError
        If ``segments`` has no rows.
    """
    if len(segments) == 0:
        raise IndexError("cannot search an empty segments table")

    norm_sentence = normalize_srt_text(sentence)

    # SequenceMatcher caches its analysis of the second sequence, so the fixed
    # sentence goes in seq2 once and only seq1 changes per row.
    matcher = difflib.SequenceMatcher(None)
    matcher.set_seq2(norm_sentence)

    def similarity(candidate):
        matcher.set_seq1(candidate)
        return matcher.ratio()

    scores = segments.norm_text.apply(similarity)

    # Positional argmax: row labels may repeat, so a label-based lookup could
    # return more than one row.
    best_pos = int(scores.to_numpy().argmax())
    best = segments.iloc[best_pos].copy()
    best["score"] = scores.iloc[best_pos]
    return best

def create_movie_orders(script):
    """Translate a script into a list of clips to extract.

    For every ``(speaker, sentence)`` pair in ``script`` the closest subtitle
    segment in the module-level ``segments`` table is looked up. The speaker is
    currently not used for the lookup.

    Returns a list of ``[season, episode, start, end, text]`` lists, one per
    script line and in script order.
    """
    def order_for(sentence):
        # TODO: segments[segments.speaker == speaker]
        match = search_closest_segment(segments, sentence)
        return [match.season, match.episode, match.start, match.end, match.text]

    return [order_for(sentence) for _speaker, sentence in script]
        

In [9]:
# Example lookup: find the segment closest to the given sentence.
search_closest_segment(segments, "good night everybody")

end                         895.504
start                       893.912
text         Good night, everybody.
season                            3
episode                          13
norm_text      good night everybody
score                             1
Name: 342, dtype: object

# Adding Speaker

In [93]:
# For each of the first ten segments, print the segment and the episode row
# (video and subtitle file names) it belongs to.
# .loc with a (season, episode) tuple is used because DataFrame.ix no longer
# exists in current pandas.
for index, row in segments[:10].iterrows():
    ep = episodes.loc[(row.season, row.episode)]
    print(row)
    print(ep)

end                 12.41
start              10.377
text         (exclaiming)
season                 26
episode                01
norm_text      exclaiming
Name: 0, dtype: object
video_filename    ./Data/synced/S26E01- Clown in the Dumps.mkv
srt_filename      ./Data/synced/S26E01- Clown in the Dumps.srt
Name: (26, 01), dtype: object
end                         18.783
start                       16.617
text         (school bell ringing)
season                          26
episode                         01
norm_text      school bell ringing
Name: 1, dtype: object
video_filename    ./Data/synced/S26E01- Clown in the Dumps.mkv
srt_filename      ./Data/synced/S26E01- Clown in the Dumps.srt
Name: (26, 01), dtype: object
end                    22.387
start                  21.321
text         (barney belches)
season                     26
episode                    01
norm_text      barney belches
Name: 2, dtype: object
video_filename    ./Data/synced/S26E01- Clown in the Dumps.mkv
srt_filena

In [75]:
# Script as [speaker, sentence] pairs.
# NOTE(review): this first script is overwritten by the assignment right below
# and is therefore never used; presumably kept for when lookups can be filtered
# by speaker — TODO confirm which script is intended.
movie_script = [
    ["bart", "what's for dinner?"],
    ["homer", "don't tell your mother but I brought you some pizza"],
    ["marge", "bart! you had pizza for lunch"],
    ["bart", "okay, no big deal"],
    ["homer", "bring forth all the cookies from the kitchen"],
    ["lisa", "ah, okay"],
    ["lisa", "well, are you going to help me or what?!"],
    ["lisa", "Thank You!"],
    ["homer", "I'm going to sleep"],
]


# The script that is actually used; speakers are left empty.
movie_script = [
    ["", "I'm tired"],
    ["", "Let's go to sleep"],
    ["", "Is it working?"],
    ["", "Really??"],
    ["", "Great!!"],
    ["", "Let's have another beer!"],
]

# Resolve every script line to [season, episode, start, end, text].
movie_orders = create_movie_orders(movie_script)

In [78]:
# Extract the clips into ./dummy_movie and write ./dummy_movie/movie.m3u.
create_movie("dummy_movie", movie_orders, episodes)

Extracting segment #0 (672.305-673.338 in file #./Data/synced/S26E03- Super Franchise Me.mkv)...
Extracting segment #1 (620.654-622.587 in file #./Data/synced/S26E11- Barts New Friend.mkv)...
Extracting segment #2 (87.52-88.644 in file #./Data/synced/S26E12- The Musk who Fell to Earth.mkv)...
Extracting segment #3 (206.541-207.807 in file #./Data/synced/The Simpsons S27E04 [720p] ~{KiNg}.mkv)...
Extracting segment #4 (363.496-364.962 in file #./Data/synced/The Simpsons S27E09 [720p] ~{KiNg}.mkv)...
Extracting segment #5 (189.921-194.294 in file #./Data/synced/S26E15- The Princess Guide.mkv)...
Creating M3U file...
