# General setup

In [45]:
import glob
import os
import re
import shlex
import shutil
import subprocess

import numpy as np
import pandas as pd

import moviepy
from moviepy.editor import *

# Directory that is searched for the episode files (*.mkv) and the subtitle
# files (*.srt) used throughout this notebook.
MOVIES_PATH = "./data"


In [2]:
# Season and episode numbers are parsed out of file names shaped like
# "The.Simpsons.S11E05.<anything>". Raw string: "\d" must reach the regex
# engine untouched instead of being read as an (invalid) string escape.
MOVIE_REGEX = r'.*The[.]Simpsons[.]S(?P<season>\d\d)E(?P<episode>\d\d)[.].*'

# Sorted so that the row labels of movie_files_df (used as file ids by
# create_movie) do not depend on the arbitrary order glob returns.
movie_files = sorted(glob.glob(MOVIES_PATH+"/*.mkv"))

# re.search returns None for a file name that does not follow the pattern;
# such files are left out rather than crashing on None.group(...).
movie_matches = [
    m for m in (re.search(MOVIE_REGEX, f) for f in movie_files) if m is not None
]

# One row per recognised episode file: full path, season and episode
# (season/episode are kept as the two-digit strings found in the name).
movie_files_df = pd.DataFrame(
    [[m.group(0), m.group("season"), m.group("episode")] for m in movie_matches],
    columns=["filename", "season", "episode"]
)

# Written next to the movies; MOVIES_PATH replaces the hard-coded "data/".
movie_files_df.to_csv(MOVIES_PATH + "/movie_files.csv")

In [3]:
# Per-season row counts: how many files (and episode values) were found for each season.
movie_files_df.groupby("season").count()

Unnamed: 0_level_0,filename,episode
season,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,1
2,1,1
3,1,1
6,3,3
7,1,1
8,1,1
11,22,22
14,1,1
16,1,1
17,5,5


# Concatenating Segments

In [4]:
# Parent directory for the per-movie scratch directories made by create_movie.
TMP_DIR = "/tmp"

def create_movie(output_name, segments, movie_files_df=movie_files_df):
    """Cut segments out of the source movies and join them into one file.

    Each segment is extracted with ffmpeg into a scratch directory under
    ``TMP_DIR`` (named after the base name of ``output_name``), then all
    extracted pieces are joined with ffmpeg's concat demuxer.

    Parameters
    ----------
    output_name : str
        Path of the movie to create.
    segments : iterable of (file_id, from_sec, to_sec)
        ``file_id`` is a row label of ``movie_files_df``; ``from_sec`` and
        ``to_sec`` are handed to ffmpeg's ``-ss`` and ``-to`` options.
    movie_files_df : pandas.DataFrame
        Table with a ``filename`` column holding the source movie paths.

    Raises
    ------
    ValueError
        If ``output_name`` has no usable base name for the scratch directory.
    subprocess.CalledProcessError
        If any ffmpeg invocation exits with a non-zero status.
    """
    # Only the base name is used for the scratch directory so that the
    # directory that gets wiped below is always a direct child of TMP_DIR.
    scratch_name = os.path.basename(output_name)
    if scratch_name in ("", ".", ".."):
        raise ValueError("output_name must end in a file name: {!r}".format(output_name))
    tmp_dir = os.path.join(TMP_DIR, scratch_name)

    # Start from an empty scratch directory (replaces `rm -fr` + `mkdir`).
    if os.path.isdir(tmp_dir) and not os.path.islink(tmp_dir):
        shutil.rmtree(tmp_dir)
    elif os.path.lexists(tmp_dir):
        os.remove(tmp_dir)
    os.makedirs(tmp_dir)

    list_file = os.path.join(tmp_dir, "list")
    with open(list_file, 'w') as f:
        for i, (file_id, from_sec, to_sec) in enumerate(segments):
            print("Extracting segment #{} ({}-{} in file #{})...".format(i, from_sec, to_sec, file_id))

            # .loc is the label-based replacement for the removed DataFrame.ix
            input_file = movie_files_df.loc[file_id].filename
            output_file = os.path.join(tmp_dir, "{}.mkv".format(i))

            # The concat list uses single-quoted paths; a literal quote inside
            # the path is written as '\'' so unusual names stay intact.
            f.write("file '{}'\n".format(output_file.replace("'", "'\\''")))

            # Argument list, no shell: file names are never parsed as shell
            # syntax. check_call raises if ffmpeg reports a failure.
            subprocess.check_call([
                "ffmpeg", "-i", input_file,
                "-ss", str(from_sec), "-to", str(to_sec),
                "-async", "1", "-strict", "-2",
                output_file,
            ])

    print("Concatenating all files...")
    # "-safe 0" lets the concat demuxer accept the absolute paths in the list.
    subprocess.check_call([
        "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file,
        "-async", "1", "-strict", "-2",
        output_name,
    ])
    print("Done. {} was created".format(output_name))

In [94]:
# Each segment is [file_id, from_sec, to_sec]: the row label of the source
# file in movie_files_df, then the start and end positions given to ffmpeg.
segments = [[0,100,102], [1,500,503], [2,700,704]]

# Extract the three segments and join them into test.mkv.
create_movie("test.mkv", segments)

Extracting segment #0 (100-102 in file #0)...
Extracting segment #1 (500-503 in file #1)...
Extracting segment #2 (700-704 in file #2)...
Concatenating all files...
Done. test.mkv was created


# Analyzing SRTs

In [5]:
from difflib import SequenceMatcher
# Quick check of SequenceMatcher.ratio() on two sentences that differ by a
# single character; the first argument (None) means no "junk" filter is used.
SequenceMatcher(None, 'the cat is going home', 'the cats is going home').ratio()

0.9767441860465116

In [54]:
%run subtitles.py

srt_pds = []
srt_files = glob.glob(MOVIES_PATH+"/*.srt")
for srt in srt_files:
    d = produce_dict(srt)
    srt_pds.append(pd.DataFrame(d))

all_srts = pd.concat(srt_pds)

In [57]:
# Preview the first ten subtitle rows.
all_srts.head(10)


Unnamed: 0,end,filename,start,text
0,5.218,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,2.152,The Simpsons S19E01 (JABF20)\nHe Loves to Fly ...
1,9.521,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,5.756,I will not wait 20 years\nto make another movie
2,44.483,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,42.653,My summer love.
3,54.146,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,52.853,"All right,"
4,55.798,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,54.147,next on my shoping list...
5,58.7,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,56.511,a new phone.
6,61.486,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,59.894,"Attention, shoppers,"
7,63.516,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,61.487,iPhones now 20 cents.
8,66.254,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,63.517,What happened to that mini-cell\nphone I gave ...
9,69.24,./data/The.Simpsons.S19E01.PROPER.PDTV.XviD-Ye...,66.257,"Ooh, I thought that\nwas a lemon drop."
