In [105]:
import pandas as pd
import numpy as np
import pysrt
import re

In [286]:
# Project locations; every other path is derived from BASE_DIR.
BASE_DIR = 'T:/pycharm_repo/Working_dir/Deepdub'
OUTPUT_DIR = BASE_DIR
SAMPLE_DIR = f"{BASE_DIR}/tale_of_nine_tailed"
SUBTITLE_DIR = f"{SAMPLE_DIR}/subtitles/"

In [287]:
# Parse the English episode-1 subtitles; later cells index, slice and iterate `subs`.
subs = pysrt.open(f"{SUBTITLE_DIR}ep1_eng.srt")
subs[0]

<pysrt.srtitem.SubRipItem at 0x2bc390a95e0>

# Exploration 
1. The subtitles contain text like `...`.
   - For example: 
   ```
   0: When a fox turns 100,
   1: it can transform into   
   a beautiful woman...
   ```
2. Text contains `\n`, which breaks a cue into two lines because viewers can't read one long line of subtitle.
   - For example, the second cue has a `\n` after `into`: 
   ```
    0: When a fox turns 100,
    1: it can transform into   
    a beautiful woman...
   ```  

   ```
   subs[1] 'it can transform into\na beautiful woman...'
   ```    
      
3. Currently, preprocessing creates sentences from cues in which one sentence ends and another begins within the same subtitle view.

    In that case we can't know exactly when the first sentence ended and the next began: both appear on screen at once, and the subtitle file gives them a single shared start and end time. For instance (the first line is spoken by Jo Bo-ah, the second by Kim Bum):
    
    ```
    698
    00:58:49,825 --> 00:58:51,924
    - Did you miss me? - As if, brother.
    ```

    <br>**-** generally means there are two **different people speaking at that moment**. So we shouldn't remove `-` during preprocessing, and should split the sentences on it; **the problem is knowing when one sentence ended and the next began**.
    
    
4. Sometimes subs also translate what is visually written **but not spoken** inside brackets, like:

    ```
    285
    00:25:34,433 --> 00:25:38,902
    (Crackdown on undocumented spirits are being enforced.)

    286
    00:25:38,902 --> 00:25:41,973
    (Afterlife Immigration Office)
    ```
    
    <br>Or they explain some terms (in the Hindi subtitles; not found in the English ones):
    
    ```
    247
    00:26:31,470 --> 00:26:35,000
    वाह, क्या नज़ारा है की सर्वशक्तिमान ताल उइ पा भी कंप्युटर इस्तेमाल करने के लिए जूझ रही हैं। <i>(ताल ई पा: भगवान जो 'सामदोचोन', ज़िंदगी और मौत के बीच की जगह पर नजर रखती हैं।)</i>
    ```
    

In [288]:
# Show the first cue, then its start and end as "<seconds>:<milliseconds>".
# NOTE(review): these look like the seconds and milliseconds *fields* of the
# timestamp (00:00:44,172 -> 44:172), so hours and minutes are not shown — confirm.
print(subs[0])
print(f"Starting time -> {subs[0].start.seconds}:{subs[0].start.milliseconds}")
print(f"Ending time   -> {subs[0].end.seconds}:{subs[0].end.milliseconds}")

1
00:00:44,172 --> 00:00:45,973
When a fox turns 100,

Starting time -> 44:172
Ending time   -> 45:973


In [289]:
subs[0].text

'When a fox turns 100,'

In [290]:
subs[0].start

SubRipTime(0, 0, 44, 172)

In [291]:
for i, sub in enumerate(subs[:5]):
  print(f"{i}:{sub.text}")

0:When a fox turns 100,
1:it can transform into a beautiful woman...
2:or become a man who has relations with one.
3:However, a fox of 1,000 years receives the sky's blessing...
4:and becomes a celestial fox.


In [292]:
print(subs[697])

698
00:58:49,825 --> 00:58:51,924
- Did you miss me? - As if, brother.



In [293]:
# str() of a pysrt timestamp is "HH:MM:SS,mmm". The format has no date part,
# so pandas fills in its default date (1900-01-01).
pd.to_datetime(str(subs[0].start), format="%H:%M:%S,%f")

Timestamp('1900-01-01 00:00:44.172000')

In [294]:
# Same conversion for every cue at once. The timestamps are taken from `subs`
# directly: `subs_df` is only built in the preprocessing section below, so
# referring to it here fails on a fresh kernel (Restart & Run All).
pd.to_datetime(pd.Series([str(sub.start) for sub in subs]), format="%H:%M:%S,%f")[1]

Timestamp('1900-01-01 00:00:45.973000')

## Preprocessing and removing

In [295]:
# Literal (non-regex) substitutions applied to every cue in a single pass:
#   "|" and newline -> space, "..." -> ";".
# The keys are escaped so their regex metacharacters match literally. A real
# regex such as r"\((.*)\)" must therefore not be listed here: it would be
# escaped too and could only match that literal text. Bracketed text is
# stripped with a proper regex in a later cell instead.
replacements = {"|": " ", "\n": " ", "...": ";"}
pattern = re.compile("|".join(re.escape(literal) for literal in replacements))

subs_df = pd.DataFrame([[sub.start, sub.end,
                         pattern.sub(lambda m: replacements[m.group(0)], sub.text)] for sub in subs],
                       columns=["start", "end", "text"])

# Point 4: on-screen text in brackets is still present at this stage
subs_df[282:287]

Unnamed: 0,start,end,text
282,"00:25:10,372","00:25:12,102",- Okay? - Okay!
283,"00:25:29,693","00:25:34,062",(Afterlife Immigration Office)
284,"00:25:34,433","00:25:38,902",(Crackdown on undocumented spirits are being e...
285,"00:25:38,902","00:25:41,973",(Afterlife Immigration Office)
286,"00:26:08,632","00:26:10,193","How have you been, Granny?"


In [296]:
pd.util.hash_pandas_object(subs_df["text"], index=False)

0      11666022566518399743
1      17960271684186646883
2      15748956420119968844
3       1316365539741181613
4      11654452799880118098
               ...         
756     4454171381933255613
757    13182271712768067078
758    14200525708433450145
759    14947191855219668614
760      436339418285105795
Length: 761, dtype: uint64

In [297]:
# Parse the "HH:MM:SS,mmm" start times into pandas datetimes, overwriting the
# column (the date part defaults to 1900-01-01).
subs_df["start"] = pd.to_datetime(subs_df["start"], format="%H:%M:%S,%f")
subs_df.head()

Unnamed: 0,start,end,text
0,1900-01-01 00:00:44.172,"00:00:45,973","When a fox turns 100,"
1,1900-01-01 00:00:45.973,"00:00:48,112",it can transform into a beautiful woman;
2,1900-01-01 00:00:48.743,"00:00:51,642",or become a man who has relations with one.
3,1900-01-01 00:00:52.412,"00:00:55,153","However, a fox of 1,000 years receives the sky..."
4,1900-01-01 00:00:55.153,"00:00:56,522",and becomes a celestial fox.


In [298]:
# Parse the end times the same way, then move both timestamp columns into a
# (start, end) MultiIndex so that `text` is the only remaining column.
# NOTE: this cell cannot be re-run as is. After set_index the "end" column no
# longer exists, so a second run raises KeyError.
subs_df["end"]  = pd.to_datetime(subs_df["end"], format="%H:%M:%S,%f")
subs_df.set_index(["start", "end"], inplace=True)
subs_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 00:00:44.172,1900-01-01 00:00:45.973,"When a fox turns 100,"
1900-01-01 00:00:45.973,1900-01-01 00:00:48.112,it can transform into a beautiful woman;
1900-01-01 00:00:48.743,1900-01-01 00:00:51.642,or become a man who has relations with one.
1900-01-01 00:00:52.412,1900-01-01 00:00:55.153,"However, a fox of 1,000 years receives the sky..."
1900-01-01 00:00:55.153,1900-01-01 00:00:56.522,and becomes a celestial fox.


In [299]:
subs_df.shape

(761, 1)

In [300]:
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
# A cue that is only bracketed text becomes the empty string.
re.sub(r"\((.*)\)", "", subs_df.iloc[283].text)

''

In [301]:
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
# A cue without brackets is left unchanged.
print(re.sub(r"\((.*)\)","", subs_df.iloc[286].text))

How have you been, Granny?


In [302]:
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
# Only the bracketed explanation is removed; the <i> tag in front of it stays.
print(re.sub(r"\((.*)\)", "",
             "वाह, क्या नज़ारा है की सर्वशक्तिमान ताल उइ पा भी कंप्युटर इस्तेमाल करने के लिए जूझ रही हैं। <i>(ताल ई पा: भगवान जो 'सामदोचोन', ज़िंदगी और मौत के बीच की जगह पर नजर रखती हैं।)"))

वाह, क्या नज़ारा है की सर्वशक्तिमान ताल उइ पा भी कंप्युटर इस्तेमाल करने के लिए जूझ रही हैं। <i>


In [303]:
subs_df.iloc[282:287]

Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 00:25:10.372,1900-01-01 00:25:12.102,- Okay? - Okay!
1900-01-01 00:25:29.693,1900-01-01 00:25:34.062,(Afterlife Immigration Office)
1900-01-01 00:25:34.433,1900-01-01 00:25:38.902,(Crackdown on undocumented spirits are being e...
1900-01-01 00:25:38.902,1900-01-01 00:25:41.973,(Afterlife Immigration Office)
1900-01-01 00:26:08.632,1900-01-01 00:26:10.193,"How have you been, Granny?"


In [304]:
# Preview bracket removal on the sample rows. The pattern is a raw string
# ("\(" is an invalid escape in a normal literal) and the vectorised
# `.str.replace` is used instead of the deprecated `DataFrame.applymap`.
subs_df[282:287]["text"].str.replace(r"\((.*)\)", "", regex=True).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 00:25:10.372,1900-01-01 00:25:12.102,- Okay? - Okay!
1900-01-01 00:25:29.693,1900-01-01 00:25:34.062,
1900-01-01 00:25:34.433,1900-01-01 00:25:38.902,
1900-01-01 00:25:38.902,1900-01-01 00:25:41.973,
1900-01-01 00:26:08.632,1900-01-01 00:26:10.193,"How have you been, Granny?"


In [305]:
# Not what we want: a cue may contain spoken text next to a bracketed part.
# NOTE(review): with regex=True and a non-string replacement, pandas appears to
# replace the whole cell with NaN as soon as the pattern matches anywhere in
# it, so such a cue would be dropped entirely — confirm against the docs.
subs_df[282:287][["text"]].replace(r'\((.*)\)', np.nan, regex=True).dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 00:25:10.372,1900-01-01 00:25:12.102,- Okay? - Okay!
1900-01-01 00:26:08.632,1900-01-01 00:26:10.193,"How have you been, Granny?"


In [306]:
# Strip the bracketed text first, then mark cues that became empty as NaN and
# drop them. Cues that still have text after stripping are kept.
# `.str.replace` is used instead of the deprecated `DataFrame.applymap`.
subs_df[282:287]["text"].str.replace(r"\((.*)\)", "", regex=True).replace("", np.nan).dropna().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 00:25:10.372,1900-01-01 00:25:12.102,- Okay? - Okay!
1900-01-01 00:26:08.632,1900-01-01 00:26:10.193,"How have you been, Granny?"


In [307]:
# Apply the clean-up for real: strip bracketed (on-screen, not spoken) text,
# then drop the cues that are empty afterwards.
# `.str.replace` is used instead of the deprecated `DataFrame.applymap`, and
# `dropna()` returns a new frame instead of mutating in place.
subs_df["text"] = subs_df["text"].str.replace(r"\((.*)\)", "", regex=True).replace("", np.nan)
subs_df = subs_df.dropna()
print(subs_df.shape)
subs_df.head()

(734, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 00:00:44.172,1900-01-01 00:00:45.973,"When a fox turns 100,"
1900-01-01 00:00:45.973,1900-01-01 00:00:48.112,it can transform into a beautiful woman;
1900-01-01 00:00:48.743,1900-01-01 00:00:51.642,or become a man who has relations with one.
1900-01-01 00:00:52.412,1900-01-01 00:00:55.153,"However, a fox of 1,000 years receives the sky..."
1900-01-01 00:00:55.153,1900-01-01 00:00:56.522,and becomes a celestial fox.


In [308]:
subs_df[270:280]

Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 00:24:47.443,1900-01-01 00:24:50.483,"Besides, humans die too early to properly have..."
1900-01-01 00:24:57.152,1900-01-01 00:24:58.763,"I'm saying that life is short,"
1900-01-01 00:24:59.223,1900-01-01 00:25:01.433,so try your best in life;
1900-01-01 00:25:01.592,1900-01-01 00:25:03.763,but know to let go when something's too unbear...
1900-01-01 00:25:04.733,1900-01-01 00:25:07.062,"It extends to people, love,"
1900-01-01 00:25:08.503,1900-01-01 00:25:09.533,and so on and so forth.
1900-01-01 00:25:10.372,1900-01-01 00:25:12.102,- Okay? - Okay!
1900-01-01 00:26:08.632,1900-01-01 00:26:10.193,"How have you been, Granny?"
1900-01-01 00:26:24.582,1900-01-01 00:26:26.342,You haven't been around much.
1900-01-01 00:26:27.513,1900-01-01 00:26:29.213,"I've been busy, thanks to you."


In [309]:
subs_df.iloc[693:698]

Unnamed: 0_level_0,Unnamed: 1_level_0,text
start,end,Unnamed: 2_level_1
1900-01-01 01:01:18.644,1900-01-01 01:01:19.745,"Shin Ju, is that you?"
1900-01-01 01:01:21.814,1900-01-01 01:01:23.044,Why are all the doors open?
1900-01-01 01:01:23.944,1900-01-01 01:01:25.084,Did you miss me?
1900-01-01 01:01:25.084,1900-01-01 01:01:26.584,"- As if, brother. - Brother?"
1900-01-01 01:01:26.584,1900-01-01 01:01:29.155,"It's a long story, but the family has a dirty ..."


In [310]:
# Peek at what iterrows() yields, stopping after the first row: `sub` is a
# Series whose only label is "text"; the row's index value arrives separately
# as `index`.
count = 0
for index, sub in subs_df.iterrows():
  if count == 1:
    break
  count += 1
  print(sub.index)

Index(['text'], dtype='object')


In [311]:
# Merge consecutive cues into sentences. A sentence is closed by the first cue
# whose text contains '.', '?' or '!' anywhere; it runs from the start of its
# first cue to the end of its last one.
sentence = ""
result = []

for (cue_start, cue_end), text in subs_df["text"].items():
  if sentence == "":
    start = cue_start
  sentence = text if sentence == "" else sentence + " " + text
  end = cue_end
  if re.search('[.?!]', text):
    result.append((start, end, sentence))
    sentence = ""

# Cues after the last terminator would otherwise be dropped silently.
if sentence:
  result.append((start, end, sentence))
  sentence = ""

In [316]:
# One row per merged sentence, indexed by a hash of the sentence text
# (index=False: only the text is hashed, not the row position).
# NOTE(review): two identical sentences would presumably get the same hash and
# therefore a duplicate index label — confirm this is acceptable downstream.
sentence_df = pd.DataFrame(result, columns=["start", "end", "sentence"])
sentence_df.index = pd.util.hash_pandas_object(sentence_df["sentence"], index=False)
sentence_df.head()

Unnamed: 0,start,end,sentence
1424207769709898913,1900-01-01 00:00:44.172,1900-01-01 00:00:51.642,"When a fox turns 100, it can transform into a ..."
13755835859204238867,1900-01-01 00:00:52.412,1900-01-01 00:00:56.522,"However, a fox of 1,000 years receives the sky..."
74703595880608168,1900-01-01 00:00:57.083,1900-01-01 00:01:02.393,Its abilities match that of a powerful shaman;...
13894478965224293292,1900-01-01 00:01:10.403,1900-01-01 00:01:15.702,"I was going to wait until we get home, but her..."
11229840987905826232,1900-01-01 00:01:27.683,1900-01-01 00:01:29.013,Do you like your gift?


In [317]:
pd.DataFrame(result, columns=["start", "end", "sentence"]).set_index(["start", "end"]).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence
start,end,Unnamed: 2_level_1
1900-01-01 00:00:44.172,1900-01-01 00:00:51.642,"When a fox turns 100, it can transform into a ..."
1900-01-01 00:00:52.412,1900-01-01 00:00:56.522,"However, a fox of 1,000 years receives the sky..."
1900-01-01 00:00:57.083,1900-01-01 00:01:02.393,Its abilities match that of a powerful shaman;...
1900-01-01 00:01:10.403,1900-01-01 00:01:15.702,"I was going to wait until we get home, but her..."
1900-01-01 00:01:27.683,1900-01-01 00:01:29.013,Do you like your gift?


In [318]:
sentence_df.iloc[2].sentence

'Its abilities match that of a powerful shaman; which allows it to see what are miles ahead.'

In [319]:
sentence_df.iloc[658].sentence

'Do you really want to see the world I live in?'

In [320]:
sentence_df.iloc[658].end - sentence_df.iloc[658].start

Timedelta('0 days 00:00:02.530000')

In [321]:
# Point no 4. anything inside () or subs with only () has been removed
sentence_df.iloc[240:250]

Unnamed: 0,start,end,sentence
3856983730943893837,1900-01-01 00:24:57.152,1900-01-01 00:25:03.763,"I'm saying that life is short, so try your bes..."
7946187886336389767,1900-01-01 00:25:04.733,1900-01-01 00:25:09.533,"It extends to people, love, and so on and so f..."
7314675150750314319,1900-01-01 00:25:10.372,1900-01-01 00:25:12.102,- Okay? - Okay!
9651332039963499136,1900-01-01 00:26:08.632,1900-01-01 00:26:10.193,"How have you been, Granny?"
6533484677845535548,1900-01-01 00:26:24.582,1900-01-01 00:26:26.342,You haven't been around much.
9259619257072481335,1900-01-01 00:26:27.513,1900-01-01 00:26:29.213,"I've been busy, thanks to you."
14438434740191741987,1900-01-01 00:26:31.523,1900-01-01 00:26:34.892,Never did I imagine; seeing the great Taluipa ...
12587812263603286725,1900-01-01 00:26:35.122,1900-01-01 00:26:37.693,What can I do when the world has changed?
15339389982142967231,1900-01-01 00:26:39.493,1900-01-01 00:26:41.793,The Afterlife should have a five-day workweek ...
8051282188957687221,1900-01-01 00:26:43.362,1900-01-01 00:26:44.902,Did you get my text?


In [322]:
sentence_df.iloc[604:610]

Unnamed: 0,start,end,sentence
13065545797657020242,1900-01-01 00:59:08.774,1900-01-01 00:59:10.644,This is why kids must be disciplined.
9088408106118007737,1900-01-01 00:59:37.374,1900-01-01 00:59:39.345,How many people have you killed?
8039897433280513526,1900-01-01 00:59:39.345,1900-01-01 00:59:41.044,Are you worried I'll be sucked into the Underw...
11069143831905122878,1900-01-01 00:59:42.274,1900-01-01 00:59:43.385,"No, it's because you embarrass me."
8705082159783812700,1900-01-01 00:59:44.115,1900-01-01 00:59:46.345,This is because I don't want a pathetic life l...
4097597925988755082,1900-01-01 00:59:46.345,1900-01-01 00:59:48.984,What's more pathetic is a grown man who whines.


# Export to file

In [4]:
%%writefile "sentences.py"
import pysrt
import pandas as pd
import re


class Sentences:
  """Merge the cues of an SRT subtitle file into whole sentences.

  After construction, ``sentence_df`` holds one row per sentence: the start
  time of its first cue, the end time of its last cue and the joined text.
  Start and end are pandas datetimes parsed from "HH:MM:SS,mmm".

  Parameters
  ----------
  file : path of the ``.srt`` file, passed to ``pysrt.open``.
  hashed : if True (default), the frame is indexed by a hash of each sentence
    and keeps ``start``/``end`` as columns; if False, it is indexed by the
    ``(start, end)`` pair instead.
  """

  # Literal substitutions applied to every cue text in a single pass.
  _REPLACEMENTS = {"|": " ", "\n": " ", "...": ";"}
  # One innermost "(...)" group; applied repeatedly to also clear nested groups.
  _BRACKETED = re.compile(r"\([^()]*\)")
  # Layout of str(<pysrt timestamp>).
  _TIME_FORMAT = "%H:%M:%S,%f"

  def __init__(self, file, hashed=True):
    # __init__ must not return a value (doing so raises TypeError); the result
    # is exposed through the `sentence_df` attribute only.
    self.subs = pysrt.open(file)
    self.sentence_df = self.__to_sentences(self.__regex(), hashed)

  @staticmethod
  def _strip_bracketed(text):
    """Remove every ``(...)`` group from ``text``, innermost groups first."""
    previous = None
    while previous != text:
      previous = text
      text = Sentences._BRACKETED.sub("", text)
    return text

  def __regex(self, subs=None):
    """Return the cleaned cues as a frame with columns start, end, text.

    ``subs`` is any iterable of cues with ``start``, ``end`` and ``text``
    attributes; it defaults to ``self.subs``. Start and end are stored as
    their string form. Bracketed text (on-screen captions, glossary notes) is
    removed without touching spoken text around it, whitespace is collapsed,
    and cues that end up empty are dropped.
    """
    if subs is None:
      subs = self.subs

    replacements = self._REPLACEMENTS
    # Escape the keys so "|" and "..." are matched literally, not as regex.
    pattern = re.compile("|".join(re.escape(literal) for literal in replacements))

    subs_df = pd.DataFrame(
      [[str(sub.start), str(sub.end),
        pattern.sub(lambda m: replacements[m.group(0)], sub.text)]
       for sub in subs],
      columns=["start", "end", "text"])

    cleaned = (subs_df["text"]
               .map(self._strip_bracketed)
               .map(lambda text: " ".join(text.split())))
    subs_df["text"] = cleaned
    return subs_df[cleaned != ""].reset_index(drop=True)

  def __to_sentences(self, subs_df, hashed):
    """Join consecutive cues of ``subs_df`` into sentences.

    A sentence is closed by the first cue whose text contains '.', '?' or '!'.
    Cues left over after the last terminator form a final sentence instead of
    being discarded. See the class docstring for the effect of ``hashed``.
    """
    sentence = ""
    result = []
    start = end = None

    for cue in subs_df.itertuples(index=False):
      if sentence == "":
        start = cue.start
      sentence = cue.text if sentence == "" else sentence + " " + cue.text
      end = cue.end
      if re.search('[.?!]', cue.text):
        result.append((start, end, sentence))
        sentence = ""

    if sentence:
      result.append((start, end, sentence))

    sentence_df = pd.DataFrame(result, columns=["start", "end", "sentence"])
    # Parse the times for both index layouts so they hold the same types.
    for column in ("start", "end"):
      sentence_df[column] = pd.to_datetime(sentence_df[column], format=self._TIME_FORMAT)

    if hashed:
      # Set index as the hash of sentence.
      sentence_df.index = pd.util.hash_pandas_object(sentence_df["sentence"], index=False)
    else:
      # Otherwise set "start" and "end" as index for easy indexing
      sentence_df = sentence_df.set_index(["start", "end"])

    return sentence_df

Overwriting sentences.py
