In [2]:
import pandas as pd
import pysrt
import re

In [3]:
BASE_DIR = 'T:/pycharm_repo/Working_dir/Deepdub'
OUTPUT_DIR = BASE_DIR
SAMPLE_DIR = BASE_DIR + '/tale_of_nine_tailed'
SUBTITLE_DIR = SAMPLE_DIR + '/subtitles/'

In [22]:
subs = pysrt.open(SUBTITLE_DIR + 'ep1_eng.srt')
subs[0]

<pysrt.srtitem.SubRipItem at 0x2bc30591190>

# Exploration 
1. The subtitles contains, text like `...`.
   - For example: 
   ```
   0: When a fox turns 100,
   1: it can transform into   
   a beautiful woman...
   ```
2. Text contains `\n` used to add newline as viewer can't read a long line of subtitle
   - For example in second line insert a `\n` after `into`: 
   ```
    0: When a fox turns 100,
    1: it can transform into   
    a beautiful woman...
   ```  

   ```
   subs[1] 'it can transform into\na beautiful woman...'
   ```    
      
3. Currently preprocessing creates sentences where one end in single view of subtitle and another sentence begin in same view.

    This way we can't really know when exactly did the sentence ended and next began, as both came in view at once and in subtitles both has a single starting and ending point. Like for instance (First is spoken by Jo Bo-ah second by Kim Bum):
    
    ```
    698
    00:58:49,825 --> 00:58:51,924
    - Did you miss me? - As if, brother.
    ```

    <br>**-** generally means there are two **Different people speaking at that moment. **So, we shouldn't remove `-` during preprocessing and split sentences, but when did one sentence end and another began is problem**.
    
    
4. Sometimes subs also translate what is visually written **but not spoken** inside brackets, like:

    ```
    85
    00:25:34,433 --> 00:25:38,902
    (Crackdown on undocumented spirits are being enforced.)

    286
    00:25:38,902 --> 00:25:41,973
    (Afterlife Immigration Office)
    ```
    
    <br>Or explaining some terms (In Hindi subtitles; not found in English one):
    
    ```
    247
    00:26:31,470 --> 00:26:35,000
    वाह, क्या नज़ारा है की सर्वशक्तिमान ताल उइ पा भी कंप्युटर इस्तेमाल करने के लिए जूझ रही हैं। <i>(ताल ई पा: भगवान जो 'सामदोचोन', ज़िंदगी और मौत के बीच की जगह पर नजर रखती हैं।)</i>
    ```
    

In [24]:
print(subs[0])
print(f"Starting time -> {subs[0].start.seconds}:{subs[0].start.milliseconds}")
print(f"Ending time   -> {subs[0].end.seconds}:{subs[0].end.milliseconds}")

1
00:00:44,172 --> 00:00:45,973
When a fox turns 100,

Starting time -> 44:172
Ending time   -> 45:973


In [25]:
subs[0].text

'When a fox turns 100,'

In [26]:
subs[0].start

SubRipTime(0, 0, 44, 172)

In [27]:
for i, sub in enumerate(subs[:5]):
  print(f"{i}:{sub.text}")

0:When a fox turns 100,
1:it can transform into a beautiful woman...
2:or become a man who has relations with one.
3:However, a fox of 1,000 years receives the sky's blessing...
4:and becomes a celestial fox.


In [30]:
print(subs[697])

698
00:58:49,825 --> 00:58:51,924
- Did you miss me? - As if, brother.



## Preprocessing and removing

In [87]:
# Replace using regex without writing regex expression
replacements = {"|": " ", "\n": " ", "...": ";", "\((.*)\)": ""}
replacements = dict((re.escape(k), v) for k, v in replacements.items())
pattern = re.compile("|".join(replacements.keys()))

subs_df = pd.DataFrame([[sub.start, sub.end, 
                         pattern.sub(lambda m: replacements[re.escape(m.group(0))], sub.text)] for sub in subs],
                       columns=["start", "end", "text"]).apply(
                      lambda row: re.sub("\((.*)\)", "", row.text) , axis=1).dropna()

# Point 4
subs_df[282:287]

Unnamed: 0,start,end,text
282,"00:25:10,372","00:25:12,102",- Okay? - Okay!
283,"00:25:29,693","00:25:34,062",(Afterlife Immigration Office)
284,"00:25:34,433","00:25:38,902",(Crackdown on undocumented spirits are being e...
285,"00:25:38,902","00:25:41,973",(Afterlife Immigration Office)
286,"00:26:08,632","00:26:10,193","How have you been, Granny?"


In [65]:
re.sub("\((.*)\)", "", subs_df.iloc[283].text)

''

In [67]:
print(re.sub("\((.*)\)","", subs_df.iloc[286].text))

How have you been, Granny?


In [75]:
print(re.sub("\((.*)\)", "", 
             "वाह, क्या नज़ारा है की सर्वशक्तिमान ताल उइ पा भी कंप्युटर इस्तेमाल करने के लिए जूझ रही हैं। <i>(ताल ई पा: भगवान जो 'सामदोचोन', ज़िंदगी और मौत के बीच की जगह पर नजर रखती हैं।)"
            ))

वाह, क्या नज़ारा है की सर्वशक्तिमान ताल उइ पा भी कंप्युटर इस्तेमाल करने के लिए जूझ रही हैं। <i>


In [70]:
subs_df.iloc[282:287]

Unnamed: 0,start,end,text
282,"00:25:10,372","00:25:12,102",- Okay? - Okay!
283,"00:25:29,693","00:25:34,062",(Afterlife Immigration Office)
284,"00:25:34,433","00:25:38,902",(Crackdown on undocumented spirits are being e...
285,"00:25:38,902","00:25:41,973",(Afterlife Immigration Office)
286,"00:26:08,632","00:26:10,193","How have you been, Granny?"


In [90]:
subs_df[282:287].applymap(lambda row: re.sub("\((.*)\)", "", row.text))

AttributeError: 'SubRipTime' object has no attribute 'text'

In [49]:
subs_df.iloc[693:698]

Unnamed: 0,start,end,text
693,"00:58:25,504","00:58:26,535",Bait?
694,"00:58:29,035","00:58:32,444","Hey, you. Didn't I reject your proposal earlier?"
695,"00:58:32,674","00:58:35,444",What you're looking for will be at my house.
696,"00:58:36,044","00:58:37,385",Shouldn't you get busy?
697,"00:58:49,825","00:58:51,924","- Did you miss me? - As if, brother."


In [32]:
sentence = ""
result = []

for i, sub in subs_df.iterrows():
  if sentence == "":
    start = sub.start
  sentence = sub.text if sentence=="" else sentence + " " + sub.text 
  if re.search('[.?!]', sub.text):
    result.append((start, sub.end, sentence))
    sentence = ""

In [34]:
sentence_df = pd.DataFrame(result, columns=["start", "end", "sentence"])
sentence_df.iloc[604:610]

Unnamed: 0,start,end,sentence
604,"00:58:36,044","00:58:37,385",Shouldn't you get busy?
605,"00:58:49,825","00:58:51,924","- Did you miss me? - As if, brother."
606,"00:58:52,394","00:58:54,135","- Brother? - It's a long story,"
607,"00:58:54,194","00:58:55,935",but the family has a dirty past.
608,"00:59:08,774","00:59:10,644",This is why kids must be disciplined.
609,"00:59:37,374","00:59:39,345",How many people have you killed?


In [35]:
sentence_df.iloc[2].sentence

'Its abilities match that of a powerful shaman; which allows it to see what are miles ahead.'

In [36]:
sentence_df.iloc[658].sentence

"So this is where it's buried?"

In [37]:
sentence_df.iloc[658].end - sentence_df.iloc[658].start

SubRipTime(0, 0, 2, 300)

In [38]:
sentence_df.iloc[604:610]

Unnamed: 0,start,end,sentence
604,"00:58:36,044","00:58:37,385",Shouldn't you get busy?
605,"00:58:49,825","00:58:51,924","- Did you miss me? - As if, brother."
606,"00:58:52,394","00:58:54,135","- Brother? - It's a long story,"
607,"00:58:54,194","00:58:55,935",but the family has a dirty past.
608,"00:59:08,774","00:59:10,644",This is why kids must be disciplined.
609,"00:59:37,374","00:59:39,345",How many people have you killed?


# Export to file

In [40]:
%%writefile "sentences.py"

import pysrt
import pandas as pd
import re


class Sentences:
  def __init__(self, file):
    self.subs = pysrt.open(file)
    self.sentence_df = self.__to_sentences(self.__regex())
    return self.sentence_df
    
    
  def __regex(self, subs):
    # Replace using regex without writing regex expression
    replacements = {"|": " ", "\n": " ", "...": ";"}
    replacements = dict((re.escape(k), v) for k, v in replacements.items())
    pattern = re.compile("|".join(replacements.keys()))

    subs_df = pd.DataFrame([[sub.start, sub.end, 
                             pattern.sub(lambda m: replacements[
                               re.escape(m.group(0))], sub.text)
                            ] for sub in self.subs],
                           columns=["start", "end", "text"])
    return subs_df
  
  def __to_sentences(self, subs_df):
    sentence = ""
    result = []

    for i, sub in subs_df.iterrows():
      if sentence == "":
        start = sub.start
      sentence = sub.text if sentence=="" else sentence + " " + sub.text 
      if re.search('[.?!]', sub.text):
        result.append((start, sub.end, sentence))
        sentence = ""
      
    return pd.DataFrame(result, columns=["start", "end", "sentence"])

Writing sentences.py
