Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added interactive version of sub-clean.sh
Solves #5. May also be handy for use cases such as other languages (as per #4), where the regex has unwanted matches and the user is not automating the script.
- Loading branch information
1 parent
9fef833
commit e9a6145
Showing
1 changed file
with
73 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#!/bin/bash
# Interactive cleaner for srt formatted subtitles: finds common blocks that may be
# considered unwanted, shows them to the user, and removes them after confirmation.
# The user may abort, or list the numbers of blocks that were matched by mistake
# and should be kept.
# please consider leaving or modifying this regex to properly credit the hard work that is put into providing these subtitles

### usage: sub-clean.sh [FILE]
## Download this file from the command line to your current directory:
# curl https://raw.githubusercontent.com/brianspilner01/media-server-scripts/master/sub-clean-interactive.sh > sub-clean.sh && chmod +x sub-clean.sh
## Test out what lines this script would remove
## (example pattern only; the list actually used is REGEX_TO_REMOVE below):
# REGEX_TO_REMOVE='opensubtitles|sub(scene|text|rip)|podnapisi|addic7ed|yify|napisy|bozxphd|sazu489|anoxmous|(br|dvd|web).?(rip|scr)|english (- )?us|sdh|srt|(sub(title)?(bed)?(s)?(fix)?|encode(d)?|correct(ed|ion(s)?)|caption(s|ed)|sync(ed|hroniz(ation|ed))?|english)(.pr(esented|oduced))?.?(by|&)|[^a-z]www\.|http|\.( )?(com|co|link|org|net|mp4|mkv|avi)([^a-z]|$)|©|™'
# awk 'tolower($0) ~ '"/$REGEX_TO_REMOVE/" RS='' ORS='\n\n' "/path/to/sub.srt"

SUB_FILEPATH="$1"

# check usage
if [[ ! -f "$SUB_FILEPATH" ]]; then
  echo "usage: sub-clean.sh [FILE]"
  echo "Warning: subtitle file does not exist"
  exit 1
fi

# only operate on srt files
if [[ ! "$SUB_FILEPATH" =~ \.srt$ ]]; then
  echo "Provided file must be .srt"
  exit 1
fi

# lowercase list of regex (gore/magic?) that will be removed from srt.
# It is spliced into the awk programs below as a regex literal (not passed with -v)
# so that its backslash escapes such as '\.' reach awk's regex engine unchanged.
REGEX_TO_REMOVE='opensubtitles|sub(scene|rip)|podnapisi|addic7ed|titlovi|bozxphd|sazu489|psagmeno|normita|anoxmous|(br|dvd|web).?(rip|scr)|english (- )?us|sdh|srt|(yahoo|mail|book|fb|4m|hd)\. ?com|(sub(title)?(bed)?(s)?(fix)?|encode(d)?|correct(ed|ion(s)?)|caption(s|ed)|sync(ed|hroniz(ation|ed))?|english)(.pr(esented|oduced))?.?(by|&)|[^a-z]www\.|http|\. ?(co|pl|link|org|net|mp4|mkv|avi|pdf)([^a-z]|$)|©|™'

# convert any DOS formatted files to UNIX (remove carriage return line endings).
# This has to happen before awk runs: a line holding only a carriage return is not
# an empty line, so the blocks would not be split. Note that this rewrites the file
# in place even if the user aborts or the sub turns out to be clean.
sed -i.bak -e 's/\r$//' -- "$SUB_FILEPATH" && rm -- "${SUB_FILEPATH}.bak"

### each record (in awk) is defined as a block of srt formatted subs (record separator RS is essentially \n\n+, see docs), with each line of the block a separate field, i.e.:
# LINE NUMBER
# TIMESTAMP --> TIMESTAMP
# SUB LINE 1
# SUB LINE 2
# ...
#

# Collect every block that matches the removal list (case-insensitively).
# The file is fed on stdin so its name can never be read by awk as an option
# or as a var=value operand.
LINES_TO_REMOVE=$(awk -v RS='' -v ORS='\n\n' 'tolower($0) ~ /'"$REGEX_TO_REMOVE"'/' < "$SUB_FILEPATH")

if [[ -z "$LINES_TO_REMOVE" ]]; then
  echo "Sub looks clean!"
  exit
fi

echo "The following lines have been marked for removal:"
echo
echo "#################################################"
echo
echo "$LINES_TO_REMOVE"
echo
echo "#################################################"
echo
echo "Press enter if this is ok"
echo "Type 'exit' to abort"
echo "Or, type a comma seperated list of srt line numbers that should be kept (false matches)"
read -r -p "$ " USER_INPUT

[[ "$USER_INPUT" == "exit" ]] && exit

# Reduce the answer to a '|' separated list of numbers: every run of non-digit
# characters (commas, spaces, anything else) becomes a single '|', and a leading or
# trailing '|' is dropped. Only digits and '|' survive, so the value is safe to use
# as a regex and an answer like "1, 2," cannot produce an empty alternative that
# would match (and therefore keep) every block.
KEEP_LIST=$(printf '%s' "$USER_INPUT" | tr -cs '0-9' '|')
KEEP_LIST=${KEEP_LIST#'|'}
KEEP_LIST=${KEEP_LIST%'|'}

# An empty KEEP_REGEX means "keep nothing that matched" (plain enter, or no digits typed).
KEEP_REGEX=""
[[ -n "$KEEP_LIST" ]] && KEEP_REGEX="^(${KEEP_LIST})\$"

# Write the result next to the original (same filesystem, so the final mv is a
# rename) under a unique name, so an unrelated "<file>.tmp" is never overwritten.
TMP_FILEPATH=$(mktemp "${SUB_FILEPATH}.XXXXXX") || {
  echo "sub-clean.sh could not create a temporary file for $SUB_FILEPATH" >&2
  exit 1
}

# Keep a block when it does not match the removal list, or when its first line
# (the srt block number) is one the user asked to keep. Surviving blocks are
# renumbered from 1 by overwriting their first field.
if awk -v RS='' -v FS='\n' -v OFS='\n' -v ORS='\n\n' -v next_index=1 -v keep="$KEEP_REGEX" \
    'tolower($0) !~ /'"$REGEX_TO_REMOVE"'/ || (keep != "" && $1 ~ keep) { $1 = next_index++ ; print }' \
    < "$SUB_FILEPATH" > "$TMP_FILEPATH" &&
  mv -- "$TMP_FILEPATH" "$SUB_FILEPATH" &&
  chmod -- 666 "$SUB_FILEPATH" # replaced file becomes readable and writable by everyone
then
  echo "sub-clean.sh succesfully processed $SUB_FILEPATH"
else
  # do not leave a partial result behind
  rm -f -- "$TMP_FILEPATH"
  echo "sub-clean.sh failed to process $SUB_FILEPATH" >&2
  exit 1
fi
|