# Taking Each File From Subtitle Folder, Cleaning It & Saving It To Another Folder

In [1]:
import os
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Fetch the NLTK English stop-word corpus that the cleaning functions read
# through stopwords.words('english').
# NOTE(review): word_tokenize also needs NLTK tokenizer data (e.g. 'punkt'),
# which no visible cell downloads — presumably already installed; verify.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adykh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Functions Take The Text Of A Subtitle File, Detect Its Format, Clean It, & Return The Cleaned Text

In [3]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string 

def clean_subtitle_file(subtitle_text):
    """Route subtitle text to the cleaner that matches its detected format.

    Text containing the marker 'Dialogue:' is handled by
    ``clean_first_format``.  Otherwise, text containing a timing line of the
    form ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` is handled by
    ``clean_second_format``.  Text matching neither format is returned
    unchanged.
    """
    timing_line = r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n?'

    # The 'Dialogue:' marker is checked first, so it wins when both appear.
    if 'Dialogue:' in subtitle_text:
        cleaner = clean_first_format
    elif re.search(timing_line, subtitle_text) is not None:
        cleaner = clean_second_format
    else:
        # Unrecognized format: hand the text back untouched.
        return subtitle_text

    return cleaner(subtitle_text)
#####################################################################################################
def clean_first_format(subtitle_text):
    """Clean subtitle text made of 'Dialogue: ' lines.

    Only lines that contain 'Dialogue: ' followed somewhere by a closing
    brace '}' are used; every other line (including Dialogue lines without
    any '}') is skipped.  For each used line the text after that first '}'
    is kept, then:

    * '\\N' line-break markers become spaces,
    * remaining '{...}' groups are removed,
    * digits at the start of the line are removed,
    * every character that is neither a word character nor whitespace is
      removed,
    * the text is lower-cased, tokenized and stripped of English stop words.

    Args:
        subtitle_text: Full text of one subtitle file.

    Returns:
        The cleaned lines joined by single spaces ('' when no line qualifies).
    """
    # Built once, on the first line that needs it, instead of once per line:
    # the stop-word list is the same for every line of the file.
    stop_words = None
    cleaned_lines = []

    for line in subtitle_text.split('\n'):
        start_index = line.find('Dialogue: ')
        if start_index == -1:
            continue
        # First closing brace at or after the 'Dialogue: ' marker.
        end_index = line.find('}', start_index)
        if end_index == -1:
            continue

        # Keep the part of the line after '}'.
        cleaned_line = line[end_index + 1:].replace('\\N', ' ').strip()
        # Remove text within curly braces and the braces themselves.
        cleaned_line = re.sub(r'\{.*?\}', '', cleaned_line)
        # Remove numbers at the beginning of the line.
        cleaned_line = re.sub(r'^\d+\s*', '', cleaned_line)
        # Remove special characters and punctuation.
        cleaned_line = re.sub(r'[^\w\s]', '', cleaned_line)
        cleaned_line = cleaned_line.lower()

        if stop_words is None:
            stop_words = set(stopwords.words('english'))
        words = [word for word in word_tokenize(cleaned_line)
                 if word not in stop_words]

        # A line that cleans down to nothing still contributes an empty
        # entry, exactly as before.
        cleaned_lines.append(' '.join(words))

    return ' '.join(cleaned_lines)
#######################################################################################################
def clean_second_format(subtitle_text):
    """Clean subtitle text that uses 'HH:MM:SS,mmm --> HH:MM:SS,mmm' timings.

    Steps, in order: strip byte-order marks from both ends of the text,
    remove the timing lines, remove every run of digits, remove ASCII
    punctuation (``string.punctuation``), tokenize, and drop English stop
    words (compared case-insensitively).

    Args:
        subtitle_text: Full text of one subtitle file.

    Returns:
        The remaining tokens joined by single spaces, with their original
        letter case preserved.
    """
    timestamp_regex = r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}'

    # Remove UTF-8 byte-order marks at either end of the text.
    cleaned_text = subtitle_text.strip('\ufeff')
    # Remove timestamps.  This must continue from cleaned_text: restarting
    # from subtitle_text would throw away the BOM removal above.
    cleaned_text = re.sub(timestamp_regex, '', cleaned_text)
    # Remove numbers (this also removes the numeric cue counters).
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    # Remove punctuation in a single pass.
    cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(cleaned_text)
    stop_words = set(stopwords.words('english'))
    # NOTE(review): tokens keep their case here, whereas clean_first_format
    # lower-cases its output — confirm whether that difference is intended.
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    return ' '.join(filtered_tokens).strip()

# Metadata Extraction Function

In [4]:
import re

def extract_metadata(text):
    """Parse a subtitle file name into a metadata dictionary.

    Four file-name layouts are tried in order and the first match wins:

    1. ``<show>.sNN.eNN.<title>(YYYY).<language>...txt``
    2. ``<show>(YYYY).<language>...<n>cd.txt``
    3. ``<show>.<language>.<n>cd.txt`` (show optionally in parentheses)
    4. ``<show>.txt``

    Fields a layout does not provide stay "Unknown".  When no layout
    matches, a message is printed and every field is "Unknown".

    Args:
        text: The file name to parse.

    Returns:
        dict with keys 'show_name', 'show_title', 'year', 'season',
        'episode' and 'subtitle_language'.
    """
    unknown = "Unknown"
    info = {
        'show_name': unknown,
        'show_title': unknown,
        'year': unknown,
        'season': unknown,
        'episode': unknown,
        'subtitle_language': unknown,
    }

    def name_of(raw):
        # Empty captures count as missing; otherwise trim whitespace only.
        return raw.strip() if raw else unknown

    def language_of(raw):
        # Language is the first dot-separated piece of the capture.
        return raw.split(".")[0] if raw else unknown

    # Layout 1: episode of a series, with title and year.
    hit = re.match(r"^(.*?)\.s(\d{2})\.e(\d{2})\.(.*?)\((\d{4})\)\.(.*?)\.txt$", text)
    if hit:
        show, season_no, episode_no, title, released, lang = hit.groups()
        info['show_name'] = name_of(show)
        info['show_title'] = name_of(title)
        info['season'] = season_no or unknown
        info['episode'] = episode_no or unknown
        info['year'] = released
        info['subtitle_language'] = language_of(lang)
        return info

    # Layout 2: name with a year and a "<n>cd" suffix.
    hit = re.match(r"^(.*?)\((\d{4})\)\.(.*?)\.(\d+)cd\.txt$", text)
    if hit:
        info['show_name'] = name_of(hit.group(1))
        info['subtitle_language'] = language_of(hit.group(3))
        info['year'] = hit.group(2)
        return info

    # Layout 3: name, language and "<n>cd" suffix, no year.
    hit = re.match(r"^\(?([^)]*)\)?\.(\w+)\.(\d+)cd\.txt$", text)
    if hit:
        info['show_name'] = name_of(hit.group(1))
        info['subtitle_language'] = hit.group(2) or unknown
        return info

    # Layout 4: anything ending in ".txt"; only the name is known.
    hit = re.match(r"^(.*?)\.txt$", text)
    if hit:
        info['show_name'] = name_of(hit.group(1))
        return info

    print(f"Could not extract metadata from '{text}'")
    return info

# Function To Return DataFrame

**The function creates the output folder (if it does not already exist) and saves the cleaned subtitle files there. To keep a record of each file, it also builds a DataFrame that stores the metadata of each subtitle file together with a unique ID and the file's name.**

In [5]:
def clean_subtitle_folder(input_folder, output_folder):
    """Clean every '.txt' subtitle file in a folder and index the results.

    Each '.txt' file in ``input_folder`` is read as UTF-8 (a leading BOM is
    dropped), cleaned with ``clean_subtitle_file`` and written under the
    same file name to ``output_folder``, which is created if missing.
    Metadata is parsed from the file name with ``extract_metadata``.

    Args:
        input_folder: Folder containing the raw subtitle files.
        output_folder: Folder that receives the cleaned files.

    Returns:
        pandas.DataFrame with one row per processed file and the columns
        'id' (0-based, in sorted file-name order), 'metadata' (dict) and
        'file_name'.  The columns are present even when no file was
        processed.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    data = []

    # os.listdir returns names in arbitrary order; sorting makes the ids
    # reproducible from one run to the next.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith('.txt'):
            continue

        file_path = os.path.join(input_folder, file_name)
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            subtitle_text = file.read()

        cleaned_text = clean_subtitle_file(subtitle_text)

        # Save cleaned text to the output folder with the same file name.
        clean_file_path = os.path.join(output_folder, file_name)
        with open(clean_file_path, 'w', encoding='utf-8-sig') as file:
            file.write(cleaned_text)

        data.append({
            'id': len(data),
            'metadata': extract_metadata(file_name),
            'file_name': file_name,
        })

    # Explicit columns keep the frame's shape stable for an empty folder.
    return pd.DataFrame(data, columns=['id', 'metadata', 'file_name'])

In [None]:
# Example usage: clean every '.txt' subtitle file found in input_folder,
# write the cleaned copies to output_folder, and keep the per-file records
# (id, metadata, file name) in df.
# The paths below are specific to the author's machine; adjust before running.
input_folder = r"C:\Users\adykh\Desktop\subs_db\subtitles\subtitle_30\subtitles_data_30%"
output_folder = r"C:\Users\adykh\Desktop\subs_db\subtitles\subtitle_30\cleaned_subtitles_data_30%"
df = clean_subtitle_folder(input_folder, output_folder)

In [None]:
# Display the DataFrame returned by clean_subtitle_folder.
df

In [None]:
 # Export DataFrame to CSV
# index=False leaves the row index out of the file; the 'id' column already
# numbers the rows.
# NOTE(review): the 'metadata' column holds dicts, which presumably end up in
# the CSV as their string form — confirm that is acceptable for later loading.
df.to_csv('subtitles_data_30%.csv', index=False)

In [None]:
# Spot-check: metadata dict parsed from the first processed file's name.
df['metadata'][0]

In [None]:
# Spot-check: metadata dict parsed from the second processed file's name.
df['metadata'][1]

In [None]:
# Spot-check a later row; this lookup needs at least 22 processed files.
df['metadata'][21]

==============================================================================