<a href="https://colab.research.google.com/github/adhilbinmujeeb/Business-Model-Evaluation-letsgetmoving/blob/main/Clean_subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re

def clean_youtube_subtitles(input_file, output_file):
    """Strip WebVTT timing and markup from a subtitle file, keeping the text lines.

    For every line of ``input_file`` this removes cue timing ranges
    (``start --> end``), ``align:start position:N%`` cue settings, and any
    ``<...>`` tag (which covers ``<c>`` spans and inline ``<00:00:01.500>``
    timestamps). Lines that are empty after stripping are dropped. The
    remaining lines are written to ``output_file`` joined by newlines, with no
    trailing newline, and a confirmation message is printed.

    Header lines such as ``WEBVTT`` are not treated specially and are kept.

    Args:
        input_file: Path of the UTF-8 subtitle file to read.
        output_file: Path of the UTF-8 text file to write (overwritten).
    """
    # WebVTT allows the hours field to be omitted ("MM:SS.mmm"), so it is optional here.
    timestamp = r'(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}'
    # Compile once instead of re-parsing the patterns for every line.
    cue_timing = re.compile(timestamp + r' --> ' + timestamp)
    cue_settings = re.compile(r'align:start position:\d+%')
    markup_tag = re.compile(r'<[^>]*>')

    cleaned_lines = []
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            # Remove timestamp and alignment tags
            line = cue_timing.sub('', line)
            line = cue_settings.sub('', line)

            # Remove <c> tags and other tags like <...>
            line = markup_tag.sub('', line)

            # Strip extra spaces and keep the line only if something is left
            line = line.strip()
            if line:
                cleaned_lines.append(line)

    # Write the cleaned lines to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("\n".join(cleaned_lines))

    print(f"Cleaned subtitles saved to {output_file}")

def clean_subtitles_in_folder(folder_path, output_folder):
    """Clean every ``.vtt`` file located directly inside ``folder_path``.

    Each regular file whose name ends with ``.vtt`` is passed to
    ``clean_youtube_subtitles`` and the result is written into
    ``output_folder`` as ``cleaned_<original file name>``. Subdirectories are
    not descended into.

    Args:
        folder_path: Directory containing the raw subtitle files.
        output_folder: Directory that receives the cleaned files; created if
            it does not exist yet.
    """
    # Create the destination up front so writing the first file cannot fail
    # on a missing directory.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a stable run log.
    for filename in sorted(os.listdir(folder_path)):
        input_file = os.path.join(folder_path, filename)
        # Only process regular files with a .vtt extension
        if os.path.isfile(input_file) and filename.endswith('.vtt'):
            output_file = os.path.join(output_folder, f"cleaned_{filename}")
            clean_youtube_subtitles(input_file, output_file)

# Example run in Google Colab: point both paths at your own folders
folder_path = "/content/shark_tank_global"  # folder holding the raw .vtt files
output_folder = "/content/cln_sharktank_global"  # folder receiving the cleaned files

# Ensure the output folder is present before cleaning starts
os.makedirs(name=output_folder, exist_ok=True)

clean_subtitles_in_folder(folder_path=folder_path, output_folder=output_folder)


Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_2L9z6jL7TQ4.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_YsTO0ih9HPY.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_Zfv8FqzNoyQ.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_LCajDC1_d7E.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_O1B27Z2FKQQ.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_izHcx7oVqkk.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_MBDLlYiHTeM.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_Uej5cBdLreM.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_nkPrDsD4YmE.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_fuM1Vh7OgfY.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_afb9CNiNz08.en.vtt
Cleaned subtitles saved to /content/cln_sharktank_global/cleaned_wTz2mSnQUwQ

In [None]:
import shutil
import os
from google.colab import files

# Folders to bundle into a single zip archive (replace with your folder paths)
folders = [
    '/content/cln_dragon_den_canada',
    '/content/cln_dragon_den_global',
    '/content/cln_dragon_den_ireland',
    '/content/cln_dragon_den_uk',
    '/content/cln_sharktank_aus',
    '/content/cln_sharktank_global',
]

# Staging directory that will hold a copy of every folder
temp_dir = '/content/temp_folders'

# Start from an empty staging directory. copytree raises FileExistsError when
# its destination already exists, so re-running this cell would otherwise
# fail, and leftovers from an earlier run would end up in the archive.
shutil.rmtree(temp_dir, ignore_errors=True)
os.makedirs(temp_dir, exist_ok=True)

# Copy (not move) each folder into the staging directory under its own name
for folder in folders:
    shutil.copytree(folder, os.path.join(temp_dir, os.path.basename(folder)))

# Zip the staging directory; make_archive appends ".zip" to the base name
output_zip = '/content/multiple_folders.zip'
shutil.make_archive(os.path.splitext(output_zip)[0], 'zip', temp_dir)

# Download the zipped file
files.download(output_zip)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os

def clean_and_save_subtitles(file_path):
    """
    Remove blank and repeated lines from one subtitle file and save the result as text.

    Lines are compared after stripping surrounding whitespace. Only the first
    occurrence of each distinct line is kept, in the original order, so a line
    that legitimately recurs later in the file is also dropped. The output is
    written next to the input with the input's extension replaced by
    ``_cleaned.txt`` (``a.vtt`` -> ``a_cleaned.txt``), and the output path is
    printed.

    Args:
        file_path: Path of the UTF-8 subtitle file to read.
    """
    # Read the file, stripping each line as it comes in
    with open(file_path, "r", encoding="utf-8") as file:
        stripped_lines = [line.strip() for line in file]

    # dicts keep insertion order, so fromkeys drops repeats while leaving the
    # first occurrence of every line in place.
    unique_lines = list(dict.fromkeys(line for line in stripped_lines if line))

    # Swap only the final extension. str.replace(".vtt", ...) would also
    # rewrite ".vtt" inside directory names, and would leave a path without
    # ".vtt" unchanged so that the input file itself got overwritten.
    cleaned_file_path = os.path.splitext(file_path)[0] + "_cleaned.txt"
    with open(cleaned_file_path, "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write("\n".join(unique_lines))

    print(f"Cleaned and saved: {cleaned_file_path}")

def process_all_vtt_files(folder_path):
    """
    Run clean_and_save_subtitles on each entry of folder_path whose name ends in ".vtt".

    When no such entry exists, a notice is printed and nothing else happens.
    """
    # Collect the names that carry the .vtt suffix
    matching_names = []
    for entry in os.listdir(folder_path):
        if entry.endswith(".vtt"):
            matching_names.append(entry)

    # Nothing to do: tell the user and stop
    if len(matching_names) == 0:
        print("No .vtt files found in the specified folder.")
        return

    print(f"Found {len(matching_names)} .vtt files. Cleaning...")
    for name in matching_names:
        clean_and_save_subtitles(os.path.join(folder_path, name))
    print("All .vtt files processed.")

# Folder whose .vtt files should be cleaned (replace with your own path)
folder_path = "/content/cln_dragon_den_canada"
process_all_vtt_files(folder_path=folder_path)


Found 179 .vtt files. Cleaning...
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_PI9lcw10cBo.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_2CnPfB6Cyc8.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_0sWq4Yedrxg.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_zBgZL3WT_go.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_XoqZyFFYxO8.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_hOyw6ZUvHbI.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_NdtFX862roc.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_vuBGAlUuR0E.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_yn6Pj4b_IKQ.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_Gzo1Jav5QtI.en_cleaned.txt
Cleaned and saved: /content/cln_dragon_den_canada/cleaned_2Bb_jOiBQ0c.en_cleaned.txt
Cleaned and saved: /content/cln

In [None]:
import os

def clean_and_save_subtitles(file_path, output_folder):
    """
    Remove blank and repeated lines from one subtitle file and save the result in output_folder.

    Lines are compared after stripping surrounding whitespace. Only the first
    occurrence of each distinct line is kept, in the original order. The
    output file is named after the input with its extension replaced by
    ``_cleaned.txt`` (``a.vtt`` -> ``a_cleaned.txt``), and its path is printed.

    Note: this notebook also defines a one-argument function with the same
    name in an earlier cell; running this cell replaces that definition.

    Args:
        file_path: Path of the UTF-8 subtitle file to read.
        output_folder: Directory that receives the cleaned file; created if
            it does not exist yet.
    """
    # Read the file, stripping each line as it comes in
    with open(file_path, "r", encoding="utf-8") as file:
        stripped_lines = [line.strip() for line in file]

    # dicts keep insertion order, so fromkeys drops repeats while leaving the
    # first occurrence of every line in place.
    unique_lines = list(dict.fromkeys(line for line in stripped_lines if line))

    # Swap only the final extension; str.replace(".vtt", ...) would rewrite
    # every ".vtt" occurring anywhere in the file name.
    file_name = os.path.splitext(os.path.basename(file_path))[0] + "_cleaned.txt"

    # Do not rely on the caller having created the destination directory
    os.makedirs(output_folder, exist_ok=True)
    cleaned_file_path = os.path.join(output_folder, file_name)
    with open(cleaned_file_path, "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write("\n".join(unique_lines))

    print(f"Cleaned and saved: {cleaned_file_path}")

def process_all_vtt_files(folder_path):
    """
    Clean each entry of folder_path whose name ends in ".vtt" into a "cleaned_txt_files" subfolder.

    When no such entry exists, a notice is printed and the subfolder is not created.
    """
    # Names in the folder that carry the .vtt suffix
    candidates = list(filter(lambda name: name.endswith(".vtt"), os.listdir(folder_path)))

    if not candidates:
        print("No .vtt files found in the specified folder.")
        return

    # Destination for the cleaned .txt files, created inside the input folder
    output_folder = os.path.join(folder_path, "cleaned_txt_files")
    os.makedirs(output_folder, exist_ok=True)

    total = len(candidates)
    print(f"Found {total} .vtt files. Cleaning...")
    for name in candidates:
        clean_and_save_subtitles(os.path.join(folder_path, name), output_folder)
    print(f"All .vtt files processed. Cleaned files are saved in: {output_folder}")

# Folder whose .vtt files should be cleaned (replace with your own path)
folder_path = "/content/cln_sharktank_global"
process_all_vtt_files(folder_path=folder_path)


Found 321 .vtt files. Cleaning...
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_nkPrDsD4YmE.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_FbsKto4gg14.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_Y_NFkbM5k7c.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_3oTAH4iEhig.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_MWkxom-LNOk.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_gfDzaWJ6oBU.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_Qj0SCCcrqP0.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_jjaY_cWr8EQ.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global/cleaned_txt_files/cleaned_pbcrnXWpJuE.en_cleaned.txt
Cleaned and saved: /content/cln_sharktank_global

In [None]:
import shutil

# Folder to archive; the zip is written next to it as "<folder_name>.zip"
folder_name = "/content/cln_sharktank_global/cleaned_txt_files"
shutil.make_archive(base_name=folder_name, format="zip", root_dir=folder_name)


'/content/cln_sharktank_global/cleaned_txt_files.zip'

In [None]:
from google.colab import files

# Download the archive; change the path to match your zipped file
files.download("/content/cln_sharktank_global/cleaned_txt_files.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>