In [1]:
import pandas as pd
import sqlite3

In [2]:
# Connect to the SQLite database
conn = sqlite3.connect('eng_subtitles_database.db')

In [3]:
# Read the contents of the 'zipfiles' table into a DataFrame
df = pd.read_sql_query("SELECT * FROM zipfiles", conn)

In [4]:
# Close the connection
conn.close()

In [5]:
df.shape

(82498, 3)

In [6]:
df.columns

Index(['num', 'name', 'content'], dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [8]:
df[df.duplicated()]

Unnamed: 0,num,name,content


**Looks No Duplicates Are There**

In [9]:
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


### For Whole Data

In [10]:
import zipfile
import io
import os
from tqdm import tqdm

# Create a directory if it doesn't exist
output_folder = "subtitles_data"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Function to process each row in the DataFrame
def process_row(row):
    compressed_data = row['content']
    file_name = row['name'] + '.txt'

    # Wrap the binary data in a BytesIO object
    compressed_data_io = io.BytesIO(compressed_data)

    # Create a ZipFile object from the BytesIO object
    with zipfile.ZipFile(compressed_data_io) as zip_file:
        # Assuming there's only one file in the ZIP archive
        # If there are multiple files, you'll need to specify the filename
        file_in_zip = zip_file.namelist()[0]

        # Read the contents of the file from the ZIP archive
        file_content = zip_file.read(file_in_zip)

        # Decode the file content using Latin-1 encoding
        decoded_data = file_content.decode('latin-1')

        # Write the decoded data to a text file
        output_path = os.path.join(output_folder, file_name)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(decoded_data)

# Process each row in the sampled DataFrame with a progress bar
for _, row in tqdm(df.iterrows(), total=len(df)):
    process_row(row)

# Confirm completion
print("Subtitle files saved successfully.")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82498/82498 [12:50<00:00, 107.12it/s]

Subtitle files saved successfully.





In [11]:
import os
from tqdm import tqdm

def get_folder_size_with_progress(folder_path):
    total_size = 0
    num_files = 0
    with tqdm(desc='Calculating folder size') as pbar:
        for dirpath, dirnames, filenames in os.walk(folder_path):
            num_files += len(filenames)
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                total_size += os.path.getsize(filepath)
                pbar.update(1)
    return total_size, num_files

folder_path = "subtitles_data"
size_in_bytes, num_items = get_folder_size_with_progress(folder_path)
size_in_megabytes = size_in_bytes / (1024 * 1024)

print(f"Calculating folder size: Done")
print(f"Size of folder '{folder_path}': {size_in_megabytes:.2f} MB")
print(f"Number of items in folder '{folder_path}': {num_items}")

Calculating folder size: 51929it [00:19, 2683.74it/s]

Calculating folder size: Done
Size of folder 'subtitles_data': 3117.75 MB
Number of items in folder 'subtitles_data': 51929





In [12]:
df[df.duplicated('name')]

Unnamed: 0,num,name,content
25,9181731,flowers.in.the.attic.the.origin.s01.e03.part.t...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00Ix\x91V$\x...
34,9181933,alchemy.of.souls.s01.e12.episode.1.12.(2022).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x00}\x91V...
47,9181978,the.governor.s01.e01.episode.1.1.(1995).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x003\xa9\x99V...
48,9181979,the.governor.s01.e02.episode.1.2.(1995).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x18\xa9\x...
49,9181980,the.governor.s01.e03.episode.1.3.(1995).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x91~\x91V...
...,...,...,...
82486,9521885,eiga.yurukyan.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xc5\xa6\x...
82489,9521930,the.fearway.(2023).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x0f\x97\x...
82491,9521932,star.trek.generations.(1994).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xcc\x96\x...
82492,9521933,star.trek.insurrection.(1998).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xe8\x96\x...


**There Are Duplicate Entries, Thats Why Processing Shows 82498 And Total No Of Files In Folder Are 51929**

## Taking 30% Data

In [13]:
# Sample 30% of the data
sampled_df = df.sample(frac=0.3)

In [14]:
sampled_df.shape

(24749, 3)

In [15]:
sampled_df.head()

Unnamed: 0,num,name,content
61269,9435114,a.different.world.s06.e09.faith.hope.and.chari...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xc7=\x9aV...
40068,9345469,stargirl.s03.e13.frenemies.chapter.thirteen.th...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00p\xbd\x99V...
18820,9257015,the.empress.s01.e02.the.arrival.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x98\x9b\x...
61095,9434669,community.squad.s01.e08.casi.heroes.(2023).eng...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xefL\x9aV...
9014,9219561,this.is.us.s01.e03.kyle.(2016).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x98\x91\x...


### For 30% Data

In [16]:
import zipfile
import io
import os
from tqdm import tqdm

# Create a directory if it doesn't exist
output_folder = "subtitles_data_30%"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Function to process each row in the DataFrame
def process_row(row):
    compressed_data = row['content']
    file_name = row['name'] + '.txt'

    # Wrap the binary data in a BytesIO object
    compressed_data_io = io.BytesIO(compressed_data)

    # Create a ZipFile object from the BytesIO object
    with zipfile.ZipFile(compressed_data_io) as zip_file:
        # Assuming there's only one file in the ZIP archive
        # If there are multiple files, you'll need to specify the filename
        file_in_zip = zip_file.namelist()[0]

        # Read the contents of the file from the ZIP archive
        file_content = zip_file.read(file_in_zip)

        # Decode the file content using Latin-1 encoding
        decoded_data = file_content.decode('latin-1')

        # Write the decoded data to a text file
        output_path = os.path.join(output_folder, file_name)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(decoded_data)

# Process each row in the sampled DataFrame with a progress bar
for _, row in tqdm(sampled_df.iterrows(), total=len(sampled_df)):
    process_row(row)

# Confirm completion
print("Subtitle files saved successfully.")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24749/24749 [04:25<00:00, 93.09it/s]

Subtitle files saved successfully.





In [17]:
import os
from tqdm import tqdm

def get_folder_size_with_progress(folder_path):
    total_size = 0
    num_files = 0
    with tqdm(desc='Calculating folder size') as pbar:
        for dirpath, dirnames, filenames in os.walk(folder_path):
            num_files += len(filenames)
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                total_size += os.path.getsize(filepath)
                pbar.update(1)
    return total_size, num_files

folder_path = "subtitles_data_30%"
size_in_bytes, num_items = get_folder_size_with_progress(folder_path)
size_in_megabytes = size_in_bytes / (1024 * 1024)

print(f"Calculating folder size: Done")
print(f"Size of folder '{folder_path}': {size_in_megabytes:.2f} MB")
print(f"Number of items in folder '{folder_path}': {num_items}")

Calculating folder size: 21430it [00:06, 3527.87it/s]

Calculating folder size: Done
Size of folder 'subtitles_data_30%': 1310.47 MB
Number of items in folder 'subtitles_data_30%': 21430





In [18]:
sampled_df[sampled_df.duplicated('name')]

Unnamed: 0,num,name,content
78088,9503871,csi.crime.scene.investigation.s14.e10.girls.go...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00S\x87\x9aV...
75140,9490669,dr.savilles.horror.show.().eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00u\x81\x9aV...
14652,9242448,the.nanny.s03.e09.the.two.mrs.sheffields.(1995...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00j\x98\x99V...
45585,9370573,bump.(2021).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x84\x11\x...
44628,9365726,alice.in.borderland.s02.e05.episode.2.5.(2022)...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00G\r\x9aV8\...
...,...,...,...
58718,9426845,family.guy.s16.e17.switch.the.flip.(2018).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00b7\x9aV\x1...
39884,9344787,beauty.and.the.beast.s01.e06.worth.(2012).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xf0\xbc\x...
39865,9344767,beauty.and.the.beast.s01.e09.bridesmaid.up.(20...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xd9\xbc\x...
44983,9367899,90210.s02.e09.a.trip.to.the.moon.(2009).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00Y\x0e\x9aV...


**There Are Duplicate Entries, Thats Why Processing Shows 24749 And Total No Of Files In Folder Are 21430**

In [19]:
df['name'].to_csv('names.txt', index=False, header=False)

# If you want to specify a different delimiter, like a newline character
# df['name'].to_csv('names.txt', index=False, header=False, sep='\n')