In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adykh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Cleaning Subtitle File

In [7]:
# Function to clean a single subtitle file
def clean_subtitle_file(subtitle_file):
    """Return the text of a subtitle file as one normalised, space-separated string.

    Processing order:
      1. Read the file as UTF-8.
      2. Strip every cue header, i.e. a numeric index line followed by an
         ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing line.
      3. Delete every character that is neither a word character nor
         whitespace, then lowercase the text.
      4. Join all lines with single spaces.
      5. Tokenize with NLTK's ``word_tokenize`` and drop NLTK's English
         stop words.

    Args:
        subtitle_file: Path of the subtitle file to read.

    Returns:
        The remaining tokens joined by single spaces.
    """
    with open(subtitle_file, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    # Cue headers: index line + timing line, each terminated by a newline.
    text = re.sub(r'\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n', '', raw)
    # Punctuation is removed before tokenizing, so contractions lose their
    # apostrophe here (e.g. "he's" becomes "hes").
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Flatten the multi-line dialogue onto one line.
    text = ' '.join(text.splitlines())

    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    kept = [token for token in tokens if token not in english_stop_words]
    return ' '.join(kept)

In [8]:
# Input: the subtitle file this demo cleans
subtitle_file = "demo.txt"

# Show the untouched file first so it can be compared with the cleaned text
print("Original content of the subtitle file:")
with open(subtitle_file, 'r', encoding='utf-8') as file:
    original_text = file.read()
print(original_text)

print("\n")

# Run the cleaning function and display what is left
cleaned_text = clean_subtitle_file(subtitle_file)
print("Cleaned content of the subtitle file:", cleaned_text, sep="\n")

# Save the cleaned text; the chunking cells read this file back
output_file = "clean_file.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("\nCleaning completed. Cleaned text saved to clean_file.txt.")

Original content of the subtitle file:
1
00:00:29,359 --> 00:00:32,048
Ah! There's Princess
Dawn and Terry with the

2
00:00:32,248 --> 00:00:34,749
Blooney Looney soldiers
protecting them.

3
00:00:39,600 --> 00:00:42,414
Oh, Terry, look!
Here comes the Grump.

4
00:00:42,427 --> 00:00:45,795
Yeah! And looks like he's going to
shoot some darts at our balloon.

5
00:00:46,480 --> 00:00:51,911
Well, if they think king blooney's gas
bag guards can protect them from me.


Cleaned content of the subtitle file:
ah theres princess dawn terry blooney looney soldiers protecting oh terry look comes grump yeah looks like hes going shoot darts balloon well think king blooneys gas bag guards protect

Cleaning completed. Cleaned text saved to clean_file.txt.


===================================================================================

# Chunking The Cleaned File

****

### Why Chunking?

    Document chunking is a very important step for improving performance.

    a. The challenge with embedding large documents is information loss.
    
    b. It is often not practical to embed an entire document as a single vector, 
    particularly when dealing with long documents.

    c. Solution: Divide a large document into smaller, more manageable chunks for embedding.

    d. Another problem: Let's say we set the token window to 500; then we'd expect 
    each chunk to be just below 500 tokens. One common concern with this method is that we might 
    accidentally cut off some important text between chunks, splitting up the context. To mitigate 
    this, we can use overlapping windows with a specified number of tokens of overlap, 
    so that some tokens are shared between neighbouring chunks.

**There Are 2 Approaches We Can Use To Chunk The Text**

    simple chunk document: Divides the document into chunks of fixed size without overlap.
    
    overlapping chunk document: Divides the document into chunks with overlapping windows.

### Simple Chunk Document

In [9]:
def simple_chunk_document(file_path, chunk_size=500):
    """Split a text file into consecutive, non-overlapping chunks of words.

    Args:
        file_path: Path of a UTF-8 text file to read.
        chunk_size: Maximum number of whitespace-separated words per chunk.
            Must be positive.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. The last chunk may be shorter. A file with no words
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # Without this check a negative size silently produced no chunks at
        # all, and zero failed inside range() with an unrelated message.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

### Overlapping Chunk Document

In [10]:
def overlapping_chunk_document(file_path, chunk_size=500, overlap=100):
    """Split a text file into word chunks whose windows overlap.

    Each window starts ``chunk_size - overlap`` words after the previous one,
    so the first ``overlap`` words of a chunk repeat the last ``overlap``
    words of the chunk before it.

    Args:
        file_path: Path of a UTF-8 text file to read.
        chunk_size: Maximum number of whitespace-separated words per chunk.
            Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. Windows keep being emitted until the start index passes
        the last word, so the final chunk can be short and may contain only
        words that already appear at the end of the previous chunk. A file
        with no words yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size means the window never moves forward, which
        # previously made the loop run forever while appending chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap!r}, chunk_size={chunk_size!r}"
        )

    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    # Distance between the starts of two neighbouring windows.
    stride = chunk_size - overlap
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), stride)
    ]

In [11]:
# Text file to chunk (written by the cleaning cell)
file_path = "clean_file.txt"

# chunk_size=10 and overlap=3 are far smaller than the functions' defaults (500 and 100)
simple_chunks = simple_chunk_document(file_path, chunk_size=10)
overlapping_chunks = overlapping_chunk_document(file_path, chunk_size=10, overlap=3)

# Show both results, one after the other
print("Simple Chunks:", simple_chunks, sep="\n")
print("\nOverlapping Chunks:", overlapping_chunks, sep="\n")

Simple Chunks:
['ah theres princess dawn terry blooney looney soldiers protecting oh', 'terry look comes grump yeah looks like hes going shoot', 'darts balloon well think king blooneys gas bag guards protect']

Overlapping Chunks:
['ah theres princess dawn terry blooney looney soldiers protecting oh', 'soldiers protecting oh terry look comes grump yeah looks like', 'yeah looks like hes going shoot darts balloon well think', 'balloon well think king blooneys gas bag guards protect', 'guards protect']


In [12]:
len(simple_chunks)

3

In [13]:
len(overlapping_chunks)

5

In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [15]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model; the
# embedding function defined below calls model.encode() on this object.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [16]:
# Generate embeddings for each chunk
def generate_embeddings(chunks):
    """Encode text chunks into sentence embeddings with the notebook's ``model``.

    Args:
        chunks: Iterable of strings to embed.

    Returns:
        A NumPy array with one row per chunk, in input order. An empty input
        yields an empty array.
    """
    chunks = list(chunks)
    if not chunks:
        # Same result the per-chunk loop gave for an empty input.
        return np.array([])

    # Encode every chunk in a single call and request NumPy output directly.
    # The earlier per-chunk loop asked for torch tensors and then wrapped the
    # list in np.array(), which raises when the tensors live on a GPU.
    return model.encode(chunks, convert_to_numpy=True)

In [17]:
# Embed every chunk produced by the non-overlapping split
simple_embeddings = generate_embeddings(simple_chunks)

# Embed every chunk produced by the overlapping split
overlapping_embeddings = generate_embeddings(overlapping_chunks)

In [32]:
simple_embeddings

array([[ 0.00433243,  0.03018818,  0.02947651, ..., -0.0400468 ,
         0.07735509,  0.00262731],
       [-0.05806385, -0.0010459 ,  0.03482844, ..., -0.08793204,
         0.04850123,  0.05647067],
       [-0.02243873,  0.08941442, -0.05871673, ..., -0.0480592 ,
        -0.02048676,  0.03290344]], dtype=float32)

In [18]:
len(simple_embeddings)

3

In [19]:
len(overlapping_embeddings)

5

In [22]:
len(simple_embeddings[0])

384

In [27]:
simple_embeddings[0]

array([ 4.33242740e-03,  3.01881842e-02,  2.94765104e-02,  1.23133119e-02,
        3.04811765e-02,  8.40023607e-02,  1.20493993e-01, -6.43626079e-02,
       -4.17789705e-02, -1.79993324e-02, -7.40393177e-02, -5.66496840e-03,
        9.93693843e-02, -1.87571539e-04, -2.18142085e-02,  1.07412897e-01,
        8.60837698e-02,  8.83835964e-05, -3.55100185e-02,  1.77315269e-02,
       -1.18543953e-02,  3.89467180e-03,  7.61400089e-02,  1.16805233e-01,
       -9.56934541e-02, -1.07222505e-01, -2.54148636e-02,  5.11322357e-03,
       -7.07235336e-02, -3.07796933e-02, -5.25602885e-02, -3.85234915e-02,
        1.01937484e-02,  1.24929314e-02,  7.77894333e-02,  2.74017993e-02,
        6.26061633e-02,  7.09481686e-02, -2.64573898e-02,  2.81319432e-02,
        2.55898982e-02, -7.13676363e-02, -2.41695587e-02, -1.25538022e-03,
       -2.64422596e-02, -1.43759008e-02,  2.12752949e-02, -2.68994719e-02,
        6.38009515e-03, -8.07586685e-03,  1.76586192e-02,  1.63258817e-02,
       -4.18406986e-02, -

**Each Chunk Is Represented As A 384-Dimensional Numerical Vector**

### Instead Of Creating A Database, We Use A Dictionary For This Demo

In [34]:
# Map a readable chunk name to its embedding vector, one dictionary per
# chunking strategy. Names are numbered from 1, e.g. "simple_chunk_1".
# Comprehensions keep loop temporaries out of the notebook's global namespace.
simple_chunk_embeddings = {
    f"simple_chunk_{number}": vector
    for number, vector in enumerate(simple_embeddings, start=1)
}
overlapping_chunk_embeddings = {
    f"overlapping_chunk_{number}": vector
    for number, vector in enumerate(overlapping_embeddings, start=1)
}

# Group both lookups under a single top-level dictionary
embeddings_dict = {
    "simple_chunk_embeddings": simple_chunk_embeddings,
    "overlapping_chunk_embeddings": overlapping_chunk_embeddings,
}

In [35]:
simple_chunk_embeddings.keys()

dict_keys(['simple_chunk_1', 'simple_chunk_2', 'simple_chunk_3'])

In [36]:
simple_chunk_embeddings

{'simple_chunk_1': array([ 4.33242740e-03,  3.01881842e-02,  2.94765104e-02,  1.23133119e-02,
         3.04811765e-02,  8.40023607e-02,  1.20493993e-01, -6.43626079e-02,
        -4.17789705e-02, -1.79993324e-02, -7.40393177e-02, -5.66496840e-03,
         9.93693843e-02, -1.87571539e-04, -2.18142085e-02,  1.07412897e-01,
         8.60837698e-02,  8.83835964e-05, -3.55100185e-02,  1.77315269e-02,
        -1.18543953e-02,  3.89467180e-03,  7.61400089e-02,  1.16805233e-01,
        -9.56934541e-02, -1.07222505e-01, -2.54148636e-02,  5.11322357e-03,
        -7.07235336e-02, -3.07796933e-02, -5.25602885e-02, -3.85234915e-02,
         1.01937484e-02,  1.24929314e-02,  7.77894333e-02,  2.74017993e-02,
         6.26061633e-02,  7.09481686e-02, -2.64573898e-02,  2.81319432e-02,
         2.55898982e-02, -7.13676363e-02, -2.41695587e-02, -1.25538022e-03,
        -2.64422596e-02, -1.43759008e-02,  2.12752949e-02, -2.68994719e-02,
         6.38009515e-03, -8.07586685e-03,  1.76586192e-02,  1.63258817

In [39]:
len(simple_chunk_embeddings)

3

In [38]:
len(simple_chunk_embeddings['simple_chunk_1'])

384

In [40]:
# Function to access embeddings for a specific type of chunk
def get_chunk_embeddings(chunk_type):
    """Return the embeddings dictionary for the named chunking strategy.

    Args:
        chunk_type: ``"simple_chunk_embeddings"`` or
            ``"overlapping_chunk_embeddings"``.

    Returns:
        The notebook-level ``simple_chunk_embeddings`` or
        ``overlapping_chunk_embeddings`` dictionary matching ``chunk_type``,
        or ``None`` for any other value.
    """
    if chunk_type == "simple_chunk_embeddings":
        return simple_chunk_embeddings
    if chunk_type == "overlapping_chunk_embeddings":
        return overlapping_chunk_embeddings
    # Unrecognised strategy name
    return None

In [41]:
# Example usage: fetch one strategy's name -> embedding dictionary and print it.
# Note: this prints every vector in the dictionary in full.
chunk_type = "simple_chunk_embeddings"  # or "overlapping_chunk_embeddings"
print(get_chunk_embeddings(chunk_type))

{'simple_chunk_1': array([ 4.33242740e-03,  3.01881842e-02,  2.94765104e-02,  1.23133119e-02,
        3.04811765e-02,  8.40023607e-02,  1.20493993e-01, -6.43626079e-02,
       -4.17789705e-02, -1.79993324e-02, -7.40393177e-02, -5.66496840e-03,
        9.93693843e-02, -1.87571539e-04, -2.18142085e-02,  1.07412897e-01,
        8.60837698e-02,  8.83835964e-05, -3.55100185e-02,  1.77315269e-02,
       -1.18543953e-02,  3.89467180e-03,  7.61400089e-02,  1.16805233e-01,
       -9.56934541e-02, -1.07222505e-01, -2.54148636e-02,  5.11322357e-03,
       -7.07235336e-02, -3.07796933e-02, -5.25602885e-02, -3.85234915e-02,
        1.01937484e-02,  1.24929314e-02,  7.77894333e-02,  2.74017993e-02,
        6.26061633e-02,  7.09481686e-02, -2.64573898e-02,  2.81319432e-02,
        2.55898982e-02, -7.13676363e-02, -2.41695587e-02, -1.25538022e-03,
       -2.64422596e-02, -1.43759008e-02,  2.12752949e-02, -2.68994719e-02,
        6.38009515e-03, -8.07586685e-03,  1.76586192e-02,  1.63258817e-02,
      