In [None]:
import os
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle

def clean_text(text):
    """Normalize raw text before it is chunked and embedded.

    Every character that is not a word character, whitespace, or one of the
    punctuation marks ``. , ; : ? ! -`` is removed; each run of whitespace is
    then collapsed to a single space and the result is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with no leading/trailing whitespace and no
        consecutive whitespace characters.
    """
    # Strip unwanted symbols *before* collapsing whitespace: removing a symbol
    # that sits between two spaces ("a @ b") would otherwise leave "a  b".
    text = re.sub(r'[^\w\s.,;:?!-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text, max_words=100):
    """Split text into consecutive chunks of at most ``max_words`` words.

    Words are whitespace-delimited (``str.split()``); each chunk is its words
    re-joined with single spaces. The final chunk may be shorter than
    ``max_words``.

    Args:
        text: Input string.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        List of chunk strings in document order; empty if ``text`` contains
        no words.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        # A negative step makes range() empty, which would silently drop all
        # of the text; fail loudly instead.
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

def main():
    """Clean a raw text file, chunk it, embed the chunks, and save the results.

    Reads ``raw_file`` as UTF-8, cleans it with ``clean_text``, splits it into
    100-word chunks with ``chunk_text``, and encodes the chunks with the
    ``all-MiniLM-L6-v2`` SentenceTransformer model. Three files are written
    into ``cleaned_dir``:

    * ``cleaned_bangladesh.txt`` - the cleaned text,
    * ``chunks.pkl`` - the pickled list of chunk strings,
    * ``embeddings.npy`` - the array returned by ``model.encode``.

    Errors are reported with ``print`` and are not re-raised.
    """
    # Raw strings so backslashes in Windows paths are not treated as escapes.
    raw_file = r'raw file path here'
    cleaned_dir = r'clean file path here'

    try:
        # Create the output directory if needed. os.makedirs already defaults
        # to mode 0o777 (masked by the process umask), so no mode is passed.
        os.makedirs(cleaned_dir, exist_ok=True)

        # Read raw text
        with open(raw_file, 'r', encoding='utf-8') as f:
            raw_text = f.read()

        cleaned_text = clean_text(raw_text)
        chunks = chunk_text(cleaned_text, max_words=100)

        print(f"Total chunks created: {len(chunks)}")

        # Save cleaned text (optional)
        cleaned_file_path = os.path.join(cleaned_dir, 'cleaned_bangladesh.txt')
        with open(cleaned_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        # Load embedding model and encode every chunk
        model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = model.encode(chunks, show_progress_bar=True)

        # Save chunks and embeddings side by side so row i of the embeddings
        # corresponds to chunks[i].
        with open(os.path.join(cleaned_dir, 'chunks.pkl'), 'wb') as f:
            pickle.dump(chunks, f)

        with open(os.path.join(cleaned_dir, 'embeddings.npy'), 'wb') as f:
            np.save(f, embeddings)

        print(f"Saved chunks and embeddings in {cleaned_dir}")
    except PermissionError as e:
        # The denial can come from reading raw_file as well as from writing
        # into cleaned_dir, so name the path the OS actually rejected and only
        # fall back to cleaned_dir when the exception carries no filename.
        denied_path = e.filename or cleaned_dir
        print(f"Permission Error: Make sure you have access to {denied_path}")
        print(f"Error details: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Entry point: run the pipeline only when this code executes as the top-level
# module (not when it is imported).
if __name__ == "__main__":
    main()

Total chunks created: 101


  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 4/4 [00:01<00:00,  2.38it/s]

Saved chunks and embeddings in E:\Important\Skills\LLM_Playground\data\cleaned\bangladesh.text





In [10]:
import numpy as np

# Location of the saved embeddings file. It gets its own name so that
# `embeddings` always refers to the loaded array, never to the path string.
embeddings_path = r'E:\Important\Skills\LLM_Playground\data\cleaned\bangladesh.text\embeddings.npy'

# Load in the same cell as the path. Rebinding `embeddings` from path to array
# in a separate cell made that cell fail when re-run, because np.load would
# then be handed the array instead of the path.
embeddings = np.load(embeddings_path)

In [13]:
# Display the loaded embedding matrix (NumPy abbreviates large arrays with "...").
embeddings

array([[ 0.15232578,  0.01054193, -0.00479   , ...,  0.0292078 ,
        -0.00712487, -0.00822896],
       [ 0.04164567,  0.05191945, -0.0036308 , ...,  0.03541483,
         0.04761254,  0.00126079],
       [-0.01351444,  0.08020821,  0.04896864, ..., -0.04154345,
        -0.034859  , -0.03961822],
       ...,
       [ 0.00260654,  0.05764645, -0.02086941, ..., -0.04555659,
         0.08057153, -0.02243216],
       [ 0.02228773,  0.02549635, -0.0538968 , ...,  0.00209071,
         0.01514115,  0.00904383],
       [-0.01313959,  0.05006026, -0.0384109 , ..., -0.06587043,
        -0.0759312 , -0.02768164]], shape=(101, 384), dtype=float32)

In [14]:
# Dimensions of the embedding matrix: one row per chunk, one column per
# embedding component.
embeddings.shape

(101, 384)

In [15]:
# Element type of the stored embedding values.
embeddings.dtype

dtype('float32')

In [16]:
# Peek at the first 10 components of the first chunk's embedding vector.
print(embeddings[0][:10])

[ 0.15232578  0.01054193 -0.00479     0.06588046 -0.01625234  0.00116485
 -0.00753198  0.00768737 -0.06866915 -0.0674295 ]
