In [1]:
!pip install datasets
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Using cached fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Using cached isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Using cached uri_template-1.3.0-py3-none-any.whl.metadata (8.8 kB)
Collecting webcolors>=1.11 (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading webcolors-24.11.1-py3-none-any.whl.metadata (2.2 kB)
Downloading webcolors-24.11.1-py3-none-any.whl (14 kB)
Usin

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook qtconsole run script server
troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [1]:
from datasets import load_dataset

dataset = load_dataset('imdb', split='train[:1%]')

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
def clean_text(text):
    text = text.strip()  
    text = text.replace('\n', ' ')  
    return text

In [32]:
dataset = dataset.map(lambda x: {'text': clean_text(x['text'])})

def chunk_text_fixed_length(text, chunk_size=512):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def chunk_text_sentences(text):
    sentences = sent_tokenize(text)
    return sentences

def chunk_text_paragraphs(text):
    paragraphs = text.split('\n\n')
    return paragraphs
    
dataset = dataset.map(lambda x: {
    'fixed_length_chunks': chunk_text_fixed_length(x['text']),
    'sentence_chunks': chunk_text_sentences(x['text']),
    'paragraph_chunks': chunk_text_paragraphs(x['text'])
})


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [33]:
def calculate_information_density(text_chunk):
    return len(text_chunk)

dataset = dataset.map(lambda x: {
    'fixed_length_chunk_density': [calculate_information_density(chunk) for chunk in x['fixed_length_chunks']],
    'sentence_chunk_density': [calculate_information_density(chunk) for chunk in x['sentence_chunks']],
    'paragraph_chunk_density': [calculate_information_density(chunk) for chunk in x['paragraph_chunks']]
})


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [34]:
fixed_length_density = [sum(x['fixed_length_chunk_density']) / len(x['fixed_length_chunk_density']) for x in dataset]
sentence_density = [sum(x['sentence_chunk_density']) / len(x['sentence_chunk_density']) for x in dataset]
paragraph_density = [sum(x['paragraph_chunk_density']) / len(x['paragraph_chunk_density']) for x in dataset]

print(f"Fixed-length chunks density: {sum(fixed_length_density) / len(fixed_length_density)}")
print(f"Sentence-based chunks density: {sum(sentence_density) / len(sentence_density)}")
print(f"Paragraph-based chunks density: {sum(paragraph_density) / len(paragraph_density)}")

Fixed-length chunks density: 409.80421443001444
Sentence-based chunks density: 137.92463026789432
Paragraph-based chunks density: 1272.572


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [12]:
def calculate_semantic_coherence(chunks):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunks)
    coherence_scores = []
    for i in range(len(chunks) - 1):
        vec1 = tfidf_matrix[i]
        vec2 = tfidf_matrix[i + 1]
        coherence = cosine_similarity(vec1, vec2)[0][0]
        coherence_scores.append(coherence)
    return coherence_scores

In [35]:
dataset = dataset.map(lambda x: {
    'fixed_length_chunk_coherence': calculate_semantic_coherence(x['fixed_length_chunks']),
    'sentence_chunk_coherence': calculate_semantic_coherence(x['sentence_chunks']),
    
})

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [36]:
fixed_length_coherence = [sum(x['fixed_length_chunk_coherence']) / len(x['fixed_length_chunk_coherence']) for x in dataset if len(x['fixed_length_chunk_coherence']) > 0]
sentence_coherence = [sum(x['sentence_chunk_coherence']) / len(x['sentence_chunk_coherence']) for x in dataset if len(x['sentence_chunk_coherence']) > 0]
paragraph_coherence = [sum(x['paragraph_chunk_coherence']) / len(x['paragraph_chunk_coherence']) for x in dataset if len(x['paragraph_chunk_coherence']) > 0]

print(f"Fixed-length chunks coherence: {sum(fixed_length_coherence) / len(fixed_length_coherence)}")
print(f"Sentence-based chunks coherence: {sum(sentence_coherence) / len(sentence_coherence)}")


Fixed-length chunks coherence: 0.24840265577300588
Sentence-based chunks coherence: 0.0969128254189781


In [16]:
def calculate_context_overlap(chunks, top_n=5):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunks)
    feature_names = vectorizer.get_feature_names_out()
    context_overlap_scores = []
    for i in range(len(chunks) - 1):
        chunk1_top_words = set([feature_names[idx] for idx in np.argsort(tfidf_matrix[i].toarray())[0][-top_n:]])
        chunk2_top_words = set([feature_names[idx] for idx in np.argsort(tfidf_matrix[i + 1].toarray())[0][-top_n:]])
        overlap = chunk1_top_words.intersection(chunk2_top_words)
        context_overlap_scores.append(len(overlap) / top_n)
    return context_overlap_scores

In [37]:
dataset = dataset.map(lambda x: {
    'fixed_length_chunk_overlap': calculate_context_overlap(x['fixed_length_chunks']),
    'sentence_chunk_overlap': calculate_context_overlap(x['sentence_chunks']),
    
})

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [38]:
fixed_length_overlap = [sum(x['fixed_length_chunk_overlap']) / len(x['fixed_length_chunk_overlap']) for x in dataset if len(x['fixed_length_chunk_overlap']) > 0]
sentence_overlap = [sum(x['sentence_chunk_overlap']) / len(x['sentence_chunk_overlap']) for x in dataset if len(x['sentence_chunk_overlap']) > 0]
paragraph_overlap = [sum(x['paragraph_chunk_overlap']) / len(x['paragraph_chunk_overlap']) for x in dataset if len(x['paragraph_chunk_overlap']) > 0]

print(f"Fixed-length chunks overlap: {sum(fixed_length_overlap) / len(fixed_length_overlap)}")
print(f"Sentence-based chunks overlap: {sum(sentence_overlap) / len(sentence_overlap)}")


Fixed-length chunks overlap: 0.19180767855546615
Sentence-based chunks overlap: 0.04878517704614819


In [31]:
print(dataset.column_names)

['text', 'label', 'text_chunks', 'sentence_chunks', 'paragraph_chunks', 'text_chunk_density', 'sentence_chunk_density', 'paragraph_chunk_density', 'text_chunk_coherence', 'sentence_chunk_coherence', 'paragraph_chunk_coherence', 'text_chunk_overlap', 'sentence_chunk_overlap', 'paragraph_chunk_overlap']


In [23]:
!pip install nltk
!pip install transformers
!pip install torch
!pip install scikit-learn

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.1 kB 682.7 kB/s eta 0:00:01
     ----------------------------------- -- 41.0/44.1 kB 667.8 kB/s eta 0:00:01
     -------------------------------------- 44.1/44.1 kB 363.9 kB/s eta 0:00:00
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
    --------------------------------------- 0.2/10.1 MB 6.9 MB/s eta 0:00:02
   --- ------------------------------------ 0.8/10.1 MB 12.7 MB/s eta 0:00:01
   ------ --------------------------------- 1.7/10.1 MB 13.

In [24]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [25]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_chunk_embedding(chunk):
    inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    # Use the embeddings of the [CLS] token
    return outputs.last_hidden_state[:, 0, :].detach().numpy()
    

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [26]:
def calculate_semantic_similarity(chunks):
    if not chunks:  # If there are no chunks, return an empty list
        return []
    
    embeddings = [get_chunk_embedding(chunk) for chunk in chunks]
    similarity_scores = []
    for i in range(len(embeddings) - 1):
        similarity = cosine_similarity(embeddings[i], embeddings[i+1])
        similarity_scores.append(similarity[0][0])
    return similarity_scores

In [39]:
fixed_length_similarity = calculate_semantic_similarity(dataset[0]['fixed_length_chunks'])
sentence_similarity = calculate_semantic_similarity(dataset[0]['sentence_chunks'])

In [40]:
if fixed_length_similarity:
    print(f"Fixed-length chunks semantic similarity: {sum(fixed_length_similarity) / len(fixed_length_similarity)}")
else:
    print("Fixed-length chunks semantic similarity: No data or chunks to compare")

if sentence_similarity:
    print(f"Sentence-based chunks semantic similarity: {sum(sentence_similarity) / len(sentence_similarity)}")
else:
    print("Sentence-based chunks semantic similarity: No data or chunks to compare")

Fixed-length chunks semantic similarity: 0.8573352495829264
Sentence-based chunks semantic similarity: 0.8245464861392975


In [41]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def calculate_token_count(text):
    tokens = tokenizer.encode(text, add_special_tokens=True)  # Add special tokens like [CLS], [SEP]
    return len(tokens)

In [42]:
def compare_token_efficiency(original_text, chunked_texts):
    original_token_count = calculate_token_count(original_text)
    
    chunk_token_counts = [calculate_token_count(chunk) for chunk in chunked_texts]
    
    token_efficiency = [chunk_token_count / original_token_count for chunk_token_count in chunk_token_counts]
    
    return original_token_count, chunk_token_counts, token_efficiency

In [43]:
original_text = dataset[0]['text']
fixed_length_chunks = dataset[0]['text_chunks']  
sentence_chunks = chunk_text_sentences(original_text)  

original_token_count, fixed_length_token_counts, fixed_length_token_efficiency = compare_token_efficiency(original_text, fixed_length_chunks)
_, sentence_token_counts, sentence_token_efficiency = compare_token_efficiency(original_text, sentence_chunks)

In [44]:
print(f"Original token count: {original_token_count}")
print(f"Fixed-length chunk token counts: {fixed_length_token_counts}")
print(f"Fixed-length chunk token efficiency: {fixed_length_token_efficiency}")
print(f"Sentence chunk token counts: {sentence_token_counts}")
print(f"Sentence chunk token efficiency: {sentence_token_efficiency}")

Original token count: 363
Fixed-length chunk token counts: [114, 113, 119, 28]
Fixed-length chunk token efficiency: [0.3140495867768595, 0.31129476584022037, 0.3278236914600551, 0.07713498622589532]
Sentence chunk token counts: [29, 75, 41, 61, 31, 25, 72, 29, 16]
Sentence chunk token efficiency: [0.07988980716253444, 0.2066115702479339, 0.11294765840220386, 0.16804407713498623, 0.08539944903581267, 0.06887052341597796, 0.19834710743801653, 0.07988980716253444, 0.0440771349862259]
