# Splitters

In [1]:
# Import the character-based splitters from the dedicated langchain_text_splitters
# package, the same package the SpacyTextSplitter import in this notebook uses.
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [2]:
# Splitter settings shared by the character-based splitters:
# maximum chunk size and the overlap kept between neighbouring chunks.
chunk_size, chunk_overlap = 26, 4

In [3]:
# Build both splitters from the same size/overlap settings so their results can be compared.
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [4]:
# 33-character sample: the 26-letter alphabet followed by 'abcdefg', with no whitespace.
text = 'abcdefghijklmnopqrstuvwxyzabcdefg'

# Split the text with the recursive splitter; the bare last expression displays the chunks.
r_splitter.split_text(text)


['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [5]:
# Run the same input through CharacterTextSplitter for comparison with the recursive splitter.
# NOTE(review): the recorded output is a single unsplit chunk longer than chunk_size —
# presumably because the splitter's separator never occurs in `text`; confirm in the library docs.
c_splitter.split_text(text)

['abcdefghijklmnopqrstuvwxyzabcdefg']

In [6]:
# Sample paragraph (three sentences about LangChain) used as input for the splitters in later cells.
t = "LangChain was launched in October 2022 as an open source project by Harrison Chase, while working at machine learning startup Robust Intelligence. The project quickly garnered popularity,with improvements from hundreds of contributors on GitHub, trending discussions on Twitter, lively activity on the project's Discord server, many YouTube tutorials, and meetups in San Francisco and London. In April 2023, LangChain had incorporated and the new startup raised over $20 million in funding at a valuation of at least $200 million from venture firm Sequoia Capital, a week after announcing a $10 million seed investment from Benchmark."

In [7]:
# Import the SpacyTextSplitter
from langchain_text_splitters import SpacyTextSplitter

# chunk_size=10 is far shorter than the pieces produced from `t`, which is why this cell logs
# "Created a chunk of size ..., which is longer than the specified 10" warnings.
text_splitter = SpacyTextSplitter(chunk_size=10, chunk_overlap=5)
# Chunks produced from the sample paragraph.
texts = text_splitter.split_text(t)

Created a chunk of size 146, which is longer than the specified 10
Created a chunk of size 245, which is longer than the specified 10


In [8]:
# Inspect the first chunk produced by the spaCy splitter.
texts[0]

'LangChain was launched in October 2022 as an open source project by Harrison Chase, while working at machine learning startup Robust Intelligence.'

## PDF Loading and Splitting

In [10]:
from langchain_community.document_loaders import PyPDFLoader

# Use load() rather than load_and_split(): the latter also runs a default text splitter
# over the loaded documents (and is marked as deprecated in LangChain), so its result is
# not guaranteed to be one entry per page. The notebook splits explicitly with r_splitter.
loader = PyPDFLoader('AI_Engineer_Roadmap.pdf')
pages = loader.load()

In [12]:
# Number of documents returned by the PDF loader.
len(pages)

13

In [13]:
# Print, for every loaded page, how many chunks the recursive splitter yields.
for page in pages:
    print(len(r_splitter.split_text(page.page_content)))

57
63
56
71
54
75
72
66
81
78
78
56
30


# Embeddings

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings

In [16]:
# Embedding model used for every embedding and for the semantic chunker in this notebook.
embed_model = HuggingFaceEmbeddings(
    model_name='BAAI/bge-small-en-v1.5',
)

In [17]:
# Embed a short sample query string.
embed = embed_model.embed_query("how are you?")

In [18]:
# Number of components in the query embedding.
len(embed)

384

In [19]:
# Preview only the first few components; displaying all 384 values buries the narrative.
embed[:5]

[-0.032435107976198196,
 -0.05979365110397339,
 0.036402467638254166,
 -0.026909997686743736,
 -0.023664141073822975]

In [20]:
# Embed the first spaCy-split chunk with the same model.
emb = embed_model.embed_query(texts[0])

In [21]:
# Number of components in the chunk embedding.
len(emb)

384

## Semantic Splitters

In [23]:
from langchain_experimental.text_splitter import SemanticChunker

In [24]:
# Semantic chunker driven by the embedding model, using the "percentile" breakpoint threshold type.
# Note: this rebinds `text_splitter`, which previously held the SpacyTextSplitter.
text_splitter = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

In [25]:
# Split the sample paragraph `t` with the semantic chunker.
# NOTE(review): "sementic" looks like a misspelling of "semantic"; later cells reference this
# name, so any rename has to be applied to all of them at once.
sementic_splits = text_splitter.split_text(t)

In [26]:
# Number of chunks produced by the semantic chunker.
len(sementic_splits)

2

In [27]:
# Inspect the first semantic chunk.
sementic_splits[0]

"LangChain was launched in October 2022 as an open source project by Harrison Chase, while working at machine learning startup Robust Intelligence. The project quickly garnered popularity,with improvements from hundreds of contributors on GitHub, trending discussions on Twitter, lively activity on the project's Discord server, many YouTube tutorials, and meetups in San Francisco and London."

In [28]:
# Inspect the second semantic chunk.
sementic_splits[1]

'In April 2023, LangChain had incorporated and the new startup raised over $20 million in funding at a valuation of at least $200 million from venture firm Sequoia Capital, a week after announcing a $10 million seed investment from Benchmark.'

In [29]:
# Embed the first semantic chunk with the same embedding model.
sementic_embed = embed_model.embed_query(sementic_splits[0])

In [30]:
# Preview only the first few components instead of displaying the entire embedding vector.
sementic_embed[:5]

[-0.056384727358818054,
 -0.053467217832803726,
 -0.06954137235879898,
 -0.014295410364866257,
 0.009370681829750538]

In [31]:
# Number of components in the semantic-chunk embedding.
len(sementic_embed)

384