In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load the full source text into memory as one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (e.g. cp1252 on Windows).
with open("hhgttg2.txt", encoding="utf-8") as f:
    hhgttg = f.read()

In [3]:
# Upper bound on the length of one text chunk, measured in characters
# (the splitter below uses `len` as its length function). The same constant
# is reused when sizing the Milvus text field.
CHUNK_SIZE = 500

In [4]:
# Build the splitter only. The actual split happens in the next cell via
# `split_text`, which yields plain strings. The previous `create_documents`
# call here was redundant: its result was bound to `texts` and immediately
# overwritten by the next cell.
text_splitter = RecursiveCharacterTextSplitter(
    # Maximum chunk length, in units of `length_function` (characters here).
    chunk_size=CHUNK_SIZE,
    # Characters shared between consecutive chunks so context is not cut
    # abruptly at a chunk boundary.
    chunk_overlap=20,
    length_function=len,
    # Separators are treated as literal strings, not regular expressions.
    is_separator_regex=False,
)

In [5]:
# Split the whole text into a list of plain-string chunks.
texts = text_splitter.split_text(hhgttg)
# Display how many chunks were produced.
len(texts)

3888

In [6]:
# Preview the first five chunks to sanity-check the split boundaries.
texts[:5]

["The Hitch Hiker's Guide to the Galaxy \n\nfor Jonny Brock and Clare Gorst  and all other Arlingtoniansfor tea, sympathy, and a sofa\n\n\n\nFar out in the uncharted backwaters of the unfashionable  end  of the  western  spiral  arm  of  the Galaxy lies a small unregarded yellow sun.",
 'Orbiting this at a distance of roughly ninety-two  million  miles is  an  utterly insignificant little blue green planet whose ape- descended life forms are so amazingly primitive that  they  still think digital watches are a pretty neat idea.',
 "This planet has - or rather had - a problem, which was this: most of  the  people  on  it were unhappy for pretty much of the time. Many solutions were suggested for this problem, but most of these were  largely  concerned with the movements of small green pieces of paper, which is odd because on the whole it wasn't  the  small green pieces of paper that were unhappy.",
 "And so the problem remained; lots of the people  were  mean,  and most of them were mise

In [7]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used to vectorise each chunk. The model id is
# kept in a named constant so it is easy to spot and swap.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)


  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [8]:
from pymilvus import connections, db
# Open the default connection to a Milvus instance on this machine.
# Empty user/password means no authentication is sent.
connections.connect(
    alias="default",
    user="",
    password="",
    host="localhost",
    port="19530",
)

# Creating a database that already exists raises an error, which made this
# cell fail on every run after the first. Create it only when it is missing
# so the notebook survives Restart & Run All.
DB_NAME = "embeddings"
if DB_NAME not in db.list_database():
    db.create_database(DB_NAME)

# Route all subsequent collection operations on this connection to DB_NAME.
db.using_database(DB_NAME)

In [9]:
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
# Primary key: 64-bit integer generated by Milvus (auto_id=True), so inserts
# do not supply a value for this field.
chunk_uid = FieldSchema(
  name="chunk_uid",
  dtype=DataType.INT64,
  is_primary=True,
  auto_id=True
)

# Raw chunk text. Milvus interprets VARCHAR `max_length` as a byte limit,
# while the splitter bounds chunks to CHUNK_SIZE *characters*. A UTF-8
# character can occupy up to 4 bytes, so the field is sized for the worst case
# to keep chunks with non-ASCII characters from being rejected on insert.
chunk = FieldSchema(
  name="chunk",
  dtype=DataType.VARCHAR,
  max_length=CHUNK_SIZE * 4,
  # Used when the field is left empty on insert/upsert; must match `dtype`.
  default_value=""
)

# Embedding vector for the chunk. 384 matches the width of the vectors
# produced by the embedding model used in this notebook.
chunk_embedding = FieldSchema(
  name="chunk_embedding",
  dtype=DataType.FLOAT_VECTOR,
  dim=384
)

schema = CollectionSchema(
  fields=[chunk_uid, chunk, chunk_embedding],
  description="Sentence embeddings",
  enable_dynamic_field=True
)
collection_name = "chunks"

# Create the collection, or attach to it if it already exists.
# Errors are allowed to propagate: catching and printing them left `chunks`
# undefined, so later cells failed with an unrelated NameError instead of the
# real cause.
# NOTE(review): if a "chunks" collection already exists with a different
# schema (e.g. the old max_length), this is expected to raise; drop the old
# collection before re-running.
chunks = Collection(
  name=collection_name,
  schema=schema,
  using='default',
  shards_num=2
)


In [10]:
# Encode every chunk into a vector; the result has one row per chunk.
embeddings = embed_model.encode(texts)

# Show the chunk count next to the embedding matrix shape so a mismatch
# between the two is easy to spot.
(len(texts), embeddings.shape)

(3888, (3888, 384))

In [11]:
# Column-based insert: one list per non-auto field, in schema order
# (chunk text, then chunk embedding). `chunk_uid` is omitted because the
# schema marks it auto_id.
# Errors are allowed to propagate: printing and continuing would let the
# following cells build an index over a collection that received no data.
# NOTE(review): re-running this cell inserts the same chunks again; with
# auto-generated ids they are presumably stored as duplicates.
mr = chunks.insert([texts, embeddings])

# Flush so the inserted rows are sealed/persisted before indexing.
chunks.flush()

# Must be the cell's last top-level expression to be displayed; inside the
# former `try` block it was silently discarded.
mr

In [12]:
# Index definition for the embedding field: IVF_FLAT with 128 clusters,
# compared by cosine similarity.
index = dict(
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params=dict(nlist=128),
)

# Build the index on the vector field; the returned status is displayed.
chunks.create_index("chunk_embedding", index)

Status(code=0, message=)