In [None]:
%pip install Ollama -q
%pip install llama_index.llms.ollama -q
%pip install llama_index.core -q
%pip install llama_index.embeddings.ollama -q

In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.ollama import OllamaEmbedding

In [None]:
import ollama
import requests
from pathlib import Path
import re
import nltk
import json

In [None]:
# Iterate over all items in the directory and build a list of pathlib.Path entries, each pointing to a .txt training file.
# The .txt files are expected to have a number in their name (it is used as the sort key).

def get_training_files(path):
    """Return the .txt training files found in `path`, sorted numerically.

    Args:
        path: Directory (str or path-like) that holds the training files.

    Returns:
        A list of pathlib.Path objects, one per regular ``.txt`` file located
        directly inside `path`. Files are ordered by the first run of digits
        in their stem, compared as integers, so ``2.txt`` comes before
        ``10.txt``. ``.txt`` files whose stem contains no digit are placed
        after the numbered ones, ordered by file name.

    Raises:
        FileNotFoundError, NotADirectoryError: propagated from
            ``Path.iterdir`` when `path` is missing or is not a directory.
    """
    directory = Path(path)

    # iterdir() yields every entry, including sub-directories and stray files
    # such as macOS's .DS_Store; keep only regular .txt files so the numeric
    # sort below never sees a name it cannot handle.
    training_files = [
        entry
        for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".txt"
    ]

    def numeric_sort_key(file):
        # re.search(r'\d+', stem) finds the first run of digits in the file
        # name (without extension); int(...) makes the comparison numeric
        # rather than lexicographic.
        match = re.search(r'\d+', file.stem)
        if match is None:
            # No number in the name: sort after all numbered files instead of
            # crashing on `None.group()`.
            return (1, 0, file.name)
        # The file name is a tie-breaker so the order is deterministic when
        # two files share the same number.
        return (0, int(match.group()), file.name)

    training_files.sort(key=numeric_sort_key)

    return training_files

# NOTE(review): absolute, user-specific path; this cell only runs on a machine that has this exact directory.
training_set = get_training_files("/Users/raulriveromartinez/Desktop/Skyy AI/training_data")


In [None]:
# Generate a list of strings containing only the file names of the .txt files, without their directory path

def get_filenames(training_files):
    """Return the bare file name of every path in `training_files`.

    Args:
        training_files: Iterable of pathlib.Path-like objects exposing a
            ``.name`` attribute.

    Returns:
        A list of str holding each entry's final path component, in the same
        order as the input.
    """
    return [training_file.name for training_file in training_files]

# File names only (no directory part), in the same order as training_set.
filenames_set = get_filenames(training_set)

In [None]:
def get_files_contents(training_files):
    """Read every training file and return its text.

    Args:
        training_files: Iterable of paths (str or pathlib.Path) to text files.

    Returns:
        A list of str with the full contents of each file, in the same order
        as `training_files`.

    Raises:
        OSError: if a file cannot be opened.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    files_contents = []

    for file in training_files:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        with open(file, "r", encoding="utf-8") as handle:
            files_contents.append(handle.read())

    return files_contents

# Full text of each training file, in the same order as training_set.
contents_set = get_files_contents(training_set)

In [None]:
# Combine the filename list and the file-contents list into a list of dictionaries, each with two key-value pairs: 'filename' and 'contents'

def get_contents_dict(filenames, files_contents):
    """Pair each filename with the contents found at the same index.

    Args:
        filenames: Sequence of file names.
        files_contents: Sequence of file contents, index-aligned with
            `filenames`.

    Returns:
        A list with one ``{'filename': ..., 'contents': ...}`` dict per entry
        of `filenames`, in input order. Entries of `files_contents` beyond
        ``len(filenames)`` are ignored.

    Raises:
        IndexError: if `files_contents` is shorter than `filenames`.
    """
    # Indexing files_contents (rather than zip) keeps the IndexError on a
    # too-short contents list instead of silently truncating.
    return [
        {'filename': filename, 'contents': files_contents[index]}
        for index, filename in enumerate(filenames)
    ]

# Despite the name, this is a list: one {'filename', 'contents'} dict per training file.
files_content_dict = get_contents_dict(filenames_set, contents_set)

In [None]:
# allData is the list of dictionaries that stores the filename and contents of each file as key-value pairs

def chunk_data_as_sentences(allData):
    """Split every file's contents into sentences tagged with their file name.

    Args:
        allData: Iterable of dicts, each with a 'filename' and a 'contents'
            key.

    Returns:
        A list of ``{'filename': ..., 'sentence': ...}`` dicts, one per
        sentence produced by ``nltk.sent_tokenize``, ordered by file and then
        by sentence position within the file.
    """
    return [
        {'filename': file_record['filename'], 'sentence': sentence}
        for file_record in allData
        for sentence in nltk.sent_tokenize(file_record['contents'])
    ]

# Despite the name, this is a list: one {'filename', 'sentence'} dict per sentence.
chunked_files_content_dict = chunk_data_as_sentences(files_content_dict)


In [None]:
# Print every {'filename', 'sentence'} record on its own line for a visual check.
for chunk_record in chunked_files_content_dict:
    print(chunk_record)

In [None]:
# embedding function to convert sentence into vector
# def ollama_embedding(prompt):
#     return requests.post('http://localhost:11434/api/embeddings', json = {'model':'llama3', 'prompt': prompt})


In [19]:
# Embedding client backed by the local Ollama server.
ollama_embedding = OllamaEmbedding(model_name="llama3", base_url='http://localhost:11434')

# Embed a handful of sample sentences, in order, as a smoke test.
query_embedding = [
    ollama_embedding.get_query_embedding(sample_sentence)
    for sample_sentence in (
        "The quick brown fox runs over the fence",
        "The black fox runs over the fence",
        "The car exceeds the highway speed",
        "Miami won the word the the best city",
    )
]

print(query_embedding)

[[-2.070464611053467, 1.4835457801818848, 0.8498050570487976, 0.8770601153373718, 0.8084642291069031, -1.3659098148345947, -3.299577474594116, -0.5538150072097778, -4.161124229431152, -1.2836076021194458, 1.4512193202972412, 3.27927303314209, -1.6240872144699097, -0.08721035718917847, 2.6577999591827393, 1.5263457298278809, -1.8462872505187988, -1.0329296588897705, -1.7886910438537598, -1.0081626176834106, 0.14241455495357513, -0.8165634870529175, -0.1657838374376297, -0.24435943365097046, -2.3637027740478516, 1.4733588695526123, 1.5693985223770142, -0.5046149492263794, 2.4906373023986816, 0.782398521900177, -3.4325263500213623, 0.6603240966796875, -2.3696579933166504, -0.2055874764919281, 3.064779758453369, -1.0319995880126953, -2.8533706665039062, 0.8986296653747559, 2.1892149448394775, 0.7200424671173096, 0.7323054075241089, -0.20468609035015106, 0.9140107035636902, 0.528365969657898, 1.967456579208374, -0.3619694113731384, 1.1978145837783813, -1.6714370250701904, -1.387238144874572

In [None]:
# Create embeddings for the "chunked" data: pass in the list of {'filename', 'sentence'} dictionaries
def embed_sentences(allData, output_path='mdc_index.json'):
    """Embed every sentence and save the complete index as JSON.

    Args:
        allData: Iterable of dicts, each with a 'filename' and a 'sentence'
            key.
        output_path: Destination of the JSON index. Defaults to
            'mdc_index.json' in the current working directory.

    Returns:
        None. The result is written to `output_path` (overwriting any existing
        file) as a JSON array of
        ``{'filename': ..., 'sentence': ..., 'vector': ...}`` objects, one per
        input sentence. An empty input produces an empty array.
    """
    system_record = []
    for entry in allData:
        # `ollama_embedding` is the notebook-level OllamaEmbedding instance.
        # Calling the instance directly on a string is not its text-embedding
        # API; get_text_embedding is expected to return the vector as a
        # JSON-serializable list of floats.
        embedding = ollama_embedding.get_text_embedding(entry['sentence'])
        system_record.append(
            {'filename': entry['filename'], 'sentence': entry['sentence'], 'vector': embedding}
        )

    # Dump the whole list, not just the last record built in the loop.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(system_record, outfile, indent=2)

# Run the embedding step over the chunked sentences; embed_sentences writes its JSON output to 'mdc_index.json'.
embed_sentences(chunked_files_content_dict)