### Imports

In [3]:
!pip install -U accelerate datasets

Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.5.0-py3-none-any.whl.metadata 

In [1]:
!pip install -q chromadb

In [56]:
import torch.nn as nn
import numpy as np
# import Datasets
import chromadb
from sentence_transformers import (
    SentenceTransformer, models, losses, util, InputExample, evaluation, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
)
from accelerate import Accelerator
import glob
import os
import ollama
# from langchain.text_splitter import NLTKTextSplitter
# from langchain_community.document_loaders import (
#     # PDFLoader,
#     # WordDocumentLoader,
#     TextLoader,
#     # ExcelLoader,
#     CSVLoader,
#     # PowerPointLoader
# )

## Basic functions

In [28]:
def getEmbeddingList(model, sentences):
  """ This function returns the sentence embeddings for a given document using the SentenceTransformer model and encapsulates them inside a list.

  @param model: SentenceTransformer: The model to be used for getting the embeddings.
  @param sentences: list: The list of sentences for which embeddings are to be calculated. """

  embeddings = model.encode(sentences)
  return embeddings.tolist()

In [29]:
def getModel() -> SentenceTransformer:
  """ This function creates a SentenceTransformer model using the 'sentence-transformers/all-MiniLM-L6-v2' base model. It utilizes accelerator to make use of multiple GPUs
  and adds a layer to get the sentence embeddings via mean pooling. This model will be used for training sbert's sentence embeddings. """

  accelerator = Accelerator()
  print(f"Using GPUs: {accelerator.num_processes}")

  # Get the base model to train
  word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')

  # Add layer to get "sentence embedding" (using mean pooling)
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
  model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
  return model

In [None]:
# An initial list of actions that the AI can choose between
SET_OF_ACTIONS = ['new file', 'search web', 'search files', 'resize window', 'choose option', 'open file', 'close file', 'minimize window', 'maximize window', 'scroll up', 'scroll down', 'scroll left', 'scroll right', 'open', 'close', 'upload']
# SET_OF_ACTIONS = ['new file', 'search', 'resize window', 'choose option', 'scroll', 'open file', 'close file', 'minimize window', 'maximize window', 'scroll up', 'scroll down', 'scroll left', 'scroll right', 'copy', 'paste', 'cut', 'undo', 'redo', 'drag and drop', 'select', 'deselect', 'save', 'save as', 'open', 'close', 'upload']

# Basic file gathering and putting into database

### Getting file data

In [34]:
def list_files(initdir: str, file_extensions: list):
    '''
    Returns a list of file under initdir and all its subdirectories
    that have file extension contained in file_extensions.
    ''' 
    file_list = []
    file_count = {key: 0 for key in file_extensions}  # for reporting only
    
    # Traverse through directories to find files with specified extensions
    for root, _, files in os.walk(initdir):
        for file in files:
            ext = file.split('.')[-1].lower()
            if ext in file_extensions:
                file_path = os.path.join(root, file)
                file_list.append(file_path)
                # increment type of file
                file_count[ext] += 1
    
    # total = len(file_list)
    # print(f'There are {total} files under dir {initdir}.')
    # for k, n in file_count.items():
        # print(f'   {n} : ".{k}" files')
    return file_list

In [35]:
list_files('test', ['txt', 'c', 'py'])

['test/resolutions.txt',
 'test/sorting.py',
 'test/random.py',
 'test/example.txt',
 'test/buhao.c']

In [52]:
def get_document_info(file_path: str):
    '''
    Open the file at the given file path and return its content.
    
    @param file_path: str: The path of the file to be opened.
    @return: str: The content of the file.
    '''
    try: 
        with open(file_path, 'r') as file:
            content = file.read()
        # metadata = file.metadata
        file_name = os.path.basename(file_path)
        return (file_name, content)
    except:
        return None

    

In [62]:
get_document_info('test/resolutions.txt')

('resolutions.txt',
 '1. Exercise regularly and stay fit.\n2. Learn a new programming language.\n3. Read at least one book every month.\n4. Spend more time with family and friends.\n5. Travel to at least two new places.\n6. Save money and stick to a budget.\n7. Volunteer for a good cause.\n8. Improve my communication skills.\n9. Learn a musical instrument.\n10. Practice mindfulness and reduce stress.')

### ollama implementation for better semantics(you need an ollama server running in the background for this to work)

In [61]:
modelfile = '''
FROM llama3
SYSTEM You are supposed to give a description of the input and nothing else.
'''

ollama.create(model='example', modelfile=modelfile)

In [64]:
ollama.chat(model="example", messages=[
    {
        'role': 'user',
        'content': f'{get_document_info('test/resolutions.txt')[1]}'
    }])

{'model': 'example',
 'created_at': '2024-07-28T19:39:09.125459Z',
 'message': {'role': 'assistant', 'content': "New Year's resolutions!"},
 'done': True,
 'total_duration': 1382438791,
 'load_duration': 1032208,
 'prompt_eval_count': 124,
 'prompt_eval_duration': 1096374000,
 'eval_count': 6,
 'eval_duration': 276858000}

In [67]:
def get_ollama_description(file_path: str, modelfile: str):
    '''
    Get the description of the input from the Ollama model.
    
    @param file_path: str: The file with the document.
    @param modelfile: str: The modelfile for the Ollama model.
    @return: str: The description of the input.
    '''
    content = get_document_info(file_path)
    ollama.create(model='example', modelfile=modelfile)
    response = ollama.chat(model="example", messages=[
        {
            'role': 'user',
            'content': f'Filename: {content[0]}, File content:{content[1]}'
        }])
    return response['message']['content']

In [68]:
get_ollama_description('test/resolutions.txt', modelfile)

"New Year's resolutions text file containing 10 goals for personal improvement, including exercise, learning, travel, financial management, social connections, and self-care."

### Database

In [None]:
embedmodel = getModel()

In [None]:
client = chromadb.Client()

doc_collection = client.get_or_create_collection("docs")

In [None]:
doc_collection.add(
    embeddings=
        getEmbeddingList(embedmodel, file_list)
    ,
    documents=file_names,
    ids=[f'id{i}' for i in range(len(file_list))],
)

### Fine tune this or look at online examples because current outputs are bad

In [22]:
# bad
input = "Python codes for web scraping"

query_result = doc_collection.query(
            query_embeddings=[getEmbeddingList(model, input)],
            n_results=1,
        )

print(query_result)

{'ids': [['id0']], 'distances': [[59.03951644897461]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['sorting.py']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [25]:
# bad
input = "new years resolutions"

query_result = doc_collection.query(
            query_embeddings=[getEmbeddingList(model, input)],
            n_results=1,
        )

print(query_result)

{'ids': [['id2']], 'distances': [[46.51013946533203]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['example.txt']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [None]:
# Now we need to get a vocal dataset from hf to do tests
