In [3]:
%pip install --quiet "llmx" "evadb[document,notebook]" "llama-cpp-python"
import evadb
import subprocess
import tempfile
import os
import shutil
import mimetypes
import nbformat
import csv
import tiktoken

In [4]:
cursor = evadb.connect().cursor()

In [5]:
os.environ['OPENAI_KEY'] = 'sk-replace-me'
os.environ['OPENAI_API_KEY'] = os.environ['OPENAI_KEY']
from openai import OpenAI
client = OpenAI()

In [6]:
def load_repository(cursor, repo_url):
  temp_dir = tempfile.mkdtemp()

  target_directory = "repo"
  git_clone_command = ["git", "clone", repo_url, target_directory]
  subprocess.check_call(git_clone_command, cwd=temp_dir)
  repo_path = os.path.join(temp_dir, target_directory)

  id = 1
  rows = [['id', 'name', 'text', 'embeddings']]

  for root, dirs, files in os.walk(repo_path):
      dirs[:] = [d for d in dirs if not d.startswith('.')]
      files = [f for f in files if not f.startswith('.')]

      for file in files:
        if not any(d.startswith('.') for d in root.split(os.path.sep)):
          file_path = os.path.join(root, file)
          mime_type, _ = mimetypes.guess_type(file_path)
          rel_path = os.path.relpath(os.path.join(root, file), repo_path)
          is_text_file = mime_type and mime_type.startswith('text/')

          if is_text_file:
            with open(file_path, 'r', encoding='utf-8') as file:
              file_content = file.read()

          elif file_path.endswith('.ipynb'):
            with open(file_path, 'r', encoding='utf-8') as file:
              notebook_content = nbformat.read(file, as_version=4)
            file_content = ''
            for cell in notebook_content['cells']:
              if cell.cell_type == 'markdown' or cell.cell_type == 'code':
                file_content += cell.source
                file_content += "\n\n"
          else:
            break

          embedding_text = file_content.replace("\n", " ")

          encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
          token_count = len(encoding.encode(embedding_text))

          if token_count < 4097:
            embedding_response = client.embeddings.create(input = [embedding_text], model="text-embedding-ada-002")
            embeddings = embedding_response.data[0].embedding
          else:
            embeddings = []

          rows.append([id, rel_path, file_content, embeddings])
          id += 1


  csv_file = os.path.join(temp_dir, "output.csv")

  with open(csv_file, mode="w", newline="") as file:
      writer = csv.writer(file, delimiter=",")
      for row in rows:
          writer.writerow(row)

  cursor.query('''
  DROP TABLE IF EXISTS repository
  ''').df()

  cursor.query('''
  CREATE TABLE repository
  (id INTEGER,
  name TEXT(150),
  text TEXT(150000),
  embeddings TEXT(150000))
  ''').df()

  cursor.query(f'''
  LOAD CSV '{csv_file}' INTO repository
  ''').df()

  shutil.rmtree(temp_dir)

In [7]:
load_repository(cursor, "https://github.com/microsoft/AI-For-Beginners.git")

In [8]:
cursor.query('''
SELECT * FROM repository
''').df()

Unnamed: 0,_row_id,id,name,text,embeddings
0,1,1,README.md,[![GitHub license](https://img.shields.io/gith...,[]
1,2,2,SECURITY.md,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.7 BLOCK ...,"[0.011329046450555325, -0.02534128911793232, -..."
2,3,3,index.html,"<!DOCTYPE html>\n<html lang=""en"">\n\n<head>\n ...","[-0.002440105425193906, 0.01794261857867241, -..."
3,4,4,lessons/README.md,# Overview\n\n![Overview in a doodle](sketchno...,"[-0.013702348805963993, -0.0089979637414217, 0..."
4,5,5,lessons/3-NeuralNetworks/README.md,# Introduction to Neural Networks\n\n![Summary...,"[-0.01818559691309929, 0.02442140318453312, 0...."
...,...,...,...,...,...
127,128,128,etc/quiz-app/dist/js/chunk-vendors.c1571e8f.js,"(window[""webpackJsonp""]=window[""webpackJsonp""]...",[]
128,129,129,etc/quiz-app/dist/css/app.67375a05.css,"html{font-family:Avenir,Helvetica,Arial,sans-s...","[-0.01229125913232565, 0.02682225964963436, 0...."
129,130,130,etc/quiz-src/qzmkjson.py,src = 'questions-en.txt'\ndst_dir = '../quiz-a...,"[-0.0046498337760567665, 0.01808079145848751, ..."
130,131,131,etc/quiz-src/questions-en.txt,Lesson 1B Introduction to AI: Pre Quiz\n* A fa...,"[-0.005444785580039024, -0.01672228053212166, ..."


In [9]:
cursor.query('''
DROP FUNCTION IF EXISTS Embeddings;
''').df()

cursor.query('''
CREATE FUNCTION Embeddings
IMPL  'drive/MyDrive/Colab Notebooks/embeddings.py';
''').df()

Unnamed: 0,0
0,Function Embeddings added to the database.


In [10]:
cursor.query('''
DROP FUNCTION IF EXISTS EvaLlama;
''').df()

cursor.query('''
CREATE FUNCTION EvaLlama
IMPL  'drive/MyDrive/Colab Notebooks/llama.py';
''').df()

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


Unnamed: 0,0
0,Function EvaLlama added to the database.


In [11]:
cursor.query('''
SELECT name, text, Embeddings("what are the Principles of Responsible AI?", embeddings) FROM repository ORDER BY distance DESC LIMIT 5;
''').df()

Unnamed: 0,name,text,distance
0,lessons/7-Ethics/README.md,# Ethical and Responsible AI\n\nYou have almos...,0.890151
1,lessons/1-Intro/README.md,# Introduction to AI\n\n![Summary of Introduct...,0.813416
2,lessons/1-Intro/translations/README.ja.md,# AI入門と基礎知識\n\n![Summary of Introduction of AI...,0.806615
3,etc/quiz-src/questions-en.txt,Lesson 1B Introduction to AI: Pre Quiz\n* A fa...,0.794848
4,etc/Mindmap.md,# AI\n\n## [Introduction to AI](https://github...,0.78655


In [12]:
cursor.query('''

SELECT ChatGPT('what are the Principles of Responsible AI?', s.text) FROM
(SELECT Embeddings("what are the Principles of Responsible AI?", embeddings), name, text FROM repository ORDER BY distance DESC LIMIT 1) AS s;
''').df()

Unnamed: 0,response
0,The Principles of Responsible AI are as follow...


In [None]:
cursor.query('''
SELECT EvaLlama('what are the five Principles of Responsible AI?', s.text) FROM
(SELECT Embeddings("what are the five Principles of Responsible AI?", embeddings), name, text FROM repository ORDER BY distance DESC LIMIT 1) AS s;
''').df()

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
