In [1]:
%pip install --quiet "evadb[document,notebook]"
import evadb
import subprocess
import tempfile
import os
import shutil
import mimetypes
import nbformat
import csv

In [2]:
cursor = evadb.connect().cursor()

In [10]:
os.environ['OPENAI_KEY'] = 'sk-replace-me'

In [11]:
def load_repository(cursor, repo_url):
  temp_dir = tempfile.mkdtemp()
  target_directory = "repo"
  git_clone_command = ["git", "clone", repo_url, target_directory]
  subprocess.check_call(git_clone_command, cwd=temp_dir)
  repo_path = os.path.join(temp_dir, target_directory)

  id = 1
  rows = [['id', 'name', 'text']]

  for root, dirs, files in os.walk(repo_path):
      dirs[:] = [d for d in dirs if not d.startswith('.')]
      files = [f for f in files if not f.startswith('.')]

      for file in files:
        if not any(d.startswith('.') for d in root.split(os.path.sep)):
          file_path = os.path.join(root, file)
          mime_type, _ = mimetypes.guess_type(file_path)
          rel_path = os.path.relpath(os.path.join(root, file), repo_path)
          is_text_file = mime_type and mime_type.startswith('text/')
          if is_text_file:
            with open(file_path, 'r', encoding='utf-8') as file:
              file_content = file.read()
            rows.append([id, rel_path, file_content])
            id += 1

          elif file_path.endswith('.ipynb'):
            with open(file_path, 'r', encoding='utf-8') as file:
              notebook_content = nbformat.read(file, as_version=4)

            file_content = ''
            for cell in notebook_content['cells']:
              if cell.cell_type == 'markdown' or cell.cell_type == 'code':
                file_content += cell.source
                file_content += "\n\n"
            rows.append([id, rel_path, file_content])
            id += 1

  csv_file = os.path.join(temp_dir, "output.csv")

  with open(csv_file, mode="w", newline="") as file:
      writer = csv.writer(file)
      for row in rows:
          writer.writerow(row)

  cursor.query('''
  DROP TABLE IF EXISTS repository
  ''').df()

  cursor.query('''
  CREATE TABLE repository
  (id INTEGER,
  name TEXT(150),
  text TEXT(150000))
  ''').df()

  cursor.query(f'''
  LOAD CSV '{csv_file}' INTO repository
  ''').df()

  shutil.rmtree(temp_dir)

In [5]:
load_repository(cursor, "https://github.com/microsoft/AI-For-Beginners.git")

In [6]:
response = cursor.query('''
  SELECT ChatGPT('What is this lesson about?', text)
  FROM repository LIMIT 10
''').df()

In [7]:
print(response)

                                    chatgpt.response
0  This lesson is about Genetic Algorithms (GA), ...
1  This lesson is about solving Diophantine equat...
2  This lesson is about using Q-Learning to teach...
3  This lesson is about training a reinforcement ...
4  This lesson is about training a reinforcement ...
5  This lesson is about implementing the REINFORC...
6  This lesson is about training a reinforcement ...
7  This lesson is about text classification using...
8  This lesson is about text classification using...
9  This lesson is about experimenting with OpenAI...


In [8]:
response2 = cursor.query('''
  SELECT ChatGPT('What is this lesson about?', text)
  FROM repository WHERE name = 'lessons/5-NLP/16-RNN/RNNPyTorch.ipynb'
''').df()

In [9]:
print(response2["chatgpt.response"][0])

This lesson is about recurrent neural networks (RNNs) and their applications in natural language processing tasks. It covers the basics of RNNs, including how they capture the order of words in a sequence, and introduces two popular types of RNN architectures: simple RNN and Long Short Term Memory (LSTM). The lesson also discusses the challenges of training RNNs and introduces the concept of packed sequences to handle variable-length input. Additionally, it explores bidirectional and multilayer RNNs and mentions that RNNs can be used for tasks beyond sequence classification, such as text generation and machine translation.
