In [None]:
!pip install langchain beautifulsoup4 chromadb youtube-transcript-api unstructured praw tqdm

In [1]:
import requests
from bs4 import BeautifulSoup
import praw

from langchain.document_loaders import (
    YoutubeLoader,
    UnstructuredURLLoader
)
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from tqdm import tqdm

In [2]:
# Chunking strategy shared by every source below: split on single spaces into
# pieces of at most 2000 characters, carrying 50 characters of overlap between
# consecutive chunks.
splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=2000,
    chunk_overlap=50,
)

# Identifiers of the YouTube videos to ingest (passed as-is to YoutubeLoader).
youtube_videos = [
    "DKuGhNl2ACw",
    "oePpR0W-tXQ",
    "RemlqHZiWlw",
    "tVVYpD4joB0",
    "OzfKAT2PZk4",
    "VJfMi9szpBg",
    "ivWC5WncvsM",
    "LjZlAzr8MuQ",
    "-R0LvgywiWk",
    "tSCuRXnfLuI",
    "uj8hjAjI7p4",
    "gqK3dCpwzxE",
    "b18OH-7A1zo",
    "8dHyDCb8-vE",
    "lwXSR4-nq2U",
]

# Youtube: load the documents for each video, chunk them with the shared
# splitter and accumulate every chunk in a single flat list.
yt_data_split = []
for video_id in youtube_videos:
    video_docs = YoutubeLoader(video_id).load()
    yt_data_split.extend(splitter.split_documents(video_docs))
print(len(yt_data_split))

103


In [3]:
# Web pages to ingest; each URL is fetched and parsed by UnstructuredURLLoader.
website_urls = [
    "https://www.crxsi.com/repair/timing.belt/",
    "https://www.crxcommunity.com/threads/how-to-replace-the-timing-belt-and-waterpump.500/",
    "https://www.hondapartsonline.net/blog/how-to-change-the-spark-plugs-in-your-honda-civic",
    "https://www.crxcommunity.com/threads/how-to-remove-the-transmission-manual.51949/",
    "https://www.crxcommunity.com/threads/how-to-replace-the-clutch-and-flywheel.227/",
    "https://www.crxcommunity.com/threads/how-to-check-your-ignition-timing.16915/",
    "https://www.crxcommunity.com/threads/obd0-ecu-codes-and-more.9903/",
]

# Load all pages in one go, then chunk them with the shared splitter.
website_loader = UnstructuredURLLoader(urls=website_urls)
website_data = website_loader.load()
website_data_split = splitter.split_documents(documents=website_data)

# Report how many chunks the web pages produced.
print(len(website_data_split))



54


In [4]:
# Stackexchange: collect answered questions that have an accepted answer from
# the "mechanics" site, highest-voted first, and turn each one into a Document
# whose text is "Title / Question / Answer" and whose source is the question URL.
api_url = "https://api.stackexchange.com/2.3/questions"  # loop-invariant endpoint
so_data = []
for i in range(1, 20):
    # Query parameters for page `i` (100 questions per page).
    params = {
        "order": "desc",
        "sort": "votes",
        "filter": "!-MBrU_IzpJ5H-AG6Bbzy.X-BYQe(2v-.J",
        "site": "mechanics",
        "pagesize": 100,
        "page": i,
    }
    # Fail loudly on HTTP errors (e.g. throttling) instead of hitting an opaque
    # KeyError on "items" further down; the timeout prevents a stalled
    # connection from hanging the notebook indefinitely.
    response = requests.get(api_url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    for question in data["items"]:
        # Keep only resolved questions.
        if not (question["is_answered"] and question.get("accepted_answer_id")):
            continue

        # The accepted answer may be absent from the returned "answers" list;
        # skip such questions rather than crashing with an IndexError.
        accepted_body = next(
            (
                answer["body"]
                for answer in question.get("answers", [])
                if answer.get("is_accepted")
            ),
            None,
        )
        if accepted_body is None:
            continue

        # Strip HTML markup. The parser is named explicitly so the result does
        # not depend on which optional parsers happen to be installed.
        question_text = BeautifulSoup(question["body"], "html.parser").get_text()
        answer_text = BeautifulSoup(accepted_body, "html.parser").get_text()

        # Build one plain string. The previous code built a tuple and stored
        # its repr (parentheses, quotes and escaped "\n") as the page content.
        text = (
            f"Title: {question['title']}\n"
            f"Question: {question_text}\n"
            f"Answer: {answer_text}"
        )
        so_data.append(
            Document(page_content=text, metadata={"source": question["link"]})
        )

    # Stop early once the API reports there are no further pages. If the
    # response omits "has_more", keep going as before.
    if not data.get("has_more", True):
        break
print(len(so_data))


1294


In [5]:
# Define embedding model
embeddings = OllamaEmbeddings(
    model = "llama2",
    num_thread = 4,
    num_gpu = 1
)

# Combine the chunks from all three sources into one corpus.
content_data = website_data_split + yt_data_split + so_data
content_num = len(content_data)

# Open the persisted Chroma store once, then add the corpus in small batches so
# tqdm can report progress. Previously Chroma.from_documents was called for
# every batch, which re-opened the store each time and left `db` undefined when
# the corpus was empty.
# NOTE(review): re-running this cell presumably appends the same documents to
# ./chroma_db again -- clear the directory first if a fresh index is wanted.
batch_size = 10
db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
batches = [content_data[i:i + batch_size] for i in range(0, content_num, batch_size)]
for batch in tqdm(batches, desc="Processing batches"):
    db.add_documents(batch)

Processing batches:   0%|                                                                          | 0/146 [00:00<?, ?it/s]

Processing batches: 100%|██████████████████████████████████████████████████████████████| 146/146 [1:18:55<00:00, 32.44s/it]
