In [7]:
# pip install --upgrade langchain langchain-community

In [1]:
# imports

import os
import re
import math
import json
from tqdm import tqdm
import random
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from datasets import load_dataset
import chromadb
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import glob

In [2]:
# environment

# Read variables from a .env file; override=True lets its values replace
# any that are already present in the process environment.
load_dotenv(override=True)

# Ensure both keys exist in the environment, filling in a placeholder string
# only when a key is not already set.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ.setdefault('HF_TOKEN', 'your-key-if-not-using-env')

# Name used for this notebook's database.
DB = "agile_process"

In [3]:
# Log in to HuggingFace

# Fetch the token from the environment (raises KeyError if HF_TOKEN is absent)
# and authenticate with the Hugging Face Hub, also storing it as a git credential.
hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# Load all DOCX files from the Documents folder.
# glob.glob() yields matches in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the document loading order reproducible across runs.
files = sorted(glob.glob("Documents/*.docx"))

# Collects every loaded document; starts out empty each time this cell runs.
documents = []

def add_metadata(doc, doc_type):
    """Record ``doc_type`` under the "doc_type" key of ``doc.metadata``.

    The document is modified in place and the same object is returned,
    so the call can be used inside an expression.
    """
    metadata = doc.metadata
    metadata["doc_type"] = doc_type
    return doc

# Load each file in turn; a failure on one file is reported and does not stop the rest.
for file_path in files:
    try:
        # The file name without directory or extension serves as the doc_type label.
        doc_type = os.path.splitext(os.path.basename(file_path))[0]
        loader = UnstructuredWordDocumentLoader(file_path)
        doc = loader.load()
        # Tag everything the loader returned, then add it all to the running list.
        documents += [add_metadata(d, doc_type) for d in doc]
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

# Cut the loaded documents into chunks (chunk_size=1000, chunk_overlap=200)
text_splitter = CharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunks = text_splitter.split_documents(documents)

# Summarise the result: chunk count and the distinct doc_type labels seen.
doc_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {doc_types}")


Total number of chunks: 17
Document types found: {'Process_Doc'}
