In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load your data

In [12]:
# Create the loader for the labour-code PDF; the document itself is fetched by
# the later `loader.load()` call.
# NOTE(review): "multicolunm" in the file name looks like a typo of
# "multicolumn" - it is left as-is because it has to match the file on disk.
loader = UnstructuredPDFLoader("pdf-files/multicolunm-code-travail-cmr.pdf")

# Tests: converting the PDF pages to images with pdf2image (requires Poppler)

In [4]:
import pdf2image
from pdf2image import convert_from_path
import os

# Convert the pages of the PDF to images with pdf2image.
# pdf2image needs the Poppler binaries, and their location is machine-specific:
# it can be overridden through the POPPLER_PATH environment variable, while the
# original Windows install location stays the default so existing setups keep
# working unchanged.
poppler_bin_dir = os.environ.get("POPPLER_PATH", r'C:\poppler-0.68.0\bin')
pages = convert_from_path(
    "pdf-files/multicolunm-code-travail-cmr.pdf",
    poppler_path=poppler_bin_dir,
)

In [5]:
# Number of items returned by convert_from_path
# (presumably one image per PDF page - confirm).
len(pages)

31

In [10]:
import os

# Convert the PDF again, this time asking pdf2image to write JPEG files into
# an output folder.
# pdf2image expects that folder to exist already, so create it up front;
# otherwise the cell fails on a fresh checkout where "images/" is missing.
# NOTE(review): unlike the earlier conversion cell, no poppler_path is passed
# here, so the Poppler binaries must be discoverable on PATH.
images_output_dir = "images/"
os.makedirs(images_output_dir, exist_ok=True)
images_from_path = convert_from_path(
    "pdf-files/multicolunm-code-travail-cmr.pdf",
    output_folder=images_output_dir,
    fmt="jpeg",
)

In [5]:
import os
# List the individual PATH entries (presumably to check whether the Poppler
# binaries are reachable - confirm).
# os.pathsep is the platform's PATH separator (";" on Windows, ":" on POSIX),
# so the split is correct on every OS rather than only on Windows.
os.environ["PATH"].split(os.pathsep)

['C:\\Program Files\\Common Files\\Oracle\\Java\\javapath',
 'C:\\Program Files (x86)\\Common Files\\Oracle\\Java\\javapath',
 'C:\\Program Files (x86)\\Intel\\iCLS Client\\',
 'C:\\Program Files\\Intel\\iCLS Client\\',
 'C:\\WINDOWS\\system32',
 'C:\\WINDOWS',
 'C:\\WINDOWS\\System32\\Wbem',
 'C:\\WINDOWS\\System32\\WindowsPowerShell\\v1.0\\',
 'C:\\Program Files (x86)\\Intel\\Intel(R) Management Engine Components\\DAL',
 'C:\\Program Files\\Intel\\Intel(R) Management Engine Components\\DAL',
 'C:\\Program Files (x86)\\Intel\\Intel(R) Management Engine Components\\IPT',
 'C:\\Program Files\\Intel\\Intel(R) Management Engine Components\\IPT',
 'C:\\WINDOWS\\System32\\OpenSSH\\',
 'C:\\Program Files\\Intel\\WiFi\\bin\\',
 'C:\\Program Files\\Common Files\\Intel\\WirelessCommon\\',
 'C:\\Program Files\\Git\\cmd',
 'C:\\Program Files\\Docker\\Docker\\resources\\bin',
 'C:\\ProgramData\\DockerDesktop\\version-bin',
 'C:\\Program Files\\Amazon\\AWSCLIV2\\',
 'C:\\Program Files\\nodejs\\',
 

# Continue: load the PDF text with the loader created above

In [13]:
# Run the loader. The result is used below as a sequence (len(), [0]) whose
# items expose a `.page_content` string.
data = loader.load()

In [7]:
# Sanity check on what the loader returned: how many documents there are and
# how long the text of the first one is.
document_count = len(data)
print(f'You have {document_count} document(s) in your data')

# Computed only after the first line is printed, so an empty `data` still
# reports "0 document(s)" before the indexing error surfaces.
first_document_length = len(data[0].page_content)
print(f'There are {first_document_length} characters in your document')

You have 1 document(s) in your data
There are 119365 characters in your document


In [None]:
# Save the extracted text of the first document to a plain-text file.
file_path = 'ulang-multicolumn-code-travail-cmr.txt'

# Mode 'w' truncates an existing file, so no explicit delete is needed first.
# The encoding is pinned to UTF-8: without it Python falls back to the locale
# encoding (e.g. cp1252 on Windows), which can raise UnicodeEncodeError or
# produce a file that reads back differently on another machine.
with open(file_path, 'w', encoding='utf-8') as out_file:
    out_file.write(data[0].page_content)

# Chunk your data up into smaller documents

In [8]:
# Split the loaded document(s) into smaller chunks for embedding.
# chunk_size is presumably measured in characters (the splitter's default
# length function - confirm); chunk_overlap=0 means chunks share no text.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(data)

In [9]:
# Report how many chunks the splitter produced.
print (f'Now you have {len(texts)} documents')

Now you have 136 documents


In [10]:
# Show the first chunk to eyeball its content and metadata.
# NOTE(review): the recorded output lists 'Cameroun-Code-1992-travail.pdf' as
# the source, which is not the file passed to the loader in this notebook -
# the output looks stale; re-run on a fresh kernel to confirm.
texts[0]

Document(page_content='Cameroun Code du Travail Loi n°92-007 du 14 août 1992\n\nLoi n°92-007 du 14 août 1992', lookup_str='', metadata={'source': 'Cameroun-Code-1992-travail.pdf'}, lookup_index=0)

# Create embeddings of your documents to get ready for semantic search

In [11]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [12]:
def _read_first_line(path):
    """Return the first line of the text file at `path`, stripped of whitespace.

    The file is opened in a `with` block so the handle is closed right away
    instead of being left for the garbage collector.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    with open(path) as key_file:
        return key_file.readline().strip()


# API credentials are kept out of the notebook: each key is the first line of
# a local text file.
OPENAI_API_KEY = _read_first_line('openai.YEK.IPA.txt')
PINECONE_API_KEY = _read_first_line('pinecone.YEK.IPA.txt')
PINECONE_API_ENV = 'us-east-1-aws'

In [13]:
# Embeddings client authenticated with the OpenAI key read above; it is handed
# to Pinecone.from_texts further down.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [14]:
# Connect the Pinecone client.
#   api_key     - found at app.pinecone.io
#   environment - shown next to the API key in the Pinecone console
pinecone_settings = {
    "api_key": PINECONE_API_KEY,
    "environment": PINECONE_API_ENV,
}
pinecone.init(**pinecone_settings)

# Name of the Pinecone index the chunks are written to.
index_name = "langchain2"

In [15]:
# Embed the chunks and store them in the Pinecone index.
# Only the raw text of each chunk is passed on; the chunk metadata is not
# handed to from_texts.
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)