### Install Libraries

In [4]:
! pip install langchain langchain-community openai groq tiktoken pinecone-client langchain_pinecone unstructured pdfminer==20191125 pdfminer.six==20221105 pillow_heif unstructured_inference sentence-transformers python-dotenv kaggle



In [9]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, WebBaseLoader, YoutubeLoader, DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
# from google.colab import userdata
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from openai import OpenAI
import numpy as np
import tiktoken
import os
from groq import Groq
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check,
            assigning ``None`` into ``os.environ`` fails with an opaque
            ``TypeError`` that does not say which key is missing.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"{name} is not set. Add it to your .env file or export it "
            "before running this notebook."
        )
    return value


pinecone_api_key = _require_env("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# openai_api_key = userdata.get("OPENAI_API_KEY")
# os.environ['OPENAI_API_KEY'] = openai_api_key
# openai_client = OpenAI()

groq_api_key = _require_env("GROQ_API_KEY")
os.environ['GROQ_API_KEY'] = groq_api_key

### Initialize HuggingFace Embeddings client

In [10]:
# LangChain embeddings wrapper configured with the all-MiniLM-L6-v2 sentence-transformers model.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
# Short probe string used to sanity-check the embedding model.
sample_text = "Hello World!"

# Embed the single string; the result is inspected in the following cells.
query_result = embeddings.embed_query(sample_text)

In [15]:
# Show the raw embedding values produced for the sample text.
query_result

[-0.020386816933751106,
 0.025280792266130447,
 -0.0005662452895194292,
 0.011615419760346413,
 -0.03798845037817955,
 -0.11998119205236435,
 0.04170941933989525,
 -0.02085712179541588,
 -0.05900676175951958,
 0.024232570081949234,
 0.0621202290058136,
 0.06767992675304413,
 0.033100228756666183,
 -0.010369383729994297,
 -0.03121573105454445,
 -0.032733216881752014,
 -0.0021117650903761387,
 0.009261962957680225,
 -0.12476455420255661,
 0.011236815713346004,
 0.03904544934630394,
 0.054402485489845276,
 -0.0028255314100533724,
 0.044556282460689545,
 -0.08542022109031677,
 -0.02287369966506958,
 0.039140552282333374,
 0.03604685142636299,
 -0.03212680667638779,
 -0.06425869464874268,
 0.05812907963991165,
 0.04669088125228882,
 0.08061561733484268,
 -0.007734259124845266,
 -0.02208320051431656,
 0.06713147461414337,
 -0.04504144564270973,
 -0.10212123394012451,
 0.001264391466975212,
 0.04680197685956955,
 0.02639591135084629,
 -0.06990957260131836,
 -0.04453347250819206,
 -0.006901960

In [16]:
# Number of values in the embedding vector.
len(query_result)

384

### Initialize Groq client

In [17]:
# Free Llama 3.1 API via Groq
# The client is authenticated with the key read from the GROQ_API_KEY environment variable.

groq_client = Groq(api_key=os.getenv('GROQ_API_KEY'))

### Calculate sentence similarity using embeddings

In [22]:
# model_name -> loaded SentenceTransformer, so each model is constructed only once
# per kernel session instead of on every call.
_SENTENCE_TRANSFORMERS = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode ``text`` with a SentenceTransformer model.

    The model for a given ``model_name`` is created on first use and reused on
    later calls; the original implementation rebuilt it every time.

    Args:
        text: Input passed straight through to ``model.encode``.
        model_name: Identifier of the SentenceTransformer model to use.

    Returns:
        Whatever ``model.encode(text)`` returns for the selected model.
    """
    model = _SENTENCE_TRANSFORMERS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_TRANSFORMERS[model_name] = model
    return model.encode(text)


def cosine_similarity_between_sentences(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Both embeddings are printed before the score is computed.

    Args:
        sentence1: First sentence to embed.
        sentence2: Second sentence to embed.

    Returns:
        The single similarity value taken from position ``[0][0]`` of the
        matrix returned by ``cosine_similarity``.
    """
    # Embed each sentence (first, then second) and lay it out as a single-row
    # 2-D array, which is the shape passed to cosine_similarity below.
    first_vec, second_vec = (
        np.array(get_huggingface_embeddings(sentence)).reshape(1, -1)
        for sentence in (sentence1, sentence2)
    )

    print("Embedding for Sentence 1:", first_vec)
    print("\nEmbedding for Sentence 2:", second_vec)

    # One row against one row gives a 1x1 matrix; unwrap its only entry.
    score_matrix = cosine_similarity(first_vec, second_vec)
    return score_matrix[0][0]


# Example case: two sentences that differ only in the player's name
sentence1 = "Messi is the GOAT of soccer"
sentence2 = "Ronaldo is the GOAT of soccer"


# Compute the similarity score and print it formatted to four decimal places
similarity = cosine_similarity_between_sentences(sentence1, sentence2)
print(f"\n\nCosine similarity between '{sentence1}' and '{sentence2}': {similarity:.4f}")



Embedding for Sentence 1: [[ 9.38195735e-02  7.31627047e-02 -4.58354205e-02 -4.08518054e-02
   1.01908799e-02  4.66041155e-02  5.36862314e-02  7.47547224e-02
   9.87894386e-02  5.35894521e-02  2.60681789e-02 -1.04978532e-01
   3.10286563e-02  6.27063513e-02  1.17027663e-01 -1.49084497e-02
  -6.79235533e-02 -5.16762994e-02  2.14818809e-02 -1.13156162e-01
  -9.03053023e-03 -4.20069695e-03 -2.14867163e-02 -5.45071959e-02
   1.82195231e-02 -9.34087783e-02  2.18170658e-02  4.86695245e-02
  -2.13192552e-02 -2.66328845e-02  5.13898134e-02  1.55285308e-02
   1.21598840e-02 -6.90901354e-02 -1.42612457e-02  9.17946622e-02
   6.93361610e-02  5.23859039e-02  1.12035811e-01 -8.83120857e-03
  -3.44977453e-02 -2.42918395e-02 -1.06337806e-02 -1.37211597e-02
   1.00750811e-01 -1.39364842e-02 -1.34442924e-02 -3.35518308e-02
  -4.18993039e-03 -1.22694569e-02 -3.87586728e-02  4.82078493e-02
   8.37437883e-02 -2.73859967e-02  7.97670148e-03  6.01575971e-02
   2.76909415e-02 -3.90203670e-02  4.72285934e-02 

### Load Sample Dataset

In [27]:
! pip install kaggle

! kaggle datasets download -d ayoubcherguelaine/company-documents-dataset





ERROR: Could not find a version that satisfies the requirement git (from versions: none)
ERROR: No matching distribution found for git


Dataset URL: https://www.kaggle.com/datasets/ayoubcherguelaine/company-documents-dataset
License(s): apache-2.0
company-documents-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


### Unzip Files

In [29]:
import zipfile

# Location of the dataset archive, relative to the working directory
zip_file_path = "company-documents-dataset.zip"

# Extract every member of the archive into the current working directory
with zipfile.ZipFile(zip_file_path, mode="r") as archive:
    archive.extractall(path=".")

In [30]:
def process_directory(directory_path):
    """Load every PDF found under ``directory_path`` (recursively) with PyPDFLoader.

    Files whose name does not end in ``.pdf`` (case-insensitive) are skipped
    instead of being handed to the PDF loader, and folders/files are visited in
    sorted order so the result is the same on every run and platform.

    Args:
        directory_path: Root folder to walk.

    Returns:
        list[dict]: One ``{"File": <path>, "Data": <loader.load() result>}``
        entry per PDF. Empty when no PDF is found.
    """
    data = []
    for root, dirs, files in os.walk(directory_path):
        # Sorting in place steers os.walk's descent order deterministically.
        dirs.sort()
        for file in sorted(files):
            # Skip non-PDF files (e.g. .DS_Store, stray text files): they are
            # not input for a PDF loader.
            if not file.lower().endswith(".pdf"):
                continue

            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            loader = PyPDFLoader(file_path)
            data.append({"File": file_path, "Data": loader.load()})

    return data

# Root folder of the dataset files (presumably created by extracting the dataset archive — verify)
directory_path = "CompanyDocuments"
# One {"File", "Data"} record per file that process_directory loads
documents = process_directory(directory_path)

Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2016-07.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2016-08.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2016-09.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2016-10.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2016-11.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2016-12.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2017-01.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2017-02.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2017-03.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2017-04.pdf
Processing file: CompanyDocuments\Inventory Report\monthly\monthly\StockReport_2017-05.pdf

In [31]:
# Inspect what was loaded: a list of {"File": ..., "Data": ...} dicts, one per processed file
documents

[{'File': 'CompanyDocuments\\Inventory Report\\monthly\\monthly\\StockReport_2016-07.pdf',
  'Data': [Document(metadata={'source': 'CompanyDocuments\\Inventory Report\\monthly\\monthly\\StockReport_2016-07.pdf', 'page': 0}, page_content="Stock Report for 2016-07\nCategory\nProduct\nUnits Sold\nUnits in Stock\nUnit Price\nBeverages\nChang\n105\n17\n19\nBeverages\nChartreuse verte\n48\n69\n18\nBeverages\nGuaraná Fantástica\n43\n20\n4.5\nBeverages\nLakkalikööri\n15\n57\n18\nBeverages\nOutback Lager\n41\n15\n15\nBeverages\nSteeleye Stout\n20\n20\n18\nCondiments\nChef Anton's Gumbo Mix\n77\n0\n21.35\nCondiments\nLouisiana Fiery Hot..\n35\n76\n21.05\nCondiments\nOriginal Frankfurter..\n27\n32\n13\nConfections\nMaxilaku\n40\n10\n20\nConfections\nPavlova\n95\n29\n17.45\nConfections\nSchoggi Schokolade\n25\n49\n43.9\nConfections\nSir Rodney's Marmalade\n40\n40\n81\nConfections\nSir Rodney's Scones\n30\n3\n10\nConfections\nTarte au sucre\n15\n17\n49.3\nDairy Products\nCamembert Pierrot\n40\n19\n