# Data Ingestion and Vectorisation

Document loaders load data from a source into a `Document` object. A `Document` contains a piece of text and associated metadata. <br>
The format of `Document` is convenient for further vectorisation and analysis.

We will use the _LangChain_ library to load documents easily. See _myloadlib.py_ for details.

In [None]:
!python -m pip install langchain==0.1.0

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy

In [None]:
!pip install -U torch

In [None]:
import torch

## Set Environment

In [None]:
import os
import pandas as pd

In [None]:
import langdetect
from langdetect import DetectorFactory, detect, detect_langs

In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [None]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

In [None]:
# Embedding facilities
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Pipelines
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [None]:
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings

In [None]:
# Embedding wrapper backed by the spaCy "en_core_web_sm" pipeline.
# NOTE(review): the model must already be installed when this cell runs; the
# install cells appear further down in the notebook — confirm the run order.
embedder = SpacyEmbeddings(model_name="en_core_web_sm")

In [None]:
!pip install --upgrade --quiet  spacy

In [None]:
!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

## Load Documents

In [None]:
import myloadlib
from myloadlib import loadDir, loadFile, loadWiki, loadYoutube, readAPI

In [None]:
import importlib 
importlib.reload(myloadlib)

In [None]:
# Collect all here
documents = []

###  Load Single Files

In [None]:
file = "./data/welcome-to-cphbusiness.pdf"

In [None]:
# file = './data/cphbusiness-2025-strategi-pixi_web.pdf'

In [None]:
docs = myloadlib.loadFile(file)

In [None]:
documents.extend(docs)
len(documents)

### Test

In [None]:
# metadata of loaded Document
docs[0].metadata 

In [None]:
# content of the Document
# doc[0].page_content[:1000]
documents[0].page_content

In [None]:
documents[0].metadata

### Load Wiki

In [None]:
subject = "Copenhagen Business Academy"
# subject = pd.DataFrame(tab.name)

In [None]:
# subject = 'Cphbusiness'

In [None]:
lang = 'en'

In [None]:
docs = myloadlib.loadWiki(subject, lang, 2)

In [None]:
documents.extend(docs)

In [None]:
len(documents)

In [None]:
documents[17].page_content[50:1000]

### Load Youtube

In [None]:
# url="https://www.youtube.com/watch?v=LUCwMPLqdpA&t=12s"
url = 'https://www.youtube.com/watch?v=D04DaEzIV-A&t=4s'
save_dir="./media/"

In [None]:
url

In [None]:
lang = 'en'

In [None]:
docs = myloadlib.loadYoutube(url, lang)

In [None]:
documents.extend(docs)
len(documents)

In [None]:
# type field of the Document at index 18
documents[18].type

In [None]:
# content of the Document
# doc[0].page_content[:1000]
documents[18].page_content

In [None]:
documents[18].metadata

## Chunking

![image-2.png](attachment:image-2.png)

In [None]:
import myutils
from myutils import chunkDocs, langDetect, wordCloud

In [None]:
import importlib 
importlib.reload(myutils)

In [None]:
splits = myutils.chunkDocs(documents, 140)  
splits

In [None]:
len(splits)

In [None]:
splits[70]

In [None]:
# put the splits in dataframe
df = pd.DataFrame(splits, columns=['page_content', 'metadata', 'type'])
df.sample(5)

In [None]:
df['page_content'][0]

In [None]:
df['metadata'][0]

In [None]:
df['type'][0]

In [None]:
im, longstring = myutils.wordCloud(df, 'page_content')

In [None]:
im

In [None]:
import spacy
import langdetect
from wordcloud import WordCloud 
from langdetect import DetectorFactory, detect, detect_langs

In [None]:
# Detect the language of the text
def langDetect(text):
    """Detect the language of *text* and load a matching spaCy model.

    English is the fallback: it is used when detection fails and for every
    detected language other than Danish or Norwegian, which both map to the
    Danish model.

    Args:
        text: The string whose language should be detected.

    Returns:
        A ``(model_name, stop_words)`` tuple: the name of the spaCy model that
        was loaded and the default stop words of that model.
    """
    import logging  # local import: only needed to report a failed detection

    mylang = 'en'
    try:
        # The first candidate is treated as the detected language.
        mylang = langdetect.detect_langs(text)[0].lang
    except langdetect.lang_detect_exception.LangDetectException:
        # Keep the English fallback instead of failing on an undefined logger
        # and an unassigned stop-word list.
        logging.getLogger(__name__).debug('Language not supported')

    # Test the detected code itself (not the notebook-level `lang` variable);
    # Norwegian text is processed with the Danish model.
    if mylang in ('da', 'no'):
        default_model = 'da_core_news_md'
    else:
        default_model = 'en_core_web_md'

    nlp = spacy.load(default_model)
    stopw = nlp.Defaults.stop_words

    print('mylang = ' + mylang)
    return default_model, stopw

In [None]:
langDetect('Language not supported')

In [None]:
!python -m spacy download da_core_news_md

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
!python -m spacy validate

In [None]:
!python -m spacy download en_core_web_trf
!python -m spacy download en_core_web_md
!python -m spacy download da_core_news_sm

In [None]:
!python -m spacy info


In [None]:
import spacy
from spacy.lang.da.examples import sentences 

# Load the small Danish pipeline and run it on the first bundled example sentence
nlp = spacy.load("da_core_news_sm")
doc = nlp(sentences[0])
print(doc.text)
# Print every token together with its part-of-speech tag and dependency label
for token in doc:
    print(token.text, token.pos_,  token.dep_)

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
#import en_core_web_sm
#nlp = en_core_web_sm.load()

## Embeddings

In [None]:
# Sentence-transformers model passed to HuggingFaceEmbeddings below
model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device': 'cpu'}  # run the model on the CPU
encode_kwargs = {'normalize_embeddings': False}  # embeddings are not normalised

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
# from langchain.vectorstores import FAISS

In [None]:
# db = FAISS.from_documents(splits, embeddings)

![image.png](attachment:image.png)

## Storing the Embeddings in Vector DB

In [None]:
# db = Chroma.from_documents(splits, embeddings)

In [None]:
# Directory handed to Chroma as its on-disk location.
# NOTE(review): the input files are read from ./data/, while this path points
# one level up (../data/) — confirm that is intended.
persist_directory = '../data/chroma/'

# Create the vector store
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)
# NOTE(review): recent Chroma releases are said to write to disk automatically —
# confirm whether the explicit persist() call is still required.
vectordb.persist()

In [None]:
vectordb._collection.count()

## Similarity Search

In [None]:
query = 'Which are the parking options at Cphbusiness?'

In [None]:
answer = vectordb.similarity_search(query, k=3)
answer

In [None]:
# `db` is only created by the commented-out FAISS / Chroma cells earlier in the
# notebook, so on a fresh kernel it does not exist; fall back to the Chroma
# store `vectordb` in that case instead of raising NameError.
store = globals().get('db', vectordb)
answer = store.similarity_search(query, k=3)
answer

In [None]:
for d in answer:
    print(d.page_content)

In [None]:
for d in answer:
    print(d.metadata)
    # del(d.metadata)

## Information Retrieval

In [None]:
question = 'When is students administration open?'

In [None]:
question = 'Where are the parking places of Cphbusiness?'

In [None]:
question = 'Which are the strategic goals of Cphbusiness?'

In [None]:
# first, fetch the 5 most similar chunks, then choose the 2 most diverse
answer = vectordb.max_marginal_relevance_search(question, k=2, fetch_k=5)
for d in answer:
    print(d.page_content)

![image.png](attachment:image.png)

In [None]:
!bye

Image source: https://medium.com/@onkarmishra/using-langchain-for-question-answering-on-own-data-3af0a82789ed