# Loading and Preprocessing Data

In [1]:
# ! pip install langchain
# ! pip install -qU langchain_community beautifulsoup4
! pip install tiktoken
! pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Collecting transformers<5.0.0,>=4.41.0
  Using cached transformers-4.46.3-py3-none-any.whl (10.0 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting Pillow
  Using cached pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.4 MB)
Collecting scipy
  Using cached scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
Collecting huggingface-hub>=0.20.0
  Using cached huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
Collecting torch>=1.11.0
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl (906.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.4/906.4 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:10[0m
Collecting filelock
  Using cached filelock-3.16.1-py3-none-

In [2]:
from langchain_ollama import OllamaLLM

class Models:
    def __init__(self):
        """Initialize the Models class."""
        self.llm = None  # Initialize llm attribute as None

    def mistral(self):
        """Create and return an instance of the Mistral LLM."""
        try:
            self.llm = OllamaLLM(base_url="http://localhost:11434", model="llama3.1", temperature=0)
            print("LLM instance created successfully.")
            return self.llm
        except Exception as e:
            print(f"An error occurred while creating the LLM instance: {e}")
            return None


# Create an instance of the Models class
llm_check = Models()  # Create an instance of the Models class

# Initialize the Mistral LLM using the instance
llm = llm_check.mistral()

if llm:
    print("LLM is ready for use!")
else:
    print("Failed to initialize the LLM.")


LLM instance created successfully.
LLM is ready for use!


In [3]:
from langchain_community.document_loaders import WebBaseLoader
#Initialize the WebBaseLoader
loader = WebBaseLoader("https://dmap.ncdr.nat.gov.tw/1109/disaster-topics/%E7%81%BD%E5%AE%B3%E8%AD%A6%E6%88%92%E5%80%BC/")
#$Load data and store it in a variabe
data = loader.load()
#display data
data

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://dmap.ncdr.nat.gov.tw/1109/disaster-topics/%E7%81%BD%E5%AE%B3%E8%AD%A6%E6%88%92%E5%80%BC/', 'title': '3D災害潛勢地圖', 'description': '3D災害潛勢地圖，「災害潛勢」指某一地區過去曾發生災害，或未來有較高的致災機會。', 'language': 'zh-hant'}, page_content='\n\n\n\n\n\n3D災害潛勢地圖\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n        XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;visibility:hidden">\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n3D災害潛勢地圖\n\n\n\n\n\n\n\n情境說明\n\n情境應用\n背景說明\n最新消息\n\n\n\n災害主題\n\n哪裡容易淹水（淹水潛勢）\n山崩、土石流（坡地災害潛勢）\n斷層、土壤液化\n海岸災害 海嘯溢淹\n火山災害\n災害警戒值\n\n\n\n地圖查詢\n\n\n資料分析與下載\n\n縣市層級\n鄉鎮層級\n聚落層級\n警戒值查詢\n下載清單\n\n\n\n相關連結\n\n本中心其他網站\n部會署防災相關\n國外防救災網站\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n情境說明\n\n情境應用\n背景說明\n最新消息\n\n\n\n災害主題\n\n哪裡容易淹水（淹水潛勢）\n山崩、土石流（坡地災害潛勢）\n斷層、土壤液化\n海岸災害 海嘯溢淹\n火山災害\n災害警戒值\n\n\n\n地圖查詢\n\n\n資料分析與下載\n\n縣市層級\n鄉鎮層級\n聚落層級\n警戒值查詢\n下載清單\n\n

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
#Initialize the text splitter with specified parameter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap= 20
)

In [5]:
#chunk the loaded document into smaller chunk
chunks = text_splitter.split_documents(data)
len(chunks)

30

## Data Embedding and Vector Databases

- Once document has been successfully loaded and chunks into smaller part, we can:
1 :
  Choose an Embedding model to transform this human text  into vector, there, will be store in:
2 : Vector database, the vector embeddings  will be stored in a vector store. For this   purpose, we will be using ChromaDB.

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

# Specify the model name and additional arguments
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device' : 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

# Initialize HuggingFace Embeddings
hf = HuggingFaceEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

  hf = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [7]:
embeb = hf.embed_documents(texts=['h','e'])
#print lenght of one of embedding to check its dimension
print(len(embeb[1]))

384
