In [9]:
import os
import requests
from bs4 import BeautifulSoup
import PyPDF2

# Define folder paths
Miniproject_dir = './Miniproject'
resources_dir = os.path.join(Miniproject_dir, 'resources')

# Create Miniproject folder if it doesn't exist
if not os.path.exists(Miniproject_dir):
    os.makedirs(Miniproject_dir)

# Create resources folder inside Miniproject if it doesn't exist
if not os.path.exists(resources_dir):
    os.makedirs(resources_dir)

# Load data from Wikipedia
wiki_url = 'https://en.wikipedia.org/wiki/Healthcare_in_Denmark'
wiki_response = requests.get(wiki_url)
wiki_soup = BeautifulSoup(wiki_response.text, 'html.parser')
wiki_text = wiki_soup.get_text()

# Save Wikipedia text to a file
wiki_file_path = os.path.join(resources_dir, 'wikipedia_text.txt')
with open(wiki_file_path, 'w', encoding='utf-8') as file:
    file.write(wiki_text)

# Load data from PDF
pdf_url = 'https://sum.dk/Media/C/A/Healthcare-in%20denmark%20an%20overview%20english-V16-dec.pdf'
pdf_response = requests.get(pdf_url)
pdf_file_path = os.path.join(resources_dir, 'healthcare_pdf_text.txt')
with open(pdf_file_path, 'wb') as f:
    f.write(pdf_response.content)

pdf_text = ""
with open(pdf_file_path, 'rb') as f:
    pdf_reader = PyPDF2.PdfReader(f)
    for page_num in range(len(pdf_reader.pages)):
        pdf_text += pdf_reader.pages[page_num].extract_text()

# Save PDF text to a file
with open(pdf_file_path, 'w', encoding='utf-8') as file:
    file.write(pdf_text)


In [10]:
%%markdown
# Text processing


# Text processing



In [11]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (run only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define folder paths
Miniproject_dir = 'Miniproject'
resources_dir = os.path.join(Miniproject_dir, 'resources')

# Load Wikipedia text data
wiki_file_path = os.path.join(resources_dir, 'wikipedia_text.txt')
with open(wiki_file_path, 'r', encoding='utf-8') as file:
    wiki_text = file.read()

# Load PDF text data
pdf_file_path = os.path.join(resources_dir, 'healthcare_pdf_text.txt')
with open(pdf_file_path, 'r', encoding='utf-8') as file:
    pdf_text = file.read()

# Text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove punctuation and digits
    table = str.maketrans('', '', string.punctuation + string.digits)
    tokens = [token.translate(table) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

# Preprocess Wikipedia text
preprocessed_wiki_text = preprocess_text(wiki_text)

# Preprocess PDF text
preprocessed_pdf_text = preprocess_text(pdf_text)

# Display preprocessed text for verification
print("Preprocessed Wikipedia Text:")
print(preprocessed_wiki_text[:10])  # Displaying first 10 tokens for brevity
print("\nPreprocessed PDF Text:")
print(preprocessed_pdf_text[:10])  # Displaying first 10 tokens for brevity


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mikke\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mikke\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mikke\AppData\Roaming\nltk_data...


Preprocessed Wikipedia Text:
['healthcare', 'denmark', '', 'wikipedia', 'jump', 'content', 'main', 'menu', 'main', 'menu']

Preprocessed PDF Text:
['', 'healthcare', 'denmark', 'overview', '', 'colophon', 'healthcare', 'denmark', '', 'overview']


In [12]:
%%markdown
# Vectorization
Bag-of-words (BoW)

# Vectorization
Bag-of-words (BoW)


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data
bow_matrix_wiki = vectorizer.fit_transform([' '.join(preprocessed_wiki_text)])
bow_matrix_pdf = vectorizer.transform([' '.join(preprocessed_pdf_text)])

# Display the shape of the Bag-of-Words matrix
print("Bag-of-Words Matrix Shape (Wikipedia Text):", bow_matrix_wiki.shape)
print("Bag-of-Words Matrix Shape (PDF Text):", bow_matrix_pdf.shape)


Bag-of-Words Matrix Shape (Wikipedia Text): (1, 1064)
Bag-of-Words Matrix Shape (PDF Text): (1, 1064)


In [14]:
%%markdown
TF-IDF (Term Frequency-Inverse Document Frequency)

TF-IDF (Term Frequency-Inverse Document Frequency)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data
tfidf_matrix_wiki = tfidf_vectorizer.fit_transform([' '.join(preprocessed_wiki_text)])
tfidf_matrix_pdf = tfidf_vectorizer.transform([' '.join(preprocessed_pdf_text)])

# Display the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape (Wikipedia Text):", tfidf_matrix_wiki.shape)
print("TF-IDF Matrix Shape (PDF Text):", tfidf_matrix_pdf.shape)

TF-IDF Matrix Shape (Wikipedia Text): (1, 1064)
TF-IDF Matrix Shape (PDF Text): (1, 1064)


In [16]:
%%markdown
# Interactive application

# Interactive application


In [19]:
!pip install langchain==0.1.6 
!pip install langchain-community==0.0.19 
!pip install langchain-core==0.1.23
!pip install langdetect
!pip install sentence-transformers




[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: C:\Users\mikke\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: C:\Users\mikke\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: C:\Users\mikke\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: C:\Users\mikke\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


Collecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Collecting transformers<5.0.0,>=4.34.0
  Using cached transformers-4.40.2-py3-none-any.whl (9.0 MB)
Collecting torch>=1.11.0
  Downloading torch-2.3.0-cp311-cp311-win_amd64.whl (159.8 MB)
     ------------------------------------- 159.8/159.8 MB 21.1 MB/s eta 0:00:00
Collecting huggingface-hub>=0.15.1
  Using cached huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
Collecting Pillow
  Downloading pillow-10.3.0-cp311-cp311-win_amd64.whl (2.5 MB)
     ---------------------------------------- 2.5/2.5 MB 53.6 MB/s eta 0:00:00
Collecting filelock
  Using cached filelock-3.14.0-py3-none-any.whl (12 kB)
Collecting fsspec>=2023.5.0
  Using cached fsspec-2024.3.1-py3-none-any.whl (171 kB)
Collecting sympy
  Using cached sympy-1.12-py3-none-any.whl (5.7 MB)
Collecting networkx
  Using cached networkx-3.3-py3-none-any.whl (1.7 MB)
Collecting mkl<=2021.4.0,>=2021.1.1
  Using cached mkl-2021.4.0-py2


[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: C:\Users\mikke\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [21]:
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langdetect import detect



# Define folder paths
Miniproject_dir = 'Miniproject'
resources_dir = os.path.join(Miniproject_dir, 'resources')

# Load data from Wikipedia
wiki_url = 'https://en.wikipedia.org/wiki/Healthcare_in_Denmark'
wiki_response = requests.get(wiki_url)
wiki_soup = BeautifulSoup(wiki_response.text, 'html.parser')
wiki_text = wiki_soup.get_text()

# Load PDF text data
pdf_url = 'https://sum.dk/Media/C/A/Healthcare-in%20denmark%20an%20overview%20english-V16-dec.pdf'
pdf_response = requests.get(pdf_url)
pdf_text = pdf_response.text

# Text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove punctuation and digits
    tokens = [token for token in tokens if token.isalnum()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

# Preprocess Wikipedia text
preprocessed_wiki_text = preprocess_text(wiki_text)

# Preprocess PDF text
preprocessed_pdf_text = preprocess_text(pdf_text)

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data
tfidf_matrix_wiki = tfidf_vectorizer.fit_transform([preprocessed_wiki_text])
tfidf_matrix_pdf = tfidf_vectorizer.transform([preprocessed_pdf_text])

# Initialize the embeddings model
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
embeddings_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Initialize the Ollama language model
llm = Ollama(model="mistral", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))

# Define a prompt template
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use five sentences maximum. Keep the answer as concise as possible.

{context}

Question: {question}

Helpful Answer:
"""

# Create a prompt from the template
prompt = PromptTemplate.from_template(template)

# Combine the TF-IDF matrices and embeddings model into a single vector
combined_vector = tfidf_matrix_wiki + tfidf_matrix_pdf + embeddings_model

# Define a function to handle user queries
def handle_query(description):
    # Detect language and handle non-English inputs
    if detect(description) != 'en':
        return "Sorry, I currently support only English descriptions."

    # Process the description with the retrieval chain
    response = llm({"query": description, "context": combined_vector})
    return response["result"]



TypeError: unsupported operand type(s) for +: 'csr_matrix' and 'HuggingFaceEmbeddings'