We will use the BBC News Classification Dataset

In [1]:
import pandas as pd
# Raw CSV of the BBC news dataset hosted on GitHub
url = "https://raw.githubusercontent.com/susanli2016/PyCon-Canada-2019-NLP-Tutorial/master/bbc-text.csv"

# Read the CSV and keep a reproducible random subset of 1000 rows for faster processing
df = pd.read_csv(url).sample(n=1000, random_state=42)

In [2]:
# Display every row that has at least one NaN value (an empty result means there are none)
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,category,text


There are no NaN values in the DataFrame.

In [3]:
# One dictionary per row, keyed by column name
data = df.to_dict(orient="records")

In [4]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Sentence-embedding model used to turn text into vectors
encoder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [6]:
# Vector database client backed by an in-memory Qdrant instance
qdrant = QdrantClient(location=":memory:")

In [9]:
# Create the collection that stores the BBC news article embeddings.
# `recreate_collection` is deprecated, so drop any existing collection explicitly
# (this keeps the cell safe to re-run) and then create it from scratch.
if qdrant.collection_exists("bbc_news"):
    qdrant.delete_collection("bbc_news")

qdrant.create_collection(
    collection_name="bbc_news",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by the model in use
        distance=models.Distance.COSINE,
    ),
)

  qdrant.recreate_collection(


True

In [10]:
# Vectorize every article and upload it together with its original record as payload.
# All texts are encoded in one batched call, which is far cheaper than invoking
# the encoder once per document.
embeddings = encoder.encode([doc["text"] for doc in data])

qdrant.upload_points(
    collection_name="bbc_news",
    points=[
        models.PointStruct(
            id=idx,
            vector=embedding.tolist(),
            payload=doc,
        )
        # data is the variable holding all the bbc_news records
        for idx, (doc, embedding) in enumerate(zip(data, embeddings))
    ],
)

In [11]:
# Natural-language question used to query the vector store
user_prompt = (
    "How did the stock market react to major banking news?"
)

In [12]:
# Search time: embed the question and retrieve the three closest articles.
# `QdrantClient.search` is deprecated in favour of `query_points`, which wraps
# the scored points in a response object; `.points` unwraps them into a list.
hits = qdrant.query_points(
    collection_name="bbc_news",
    query=encoder.encode(user_prompt).tolist(),
    limit=3,
).points

for hit in hits:
    print(hit.payload, "score:", hit.score)

{'category': 'business', 'text': 'stock market eyes japan recovery japanese shares have ended the year at their highest level since 13 july amidst hopes of an economic recovery during 2005.  the nikkei index of leading shares gained 7.6% during the year to close at 11 488.76 points. in 2005 it  will rise toward 13 000   predicted morgan stanley equity strategist naoki kamiyama. the optimism in the financial markets contrast sharply with pessimism in the japanese business community. earlier this month  the quarterly tankan survey of japanese manufacturers found that business confidence had weakened for the first time since march 2003.  slower economic growth  rising oil prices  a stronger yen and weaker exports were blamed for the fall in confidence. despite this  traders expect strength in the global economy to benefit japan  which has been close to sliding into recession in recent months. structural reform within japan and an anticipated end to the banking sector s bad debt problems s

  hits = qdrant.search(


In [14]:
# Keep only the payload (the stored article record) of each hit
search_results = [
    scored_point.payload
    for scored_point in hits
]
search_results

[{'category': 'business',
  'text': 'stock market eyes japan recovery japanese shares have ended the year at their highest level since 13 july amidst hopes of an economic recovery during 2005.  the nikkei index of leading shares gained 7.6% during the year to close at 11 488.76 points. in 2005 it  will rise toward 13 000   predicted morgan stanley equity strategist naoki kamiyama. the optimism in the financial markets contrast sharply with pessimism in the japanese business community. earlier this month  the quarterly tankan survey of japanese manufacturers found that business confidence had weakened for the first time since march 2003.  slower economic growth  rising oil prices  a stronger yen and weaker exports were blamed for the fall in confidence. despite this  traders expect strength in the global economy to benefit japan  which has been close to sliding into recession in recent months. structural reform within japan and an anticipated end to the banking sector s bad debt problem

In [15]:
# Connect to the local large language model through its OpenAI-compatible endpoint
from openai import OpenAI

client = OpenAI(
    base_url="http://127.0.0.1:8080/v1",  # "http://<Your api-server IP>:port"
    api_key="sk-no-key-required",  # placeholder value, as the string itself indicates
)

# Put the retrieved articles inside the user turn as context. Sending them as an
# "assistant" message would present them to the model as its own previous reply
# and leave the conversation ending on an assistant turn.
rag_prompt = (
    f"{user_prompt}\n\n"
    "Use the following news articles retrieved from the database as context:\n"
    f"{search_results}"
)

completion = client.chat.completions.create(
    model="LLaMA_CPP",
    messages=[
        {"role": "system", "content": "You are chatbot, a news specialist. Your top priority is to help guide users to the most relevant news articles based on their queries."},
        {"role": "user", "content": rag_prompt},
    ],
)

# Print only the generated text rather than the repr of the whole message object
print(completion.choices[0].message.content)

