In [2]:
import pandas as pd
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Load the CSV file
file_path = 'articles.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File not found: {file_path}")
    raise

# Ensure the 'articles' column exists
text_column = 'articles'  # Update this based on the actual column name
if text_column not in df.columns:
    raise KeyError(f"Column '{text_column}' not found in the DataFrame. Available columns: {df.columns}")

# Initialize the HuggingFaceEmbeddings model
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Generate embeddings
def generate_embeddings(texts):
    return [embeddings.embed_query(text) for text in texts]

df['embeddings'] = generate_embeddings(df[text_column])

# Save the embeddings to a CSV file
output_file = 'embeddings.csv'
df[[text_column, 'embeddings']].to_csv(output_file, index=False)
print(f"Embeddings saved to {output_file}")


Embeddings saved to embeddings.csv


In [3]:
# article recommendations
import pandas as pd
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from scipy.spatial.distance import cosine

# Load the CSV file
file_path = 'articles.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File not found: {file_path}")
    raise

# Ensure the 'articles' column exists
text_column = 'articles'
if text_column not in df.columns:
    raise KeyError(f"Column '{text_column}' not found in the DataFrame. Available columns: {df.columns}")

# Initialize the HuggingFaceEmbeddings model
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Generate embeddings
def generate_embeddings(texts):
    return [embeddings.embed_query(text) for text in texts]

df['embeddings'] = generate_embeddings(df[text_column])

# Save the embeddings to a CSV file
output_file = 'embeddings.csv'
df[[text_column, 'embeddings']].to_csv(output_file, index=False)
print(f"Embeddings saved to {output_file}")

# Function to get article recommendations
def get_article_recommendations(article_index, df, n_recommendations=3):
    target_embedding = df['embeddings'].iloc[article_index]
    
    similarities = []
    for idx, embedding in enumerate(df['embeddings']):
        if idx != article_index:
            similarity = 1 - cosine(target_embedding, embedding)
            similarities.append((idx, similarity))
    
    # Sort by similarity and get top recommendations
    recommendations = sorted(similarities, key=lambda x: x[1], reverse=True)[:n_recommendations]
    
    return [(df[text_column].iloc[idx], sim) for idx, sim in recommendations]

# Example usage:
article_idx = 0  # Index of the article you want recommendations for
recommendations = get_article_recommendations(article_idx, df)
print("Based on:", df[text_column].iloc[article_idx][:100], "...\n")
print("Recommendations:")
for article, similarity in recommendations:
    print(f"Similarity: {similarity:.2f}")
    print(f"Article: {article[:100]}...")
    print()

Embeddings saved to embeddings.csv
Based on: Create Next App
Drinks Food
 Life
Subscribe
About
On the Love of Raki and Turkish Food 
 
A little b ...

Recommendations:
Similarity: 0.45
Article: Create Next App
Drinks Food
 Life
Subscribe
About
On the Love of Mole at Meson Antugua Santa Catarin...

Similarity: 0.43
Article: Create Next App
Drinks Food
 Life
Subscribe
About
Looking to add some pop to your website? Let us he...

Similarity: 0.42
Article: Create Next App
Drinks Food
 Life
Subscribe
About
Drinking Amaro Montenegro at ZXY Gallery
ZXY Galle...



In [4]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np
import torch
import ast  # For safely evaluating strings containing lists

# First, convert embeddings back to the right format
# The embeddings in df are already numpy arrays, so we don't need to convert them
embeddings_list = df['embeddings'].tolist()
texts = df[text_column].tolist()

# Create FAISS vector store with existing embeddings
vector_store = FAISS.from_embeddings(
    list(zip(texts, embeddings_list)),
    embeddings
)

# Initialize SmolLM model
model_id = "HuggingFaceTB/SmolLM-360M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Create text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Initialize LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)

def rag_response(question: str, num_docs=3):
    # Retrieve relevant documents
    docs = vector_store.similarity_search(question, k=num_docs)
    
    # Create context from retrieved documents
    context = "\n\n".join([doc.page_content for doc in docs])
    
    # Create prompt following SmolLM-Instruct format
    prompt = f"""### Instruction: Using the following context, provide a clear and accurate answer to the question. If the information isn't available in the context, say so.

Context:
{context}

Question: {question}

### Response: Based on the provided context, """

    # Generate response
    response = llm.invoke(prompt)
    
    return {
        "answer": response,
        "sources": [doc.page_content[:200] + "..." for doc in docs]
    }

# Example usage
def ask_question(question: str):
    result = rag_response(question)
    print("Answer:", result["answer"])
    print("\nSources:")
    for i, source in enumerate(result["sources"], 1):
        print(f"\nSource {i}:", source)

# Let's print the first embedding to verify the format
print("First embedding shape:", len(embeddings_list[0]))

First embedding shape: 384


  llm = HuggingFacePipeline(pipeline=pipe)


In [5]:
import pickle
import pandas as pd

# Save the FAISS vector store
with open("vector_store.pkl", "wb") as f:
    pickle.dump(vector_store, f)

# Save embeddings and texts as a CSV for compatibility
df_saved = pd.DataFrame({
    "text": texts,
    "embeddings": embeddings_list
})
df_saved.to_csv("embeddings_and_texts.csv", index=False)

print("Vector store and embeddings saved!")


Vector store and embeddings saved!


In [9]:
# Ask a specific question about your documents
question = "What is one of the main topics discussed in these articles?"
ask_question(question)

# Ask follow-up questions
question2 = "Can you summarize the key findings?"
ask_question(question2)

# Ask for specific information
question3 = "Are there any recommendations or conclusions mentioned?"
ask_question(question3)

Answer: ### Instruction: Using the following context, provide a clear and accurate answer to the question. If the information isn't available in the context, say so.

Context:
Create Next App
Drinks Food
 Life
Subscribe
About
An Awkward Question about Natural Wine 
An awkward question is posed, “hey...there...I'm looking for a ...natural wine...something funky...that isn’t going to cost over twenty five dollars?”
If you are like me, then your normal speaking voice is ridden with pauses so as to humbly point out a lack of certainty. Luckily, the people at
 
Irving Bottle
 
are prepared for this. When it comes to the new movement of natural wines, one could say the whole movement should be speaking in such a tone. A movement that is clearly going to gain steam in 2019, natural wines do not come with a specific certification one has to pay for necessarily, like a wine that is certified organic or biodynamic for example and thus, the definition of natural is open to interpretation.
While t

KeyboardInterrupt: 

In [6]:
# Ask a specific question about your documents
question = "Where can I get Natural Wine?"
ask_question(question)

# Ask follow-up questions
question2 = "How can I invest in Whiskey?"
ask_question(question2)



Answer: ### Instruction: Using the following context, provide a clear and accurate answer to the question. If the information isn't available in the context, say so.

Context:
Create Next App
Drinks Food
 Life
Subscribe
About
An Awkward Question about Natural Wine 
An awkward question is posed, “hey...there...I'm looking for a ...natural wine...something funky...that isn’t going to cost over twenty five dollars?”
If you are like me, then your normal speaking voice is ridden with pauses so as to humbly point out a lack of certainty. Luckily, the people at
 
Irving Bottle
 
are prepared for this. When it comes to the new movement of natural wines, one could say the whole movement should be speaking in such a tone. A movement that is clearly going to gain steam in 2019, natural wines do not come with a specific certification one has to pay for necessarily, like a wine that is certified organic or biodynamic for example and thus, the definition of natural is open to interpretation.
While t

In [None]:
#blah 1
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Split articles into chunks
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50)
texts = text_splitter.split_text(df['articles'].tolist())

# Generate embeddings
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
text_embeddings = [embeddings.embed_query(text) for text in texts]

# Store in FAISS
vector_store = FAISS.from_texts(texts, embeddings)
vector_store.save_local("faiss_index")


TypeError: expected string or bytes-like object, got 'list'