In [1]:
!pip install pinecone



In [2]:
!pip install cohere



In [3]:
!pip install sentence_transformers



In [4]:
!pip install --upgrade --force-reinstall torch torchvision torchaudio

Collecting torch
  Using cached torch-2.5.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.20.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached torchaudio-2.5.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.1-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-

In [5]:
!pip install numpy==1.25.2 --force-reinstall

Collecting numpy==1.25.2
  Using cached numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Using cached numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.2
    Uninstalling numpy-2.1.2:
      Successfully uninstalled numpy-2.1.2
Successfully installed numpy-1.25.2


In [2]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=3e1f3c1cb63cc6cee9cd697c370a04d2f225a108f0a5083574aecfbd1e658689
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


# PART 1

# Importing Libraries

In [None]:
import os

import cohere
import numpy as np
import pinecone
import wikipedia
from pinecone import Pinecone
from pinecone import ServerlessSpec
from sentence_transformers import SentenceTransformer

# Initializing & Pre-Processing

In [None]:
# Initialize the Pinecone client.
# - The API key is read from the PINECONE_API_KEY environment variable so the
#   secret never lives in the notebook. NOTE(review): the key that used to be
#   hardcoded here has been exposed and should be revoked/rotated.
# - The client is deliberately bound to the name `pinecone` (shadowing the
#   imported module) because the cells below call `pinecone.Index(...)` on it.
#   Constructing it from the imported `Pinecone` class, instead of
#   `pinecone.Pinecone`, keeps this cell safe to re-run.
# - The legacy `environment=` argument is dropped: the index location is set
#   by the ServerlessSpec below.
pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Load the SentenceTransformer model first so the index dimension can be taken
# from the model instead of being a hand-maintained constant.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create a Pinecone index (lowercase and hyphen for naming) if it is missing.
index_name = "wiki-articles"
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `model.encode`.
        dimension=model.get_sentence_embedding_dimension(),
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        metric="cosine",
    )

# Function to preprocess Wikipedia articles and store embeddings
def preprocess_and_store(articles):
    """Embed each article's summary and upsert it into the Pinecone index.

    Every vector is stored under the article title as its ID, with the title
    and the summary text attached as metadata, so a later query with
    ``include_metadata=True`` can get the document text back and not only
    the ID.

    Args:
        articles: Iterable of objects exposing ``.title`` and ``.summary``
            (the example cell passes ``wikipedia.page(...)`` results).

    Returns:
        None. Does nothing when ``articles`` is empty.
    """
    # Vectors sent per upsert request; keeps each request small when many
    # articles are passed in.
    upsert_batch_size = 100

    articles = list(articles)
    if not articles:
        return

    titles = [article.title for article in articles]
    summaries = [article.summary for article in articles]

    # Encode all summaries in one call instead of one call per article.
    embeddings = model.encode(summaries)

    vectors = [
        {
            "id": title,
            "values": embedding.tolist(),
            "metadata": {"title": title, "text": summary},
        }
        for title, summary, embedding in zip(titles, summaries, embeddings)
    ]

    # Open the index handle once rather than once per article.
    index = pinecone.Index(index_name)
    for start in range(0, len(vectors), upsert_batch_size):
        index.upsert(vectors[start:start + upsert_batch_size])

# Example dataset (replace with your desired articles)
article_titles = ["Machine Learning", "Artificial Intelligence", "Data Science"]

# Fetch one page per title; auto_suggest=False turns off the library's
# title auto-suggestion for the lookup.
articles = [wikipedia.page(title, auto_suggest=False) for title in article_titles]

# Embed the fetched pages and push them into the index
preprocess_and_store(articles)

# Answering the Query

In [31]:
def _match_field(record, name):
    """Return ``record[name]``, or ``None`` when that field is absent."""
    try:
        return record[name]
    except (KeyError, AttributeError):
        return None


# Function to retrieve and generate answer
def get_answer(query, top_k=5, max_tokens=50):
    """Answer ``query`` from the documents retrieved out of the Pinecone index.

    The query is embedded with ``model``, the ``top_k`` nearest vectors are
    fetched from the index, and their text is placed in a prompt that is sent
    to Cohere's ``generate`` endpoint.

    Args:
        query: The question to answer.
        top_k: Number of matches to retrieve from the index.
        max_tokens: Upper bound on generated tokens passed to Cohere; a small
            value can cut the answer off mid-sentence.

    Returns:
        The generated text, or the string
        "Failed to generate answer using Cohere API." when the Cohere call
        raises ``cohere.CohereError``.

    Raises:
        KeyError: If the CO_API_KEY environment variable is not set.
    """
    query_embedding_list = model.encode(query).tolist()

    # Use the index object for querying
    index = pinecone.Index(index_name)
    results = index.query(vector=query_embedding_list, top_k=top_k, include_metadata=True)

    # Prefer the document text stored in metadata; fall back to the vector ID
    # (the article title) for vectors that were upserted without metadata.
    retrieved_docs = []
    for match in results["matches"]:
        metadata = _match_field(match, "metadata") or {}
        retrieved_docs.append(_match_field(metadata, "text") or match["id"])

    prompt = "Based on these documents, answer the question:\n" + query + "\nDocuments:\n" + "\n".join(retrieved_docs)

    # The API key comes from the environment so it is never stored in the
    # notebook. NOTE(review): the key previously hardcoded here has been
    # exposed and should be revoked/rotated.
    co = cohere.Client(os.environ["CO_API_KEY"])

    try:
        # Use the client instance to generate text
        response = co.generate(
            model='command',
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=0.8,
            k=0,
            p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop_sequences=[],
            return_likelihoods='NONE')
        return response.generations[0].text
    # NOTE(review): confirm `cohere.CohereError` is exported by the installed
    # SDK version; if it is not, this clause itself fails when an error occurs.
    except cohere.CohereError as e:
        print(f"Error generating answer with Cohere: {e}")
        return "Failed to generate answer using Cohere API."


# Example Queries

In [29]:
# Example query: run the retrieve-then-generate pipeline on one question and
# print the reply. get_answer asks Cohere for at most 50 tokens, so the
# printed answer can stop mid-sentence.
query = "What is the difference between machine learning and artificial intelligence?"
answer = get_answer(query)
print("Here is the answer:", answer)

Here is the answer:  While artificial intelligence (AI) and machine learning (ML) may be considered synonymous, they are not always mutually inclusive. 

AI is a broader concept that deals with creating intelligent machines that can mimic human behavior and solve problems, while machine learning is


In [30]:
# Second example query, kept in its own variables (query2/answer2) so the
# first example's query/answer are not overwritten.
query2 = "What is the role of data science in AI?"
answer2 = get_answer(query2)
print("Here is the answer:", answer2)

Here is the answer:  Data science is a multidisciplinary approach that leverages techniques and theories from the fields of mathematics, statistics, computer science, and information science to extract meaningful insights and knowledge from diverse data sets. It focuses on understanding, analyzing, and interpreting data to drive informed decision
