<a href="https://colab.research.google.com/github/advik-7/Deep_Learning_projects/blob/main/kannada_vectorbase_querying.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [5]:
import faiss
import numpy as np
import time
from sentence_transformers import SentenceTransformer

def read_text_file(file_path):
    """Read a UTF-8 encoded text file and return its lines.

    Args:
        file_path: Path of the text file to read (e.g. a Kannada corpus).

    Returns:
        A list with one string per line of the file. Line endings are kept,
        and blank lines are included.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        lines = list(handle)
    return lines

def vectorize_text(text_data, model):
    """Embed text with the given model and return the result as NumPy data.

    Args:
        text_data: The sentences to embed; passed to ``model.encode`` as-is.
        model: Object exposing ``encode(texts, convert_to_numpy=...)``, such
            as the ``SentenceTransformer`` loaded by this script.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_numpy=True``.
    """
    embeddings = model.encode(text_data, convert_to_numpy=True)
    return embeddings

def create_faiss_index(vectors):
    """Build a flat Faiss index with the L2 metric and add ``vectors`` to it.

    Args:
        vectors: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` of dimensionality ``dim`` containing every
        row of ``vectors``.
    """
    # Faiss works on float32 data in C-contiguous layout; coerce here so the
    # index side is as tolerant of its input as query_faiss_index is.
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])  # One index slot per feature
    index.add(vectors)
    return index

def adjust_query_vector(query_vector, required_dim):
    """Pad or truncate query vectors so their width equals ``required_dim``.

    Args:
        query_vector: A single vector of shape ``(dim,)`` or a batch of shape
            ``(n, dim)``. Any array-like accepted by ``np.asarray`` works.
        required_dim: Number of features each returned vector must have.

    Returns:
        A 2-D array of shape ``(n, required_dim)``. Vectors that are too
        short are right-padded with float32 zeros; vectors that are too long
        keep only their first ``required_dim`` features. A 2-D array that
        already has the right width is returned unchanged.
    """
    query_vector = np.asarray(query_vector)
    if query_vector.ndim == 1:
        # Treat a lone vector as a batch of one, the same way
        # query_faiss_index does, instead of failing on shape[1].
        query_vector = query_vector.reshape(1, -1)

    current_dim = query_vector.shape[1]
    if current_dim == required_dim:
        return query_vector  # Already the right width
    if current_dim > required_dim:
        # Too wide: keep only the leading features.
        return query_vector[:, :required_dim]

    # Too narrow: append zero-valued features on the right.
    padding = np.zeros(
        (query_vector.shape[0], required_dim - current_dim), dtype=np.float32
    )
    return np.hstack((query_vector, padding))

def query_faiss_index(index, query_vector, k):
    """Search ``index`` for the ``k`` nearest neighbours of each query vector.

    Args:
        index: Object exposing ``search(queries, k)``; in this script, the
            Faiss index built by ``create_faiss_index``.
        query_vector: A single vector of shape ``(dim,)`` or a batch of shape
            ``(n, dim)``; any array-like convertible to float32.
        k: Number of neighbours to request per query.

    Returns:
        The ``(distances, indices)`` pair returned by ``index.search``.
    """
    # Faiss expects float32 data in C-contiguous layout. Unlike np.array,
    # ascontiguousarray also fixes Fortran-ordered input and skips the copy
    # when the input already qualifies.
    query_vector = np.ascontiguousarray(query_vector, dtype=np.float32)
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)  # Batch of one query

    distances, indices = index.search(query_vector, k)
    return distances, indices

if __name__ == "__main__":
    # Load a pre-trained multilingual model that supports Kannada text
    # (another multilingual model, e.g. IndicBERT, could be substituted).
    model = SentenceTransformer('bert-base-multilingual-cased')

    # Step 1: Read the Kannada text file (replace this with the actual path to your Kannada text file)
    file_path = "/content/Kannada_text_practise_RAG.txt"
    # Drop blank lines so they are not embedded and later returned as
    # meaningless "nearest neighbors".
    text_data = [line for line in read_text_file(file_path) if line.strip()]
    if not text_data:
        # Fail early with a clear message instead of crashing on vectors.shape[1].
        raise ValueError(f"No non-empty lines found in {file_path}")

    # Step 2: Vectorize the Kannada text data using the pre-trained model
    vectors = vectorize_text(text_data, model)

    # Step 3: Create a Faiss index
    faiss_index = create_faiss_index(vectors)

    # Allow user to input a query text in Kannada (or choose a specific line from the text file)
    query_text = input("Enter a query text in Kannada: ")  # User input for the query
    query_vector = vectorize_text([query_text], model)  # Vectorize the query

    # Adjust the query vector to ensure it has the same dimensionality as the Faiss index
    required_dim = vectors.shape[1]  # The dimensionality of the vectors in the index
    query_vector_adjusted = adjust_query_vector(query_vector, required_dim)

    # Step 4: Query the Faiss index
    # Never request more neighbors than there are indexed lines: Faiss pads
    # missing results with index -1, which would silently select the last
    # line of text_data when printing below.
    k = min(5, len(text_data))

    # perf_counter is a monotonic, high-resolution clock, which suits the
    # sub-millisecond interval being measured here better than time.time().
    start_time = time.perf_counter()
    distances, indices = query_faiss_index(faiss_index, query_vector_adjusted, k)
    end_time = time.perf_counter()

    # Output the results
    print("Indices of nearest neighbors:", indices)
    print("Distances of nearest neighbors:", distances)

    # Print the time taken for retrieval
    print(f"Time taken for retrieval: {end_time - start_time:.4f} seconds")

    # Display the retrieved text for better understanding
    print("\nRetrieved nearest neighbors:")
    for idx in indices[0]:
        print(f"- {text_data[idx].strip()}")




config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Enter a query text in Kannada: ಬೆಕ್ಕು
Indices of nearest neighbors: [[19 15 18  2 16]]
Distances of nearest neighbors: [[75.071655 76.241295 86.43228  92.8045   94.83303 ]]
Time taken for retrieval: 0.0001 seconds

Retrieved nearest neighbors:
- ಬೆಕ್ಕಿನ ಕೂದಲು ಪ್ರಥಮ ಇನ್ಸ್ಟಿಟ್ಯೂಟ್ ಎಕ್ಲ್ಜರ್ ನೀಡು,
- ಬೆಕ್ಕಿನ ಹಾರವು ತೃಪ್ತಿಯ ಮತ್ತು ವಿಶ್ರಾಂತಿಯ ಸೂಚಕವಾಗಿದೆ.
- ಬೆಕ್ಕುಗಳು ತಮ್ಮನ್ನು ತಾವು ಸ್ವಚ್ಛಗೊಳಿಸಲು ಬಹಳ ಸಮಯ ವ್ಯತ್ಯಯ ಮಾಡುತ್ತವೆ.
- ಒಂದು ಆನೆ ಅದರ ದೀರ್ಘ ಕೈಗಳನ್ನು ಬಳಸಿ ಮರದಿಂದ ಮರಕ್ಕೆ ಸುಲಭವಾಗಿ ಹಾರಬಹುದು.
- ಬೆಕ್ಕುಗಳು ಪರಿಣಿತ ಹಾರಿ ಹುಣಸೆ, ಸಾಮಾನ್ಯವಾಗಿ ಹಕ್ಕಿಗಳು ಮತ್ತು ಎಲೆಗಳು ಹಾರುವ ಪ್ರಾಣಿಗಳನ್ನು ಹಿಡಿದುಕೊಳ್ಳುತ್ತವೆ.
