In [1]:
import os
import re

import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Function to extract different data types from the prompt
def extract_data_types(prompt):
    """Split a free-text prompt into its numeric and textual parts.

    Args:
        prompt: The raw query string.

    Returns:
        A ``(numbers, text_only, mixed_value)`` tuple:
        ``numbers`` is a list of floats, one per standalone number in the
        prompt (integers or decimals such as ``3.5``); ``text_only`` is the
        prompt with those numbers removed (surrounding whitespace is left
        untouched); ``mixed_value`` is the unmodified prompt.
    """
    # The optional fractional part keeps "3.5" as one number instead of
    # splitting it into 3 and 5 and leaving a stray "." in the text.
    number_pattern = r'\b\d+(?:\.\d+)?\b'
    numbers = [float(num) for num in re.findall(number_pattern, prompt)]
    text_only = re.sub(number_pattern, '', prompt)
    mixed_value = prompt
    return numbers, text_only, mixed_value

# Function to vectorize each part of the prompt
def vectorize_prompt(prompt, vectorizers):
    """Turn a prompt into a single dense row vector (numeric part, then text part).

    Args:
        prompt: Raw query string.
        vectorizers: Dict with two entries:
            ``'numeric'`` - either a dict of fitted single-column scalers
            (one per numeric column) or a single fitted transformer that
            accepts one row of numbers;
            ``'text'`` - a fitted text vectorizer.

    Returns:
        A 2-D array with one row: the numeric segment followed by the text
        segment. When ``'numeric'`` is a dict the numeric segment always has
        one entry per scaler, so the width does not depend on how many
        numbers the prompt happens to contain.
    """
    numbers, text_only, _ = extract_data_types(prompt)

    num_vectorizer = vectorizers['numeric']
    text_vectorizer = vectorizers['text']

    def _dense(matrix):
        # Text vectorizers return scipy sparse matrices, scalers return
        # plain ndarrays; normalise both to a dense ndarray.
        return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

    def _scale_one(scaler, value):
        # A scaler fitted on a DataFrame remembers its column name; hand the
        # same name back so sklearn does not warn about missing feature names.
        names = getattr(scaler, 'feature_names_in_', None)
        sample = [[value]] if names is None else pd.DataFrame([[value]], columns=names)
        return _dense(scaler.transform(sample))[0, 0]

    if isinstance(num_vectorizer, dict):
        # NOTE(review): prompt numbers are matched to scalers by position
        # (first number -> first scaler, ...) - confirm this is the intended
        # mapping. Slots without a number stay 0; numbers beyond the last
        # scaler are ignored.
        num_vectors = np.zeros((1, len(num_vectorizer)))
        for slot, (scaler, value) in enumerate(zip(num_vectorizer.values(), numbers)):
            num_vectors[0, slot] = _scale_one(scaler, value)
    elif numbers:
        num_vectors = _dense(num_vectorizer.transform([numbers]))
    else:
        # Zero-width block so hstack still works when the prompt has no numbers.
        num_vectors = np.zeros((1, 0))

    text_vectors = _dense(text_vectorizer.transform([text_only]))

    combined_vector = np.hstack((num_vectors, text_vectors))

    return combined_vector

# Function to find similar vectors
def find_similar_vectors(prompt_vector, collection_name, all_vectors, top_k=10):
    """Return the rows of ``all_vectors`` most similar to ``prompt_vector``.

    Args:
        prompt_vector: Array that can be reshaped into a single row with as
            many features as ``all_vectors`` has columns.
        collection_name: Not used by this function; kept in the signature so
            existing callers keep working.
        all_vectors: 2-D array with one candidate vector per row.
        top_k: Maximum number of rows to return. Defaults to 10.

    Returns:
        A list of up to ``top_k`` rows of ``all_vectors``, ordered from most
        to least cosine-similar to the prompt.
    """
    similarities = cosine_similarity(prompt_vector.reshape(1, -1), all_vectors)

    # argsort is ascending, so reverse it to rank the best matches first.
    ranked_indices = similarities.argsort()[0][::-1]
    # Clamp at 0: a negative slice bound would silently drop rows from the end.
    most_similar_indices = ranked_indices[:max(top_k, 0)]
    similar_data = [all_vectors[i] for i in most_similar_indices]

    return similar_data

# Load the data
data = pd.read_csv('cleaned_data_text_features.csv')

# Vectorize numerical columns: one scaler per column so each fitted scaler
# can be reused later on values taken from a prompt.
numeric_columns = data.select_dtypes(include=[np.number]).columns
numeric_vectorizers = {col: MinMaxScaler() for col in numeric_columns}
numeric_vectors = {col: vectorizer.fit_transform(data[[col]]) for col, vectorizer in numeric_vectorizers.items()}

# Vectorize text column
text_column = 'text_column'  # Replace with your text column name
text_vectorizer = TfidfVectorizer()
# TfidfVectorizer rejects NaN documents, so treat missing text as empty.
text_vectors = text_vectorizer.fit_transform(data[text_column].fillna('').astype(str))

# Combine all vectors (numeric columns first, then the TF-IDF features)
all_vectors = np.hstack(list(numeric_vectors.values()) + [text_vectors.toarray()])

# MongoDB connection
# The connection string embeds credentials, so it is read from the
# MONGODB_URI environment variable instead of being committed in the notebook.
# Connecting only after the local work is done means a connection failure
# no longer prevents the vectors above from being built.
client = MongoClient(os.environ['MONGODB_URI'])
db = client['rag']

# Store all vectors in MongoDB
collection_name = 'all_vectors'  # Replace with your collection name
collection = db[collection_name]
collection.delete_many({})  # Clear previous vectors
vector_records = [{'vector': vector.tolist()} for vector in all_vectors]
if vector_records:  # insert_many rejects an empty list
    collection.insert_many(vector_records)

# Vectorizers dictionary
vectorizers = {'numeric': numeric_vectorizers, 'text': text_vectorizer}

# Example usage
prompt = "2 singham"
prompt_vector = vectorize_prompt(prompt, vectorizers)
similar_data = find_similar_vectors(prompt_vector, collection_name, all_vectors)
print(prompt_vector)
print(similar_data)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ConfigurationError: The DNS query name does not exist: _mongodb._tcp.rag1.gck2wq8.mongodb.net.