In [3]:
import json
import os

import numpy as np
import pandas as pd
import pymongo
from bson import json_util
from sentence_transformers import SentenceTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Load preprocessed data
df = pd.read_csv('preprocessed_data.csv')

# Initialize the vectorization model.
# NOTE(review): 'bert-base-uncased' is not a packaged sentence-transformers
# model; the library reports this when loading it and falls back to MEAN
# pooling over the BERT outputs. Confirm that this is the intended encoder.
model = SentenceTransformer('bert-base-uncased')

# Automatically identify column types
text_columns = df.select_dtypes(include=['object']).columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
# Keep only numeric columns that contain at least one non-NaN value
numeric_columns = numeric_columns[df[numeric_columns].notna().any()]

# Ensure 'ID' is not included in text_columns for vectorization
text_columns = [col for col in text_columns if col != 'ID']

# Vectorize text columns.
# Each column is encoded in a single batched call instead of one call per
# row, and the result is requested as NumPy directly: calling .numpy() on a
# tensor fails when the model runs on a GPU.
for column in text_columns:
    # Missing cells are floats (NaN), which the encoder cannot tokenize, so
    # they are embedded as empty strings. df[column] itself is left untouched.
    texts = df[column].fillna("").astype(str).tolist()
    embeddings = model.encode(texts, convert_to_numpy=True)
    # One 1-D embedding array per row
    df[f"{column}_vectors"] = list(embeddings)

# Impute and normalize the numeric feature columns.
# 'ID' is an identifier, not a feature: it is excluded here so that a numeric
# ID is not median-imputed and rescaled into [0, 1], which would break the
# ID-based keying of the documents written to MongoDB further down.
feature_columns = [col for col in numeric_columns if col != 'ID']

imputer = SimpleImputer(strategy="median")
scaler = MinMaxScaler()

# The scikit-learn transformers reject input with no columns or no rows, so
# the whole step is skipped when there is nothing to transform.
if feature_columns and not df.empty:
    # Impute NaN values with the per-column median. The result carries df's
    # own index so the assignment below aligns row-for-row.
    imputed_data = pd.DataFrame(
        imputer.fit_transform(df[feature_columns]),
        columns=feature_columns,
        index=df.index,
    )
    df[feature_columns] = imputed_data

    # Normalize numeric data (MinMaxScaler with its default settings)
    df[feature_columns] = scaler.fit_transform(df[feature_columns])

# One-hot encode the text columns and densify the encoder output
encoder = OneHotEncoder()
encoded_categorical = encoder.fit_transform(df[text_columns]).toarray()

# Build "<column>_<category>" labels in the same order as the encoded columns:
# columns in text_columns order, categories in the encoder's per-column order
new_feature_names = [
    f"{source_column}_{level}"
    for source_column, levels in zip(text_columns, encoder.categories_)
    for level in levels
]

# Wrap the encoded matrix in a DataFrame aligned to df's rows
encoded_df = pd.DataFrame(
    encoded_categorical,
    columns=new_feature_names,
    index=df.index,
)

# Swap the raw text columns for their one-hot encoded counterparts
df = pd.concat([df.drop(columns=text_columns), encoded_df], axis=1)

# Serialize both document sets before opening the connection, so a failure
# here cannot leave a half-written database or an open client behind.
vector_columns = [col for col in df.columns if col.endswith('_vectors')]

# Original data: every column except the embedding vectors
json_original_data = json.loads(
    df.drop(columns=vector_columns).to_json(orient='records')
)

# Vectorized data: only the embedding vectors plus the 'ID' field
non_vector_columns = [
    col for col in df.columns if not col.endswith('_vectors') and col != 'ID'
]
json_vectorized_data = json.loads(
    df.drop(columns=non_vector_columns).to_json(orient='records')
)

# Tag each vectorized document with its own ID as 'unique_key'.
# The value is taken from the document itself: it has been through the JSON
# round trip and is a plain Python value, whereas a NumPy scalar read back
# from df (e.g. an integer ID) cannot be BSON-encoded. It also avoids
# rescanning the whole DataFrame for every document.
# Documents without an ID value get no 'unique_key'; a missing 'ID' column
# raises KeyError here, before anything has been written.
for doc in json_vectorized_data:
    if doc['ID'] is not None:
        doc['unique_key'] = doc['ID']

# Connect to MongoDB. The connection string is read from the MONGODB_URI
# environment variable so it does not have to be stored in the notebook; the
# original literal is kept as the fallback. The context manager closes the
# client even if an insert fails.
with pymongo.MongoClient(os.environ.get("MONGODB_URI", "mongodb_uri")) as client:
    db = client.rag

    # Create two collections
    collection_original = db.original_data
    collection_vectorized = db.vectorized_data

    # insert_many rejects an empty document list, so skip empty batches
    if json_original_data:
        collection_original.insert_many(json_original_data)
    if json_vectorized_data:
        collection_vectorized.insert_many(json_vectorized_data)


No sentence-transformers model found with name bert-base-uncased. Creating a new one with MEAN pooling.
