In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
import joblib  # Correct import from sklearn
import torch
from torchvision import transforms, models
from torch.utils.data import DataLoader, TensorDataset

# 1. Generate and save the sample dataset
np.random.seed(42)
num_rows = 20000

# Generate sample data
data = {
    'query': [f'search_query_{i}' for i in range(num_rows)],
    'user_preference': np.random.choice(['relevant', 'irrelevant'], num_rows),
    'browsing_history': [f'browse_history_{i}' for i in range(num_rows)],
    'query_type': np.random.choice(['voice', 'text', 'image'], num_rows),
    'response_time': np.random.uniform(0.1, 5.0, num_rows),
    'relevance_score': np.random.uniform(0, 1, num_rows)
}

# Create DataFrame
df = pd.DataFrame(data)

# Save as CSV file
df.to_csv('sample_search_data.csv', index=False)
print("Sample dataset saved as 'sample_search_data.csv'.")

# 2. Load and preprocess the dataset
df = pd.read_csv('sample_search_data.csv')

# Text normalization
df['query'] = df['query'].str.lower().str.replace(r'\d+', '')

# Split dataset
X = df[['query', 'browsing_history']]
y = df['relevance_score']

# Binning the continuous relevance scores into discrete labels
threshold = 0.5  # Example threshold for binary classification
y_binned = (y >= threshold).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y_binned, test_size=0.2, random_state=42)

# Text Vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['query'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['query'])

# 3. Train a Random Forest model for personalized search
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Save the model
joblib.dump(rf_model, 'best_personalized_search_model.sav')
print("Personalized Search Model saved.")

# Load the model
loaded_rf_model = joblib.load('best_personalized_search_model.sav')

# Predict and evaluate
predictions = loaded_rf_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# 4. Image Search Model (Example with Vision Transformer)
images = np.random.rand(20000, 224, 224, 3)  # Random images for demonstration

# Image Transformations
image_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Transform images
image_data = torch.stack([image_transform(image) for image in images])

# Create dataset
dataset = TensorDataset(image_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Pre-trained ViT model
model = models.vit_b_16(pretrained=True)
model.eval()

# Extract features from images
with torch.no_grad():
    image_features = []
    for inputs in dataloader:
        inputs = inputs[0]
        outputs = model(inputs)
        image_features.append(outputs)

# Saving image features
np.save('image_features.npy', torch.cat(image_features))
print("Image features saved as 'image_features.npy'.")


Sample dataset saved as 'sample_search_data.csv'.
Personalized Search Model saved.
Accuracy: 0.511


MemoryError: Unable to allocate 22.4 GiB for an array with shape (20000, 224, 224, 3) and data type float64