In [55]:
# Install necessary libraries
!pip install pandas numpy scikit-learn sentence-transformers faiss-cpu joblib jupyterlab



In [56]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy.sparse import hstack
import faiss
import joblib
import ast
import re
from math import sqrt

In [57]:
df_games = pd.read_csv('df_games.csv')

In [58]:
# Load the embeddings matrix from the .npy file
embeddings_matrix = np.load("embeddings_matrix.npy")
print("Embeddings matrix loaded successfully.")

Embeddings matrix loaded successfully.


In [59]:
#creating the search function
def search(dataframe, column_name, search_string):
    return dataframe[dataframe[column_name].str.contains(search_string, case=False, na=False)]

In [60]:
#keyword_soup cleaning
df_games['keyword_soup'] = df_games['keyword_soup'].fillna('').astype(str)

import re
# Replace multiple spaces with a single space and strip leading/trailing spaces
df_games['name'] = df_games['name'].str.replace(r'\s+', ' ', regex=True).str.strip()


In [61]:
df_games.to_csv("df_games.csv", index=False)

In [62]:
#vector creation (final_production_vectors.npy)
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix


# --- 1. Define Your Final, Tuned Weights ---
# These weights prioritize content and publisher identity.
keyword_weight = 2.0
publisher_weight = 2.5
semantic_weight = 2.5
numerical_weight = 1.0

# --- 3. Build a Professional Preprocessing Pipeline ---

# Define the columns for each transformer
numerical_features = ['game_age', 'reviews_per_year', 'quality_score']
keyword_features = 'keyword_soup'
publisher_features = 'publisher_cleaned'

# Create the master preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('keywords', TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1,2), min_df=5), keyword_features),
        ('publisher', TfidfVectorizer(max_features=500), publisher_features) # Limit publisher features
    ],
    transformer_weights={
        'num': numerical_weight,
        'keywords': keyword_weight,
        'publisher': publisher_weight
    })

# --- 4. Process Data and Combine with Semantics ---

# Apply the entire pipeline to get weighted numerical, keyword, and publisher features
processed_features = preprocessor.fit_transform(df_games)

# Apply weight to the semantic vectors
weighted_semantics = embeddings_matrix * semantic_weight

# Combine all features into one matrix
combined_weighted_vectors = hstack([
    processed_features,
    csr_matrix(weighted_semantics)
]).astype('float32')

# --- 5. Dimensionality Reduction for Accuracy ---

# Reduce the combined features to their most important 300 signals
svd = TruncatedSVD(n_components=300, random_state=42)
reduced_vectors = svd.fit_transform(combined_weighted_vectors)

# --- 6. Normalize and Finalize ---

# Normalize the final vectors for accurate similarity search
final_vectors_normalized = normalize(reduced_vectors, norm='l2', axis=1)

# Convert to the final format for Faiss
final_vectors = final_vectors_normalized.astype('float32')

print("Successfully created final, production-ready feature vectors.")
print("Final vector shape:", final_vectors.shape)

# Save the final vectors for your engine
np.save("final_production_vectors.npy", final_vectors)

Successfully created final, production-ready feature vectors.
Final vector shape: (88886, 300)


In [63]:
#the main engine file
import faiss
import numpy as np
import pandas as pd

# --- Load all necessary components ---
df_games = pd.read_csv("df_games.csv")  # Or your full cleaned dataset CSV
vectors = np.load("final_production_vectors.npy")  # Load the improved hybrid vectors

# Verify the dimensions of the vectors
print(f"Shape of vectors: {vectors.shape}")

# Rebuild the Faiss index with the correct dimensions
d = vectors.shape[1]  # Number of dimensions
index = faiss.IndexFlatL2(d)  # L2 distance metric
index.add(vectors)  # Add the vectors to the index

title_to_index = pd.Series(df_games.index, index=df_games['name'])

def get_profile_recommendations(game_titles, ratings, k=6): # Added 'ratings' parameter
    """
    Finds and prints recommendations based on a weighted average profile of input games.
    """
    try:
        # Check if the number of games and ratings match
        if len(game_titles) != len(ratings):
            print("Error: The number of games and ratings must be the same.")
            return

        # 1. Check if all titles exist in the dataset
        missing_titles = [title for title in game_titles if title not in title_to_index.index]
        if missing_titles:
            print(f"Error: The following games were not found in the dataset: {', '.join(missing_titles)}")
            return
        
        # 2. Get the vectors for all input games
        input_vectors = [vectors[title_to_index[title]] for title in game_titles]
        
        # 3. Calculate the WEIGHTED average vector to create the "taste profile"
        query_vector = np.average(input_vectors, axis=0, weights=ratings).reshape(1, -1).astype('float32')
        
        # Verify the dimensions of the query vector
        print(f"Shape of query vector: {query_vector.shape}")

        # New line: Increase the search accuracy
        index.nprobe = 10 
        
        # 4. Search the Faiss index
        distances, indices = index.search(query_vector, k)
        
        print(f"--- Recommendations for a fan of {', '.join(game_titles)} ---")
        
        # 5. Print the results, filtering out the input games
        recs = []
        for i in range(k):
            rec_title = df_games.iloc[indices[0][i]]['name']
            if rec_title not in game_titles:
                recs.append(rec_title)
        
        for i, rec in enumerate(recs):
            print(f"{i+1}. {rec}")

    except KeyError as e:
        print(f"Error: Game {e} not found in the dataset.")

# --- Now, test with both games! ---
test_games = ["Counter Strike 2", "Grand Theft Auto V", "Red Dead Redemption 2"]
# Add a list of ratings (out of 10) to correspond to the games
test_ratings = [10, 10, 10]
# Call the function with the new ratings parameter
get_profile_recommendations(test_games, test_ratings)

Shape of vectors: (88886, 300)
Shape of query vector: (1, 300)
--- Recommendations for a fan of Counter Strike 2, Grand Theft Auto V, Red Dead Redemption 2 ---
1. Sons Of The Forest
2. Call of Duty
3. Cyberpunk 2077
4. Marvel Rivals
5. Baldur's Gate 3


In [64]:
#search(df_games, 'name', "assassin's creed")