In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds

In [6]:
class HybridRecommender:
    def __init__(self, min_rating=0, max_rating=5):
        self.min_rating = min_rating
        self.max_rating = max_rating
        self.user_item_matrix = None
        self.product_features = None
        self.product_ids = None
        self.user_ids = None
        
    def preprocess_data(self, df):
        """Preprocess data by handling duplicates and missing values."""
        # Handle duplicate user-product combinations by taking mean rating
        processed_df = df.groupby(['user_id', 'product_id']).agg({
            'rating': 'mean',
            'product_name': 'first',
            'category': 'first',
            'brand': 'first',
            'normalized_product_name': 'first',
            'normalized_about_product': 'first'
        }).reset_index()
        
        # Fill missing ratings with 0 or another default value
        processed_df['rating'] = processed_df['rating'].fillna(0)
        
        # Ensure all text columns are strings
        text_columns = ['normalized_product_name', 'normalized_about_product', 'category', 'brand']
        for col in text_columns:
            processed_df[col] = processed_df[col].fillna('').astype(str)
        
        return processed_df
        
    def prepare_user_item_matrix(self, df):
        """Create and normalize the user-item interaction matrix."""
        # Create the interaction matrix with aggregated ratings
        self.user_item_matrix = df.pivot(
            index='user_id', 
            columns='product_id', 
            values='rating'
        ).fillna(0)
        
        self.user_ids = self.user_item_matrix.index
        self.product_ids = self.user_item_matrix.columns
        
        # Normalize ratings
        matrix_norm = self.user_item_matrix.values
        self.user_ratings_mean = np.mean(matrix_norm, axis=1)
        matrix_norm = matrix_norm - self.user_ratings_mean.reshape(-1, 1)
        
        return matrix_norm
    
    def train_collaborative(self, matrix_norm, n_factors=50):
        """Train collaborative filtering using SVD."""
        # Handle case where n_factors is too large
        n_factors = min(n_factors, min(matrix_norm.shape) - 1)
        
        U, sigma, Vt = svds(matrix_norm, k=n_factors)
        sigma = np.diag(sigma)
        self.user_features = U
        self.item_features = Vt.T
        self.sigma = sigma
        
    def train_content_based(self, df):
        """Train content-based filtering using product features."""
        # Combine relevant text features
        text_features = df.groupby('product_id').agg({
            'normalized_product_name': 'first',
            'normalized_about_product': 'first',
            'category': 'first',
            'brand': 'first'
        })
        
        # Combine all text features and handle missing values
        text_features['combined_features'] = text_features.apply(
            lambda x: ' '.join([str(x['normalized_product_name']), 
                              str(x['normalized_about_product']),
                              str(x['category']),
                              str(x['brand'])]), axis=1
        )
        
        # Create TF-IDF matrix
        tfidf = TfidfVectorizer(stop_words='english')
        self.product_features = tfidf.fit_transform(text_features['combined_features'])
        
    def get_collaborative_recommendations(self, user_id, n_recommendations=5):
        """Get collaborative filtering recommendations for a user."""
        if user_id not in self.user_ids:
            return []
        
        user_idx = np.where(self.user_ids == user_id)[0][0]
        
        # Predict ratings
        predicted_ratings = (
            self.user_features[user_idx].dot(self.sigma).dot(self.item_features.T) +
            self.user_ratings_mean[user_idx]
        )
        
        # Get top recommendations
        already_rated = self.user_item_matrix.loc[user_id].gt(0)
        predicted_ratings[already_rated] = self.min_rating - 1
        
        top_products_idx = predicted_ratings.argsort()[-n_recommendations:][::-1]
        return self.product_ids[top_products_idx]
    
    def get_content_recommendations(self, product_id, n_recommendations=5):
        """Get content-based recommendations for a product."""
        if product_id not in self.product_ids:
            return []
        
        product_idx = np.where(self.product_ids == product_id)[0][0]
        
        # Calculate similarity scores
        sim_scores = cosine_similarity(
            self.product_features[product_idx:product_idx+1], 
            self.product_features
        ).flatten()
        
        # Get top recommendations
        sim_scores[product_idx] = -1  # Exclude the input product
        top_products_idx = sim_scores.argsort()[-n_recommendations:][::-1]
        return self.product_ids[top_products_idx]
    
    def get_hybrid_recommendations(self, user_id, n_recommendations=5, collaborative_weight=0.7):
        """Get hybrid recommendations combining both approaches."""
        if user_id not in self.user_ids:
            return []
            
        # Get collaborative recommendations
        collab_recs = self.get_collaborative_recommendations(
            user_id, 
            n_recommendations=n_recommendations*2
        )
        
        # Get content recommendations based on user's highly rated products
        user_ratings = self.user_item_matrix.loc[user_id]
        top_rated_products = user_ratings[user_ratings > user_ratings.mean()].index[:3]
        
        content_recs = []
        for product_id in top_rated_products:
            content_recs.extend(
                self.get_content_recommendations(
                    product_id, 
                    n_recommendations=n_recommendations
                )
            )
        
        # Combine recommendations with weights
        rec_dict = {}
        
        # Add collaborative recommendations
        for i, prod_id in enumerate(collab_recs):
            rec_dict[prod_id] = collaborative_weight * (len(collab_recs) - i)
            
        # Add content recommendations
        for i, prod_id in enumerate(content_recs):
            if prod_id in rec_dict:
                rec_dict[prod_id] += (1 - collaborative_weight) * (len(content_recs) - i)
            else:
                rec_dict[prod_id] = (1 - collaborative_weight) * (len(content_recs) - i)
        
        # Sort and return top recommendations
        sorted_recs = sorted(rec_dict.items(), key=lambda x: x[1], reverse=True)
        return [prod_id for prod_id, _ in sorted_recs[:n_recommendations]]

In [7]:
def create_recommendations(df, user_id, n_recommendations=5):
    """Create recommendations with proper data preprocessing."""
    # Initialize recommender
    recommender = HybridRecommender()
    
    # Preprocess data
    processed_df = recommender.preprocess_data(df)
    
    # Prepare data and train models
    matrix_norm = recommender.prepare_user_item_matrix(processed_df)
    recommender.train_collaborative(matrix_norm)
    recommender.train_content_based(processed_df)
    
    # Get recommendations
    recommendations = recommender.get_hybrid_recommendations(
        user_id,
        n_recommendations=n_recommendations
    )
    
    if not recommendations:
        return pd.DataFrame()  # Return empty DataFrame if no recommendations
    
    # Get recommended product details
    recommended_products = processed_df[processed_df['product_id'].isin(recommendations)].drop_duplicates('product_id')[
        ['product_id', 'product_name', 'category', 'brand', 'rating']
    ]
    
    return recommended_products

In [8]:
# Load DataFrame
df = pd.read_csv('/Users/anithasmac/Projects/CustomerJourneyMapping/Featured_Amazon_Data.csv')

In [11]:
import random

# Get unique user IDs
unique_users = df['user_id'].unique()

# Randomly select one user from the unique users
random_user_id = random.choice(unique_users)


In [12]:
# Get recommendations for a specific user
recommendations = create_recommendations(df, user_id=random_user_id, n_recommendations=5)
print(recommendations)

      product_id                                       product_name  \
159   B0B997FBZT  Acer 139 cm (55 inches) H Series 4K Ultra HD A...   
637   B08CT62BM1  Wayona USB Type C Fast Charging Cable Charger ...   
638   B08CTNJ985  Wayona USB Type C 65W Fast Charging 2M/6Ft Lon...   
1049  B081FG1QYX  Wayona Type C Cable Nylon Braided USB C QC 3.0...   
1050  B081FJWN52  Wayona Usb Type C To Usb Nylon Braided Quick C...   

                                               category   brand  rating  
159   Electronics|HomeTheater,TV&Video|Televisions|S...    Acer     4.3  
637   Computers&Accessories|Accessories&Peripherals|...  Wayona     4.3  
638   Computers&Accessories|Accessories&Peripherals|...  Wayona     4.3  
1049  Computers&Accessories|Accessories&Peripherals|...  Wayona     4.3  
1050  Computers&Accessories|Accessories&Peripherals|...  Wayona     4.3  
