In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("shopping_behavior_updated.csv")

### Understanding the data 

In [3]:
data.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [5]:
data.isnull().sum()

Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

#### Encoding categorical Variables

For content based filtering, need to combine textual features

For collaborative filtering, using the surprice library use customer id, item purchased and review rating

In [6]:
# For Content-Based Filtering
# No need to encode categorical variables yet; we'll combine them into a text field.

# For Collaborative Filtering
# Ensure 'Review Rating' is present and suitable for use as ratings.


## Content Based filtering

#### Feature engineering

In [7]:
# Combine relevant item attributes to create a single string that represents each item

In [8]:
# Step 1: Combine relevant features into a single 'combined_features' column
def combine_features(row):
    return row['Item Purchased'] + ' ' + row['Category'] + ' ' + row['Color'] + ' ' + row['Size']

# Apply the function to create the 'combined_features' column
data['combined_features'] = data.apply(combine_features, axis=1)

#### Vectorization and similarity calculation

Using TF-IDF vectorizer to convert the text data into numerical features and calculate cosine similarity between items

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Initialize TF-IDF vectorizer 
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

# compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

# create a reverse mapping of indices and item names
indices = pd.Series(data.index, index=data['Item Purchased'].str.lower()).drop_duplicates()

print("Cosine Similarity Matrix Shape:", cosine_sim.shape)

Cosine Similarity Matrix Shape: (3900, 3900)


### Recommendation Function

Creating a function that takes an "Item Purchased" as input and returns similar items

In [20]:
def get_content_based_recommendation(item, cosine_sim = cosine_sim):
    item = item.lower()
    if item not in indices:
        print(f"'{item}' not found in the dataset.")
        return []
    
    # get the index of the item of the first occurance of the item
    idx = indices[item].iloc[0]
   
    
    #Get the pairwise similarity scores for that item
    sim_scores = list(enumerate(cosine_sim[idx]))

    
    #Sort based on similarity scores
    sim_scores = sorted(sim_scores, key = lambda x:x[1], reverse=True)
    
    # get the scores of the top 5 similar items (excluding iteslf)
    sim_scores = sim_scores[1:6]
    
    # get the item indices
    item_indices = [i[0] for i in sim_scores]
    
    # return the top 5 similar items
    return data['Item Purchased'].iloc[item_indices]

#Example usage
recommended_items = get_content_based_recommendation('jeans')
print("Recommended Items", recommended_items.tolist())



Recommended Items ['Jeans', 'Jeans', 'Jeans', 'Jeans', 'Jeans']


In [19]:
def get_content_based_recommendation(item, cosine_sim=cosine_sim):
    item = item.lower()  # Convert the item to lowercase for matching
    if item not in indices:
        print(f"'{item}' not found in the dataset.")
        return []
    
    # Get all indices of the item
    idx_list = indices[item]
    
    # Aggregate the similarity scores by averaging across all indices
    agg_sim_scores = cosine_sim[idx_list].mean(axis=0)
    
    # Get the pairwise similarity scores for the aggregated result
    sim_scores = list(enumerate(agg_sim_scores))
   

    # Sort the items based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top 5 similar items (excluding the input item itself)
    sim_scores = sim_scores[1:6]
    
    # Get the item indices
    item_indices = [i[0] for i in sim_scores]
    
    # Return the top 5 most similar items
    return data['Item Purchased'].iloc[item_indices]

#Example usage
recommended_items = get_content_based_recommendation('blouse')
print("Recommended Items", recommended_items.tolist())


Recommended Items ['Blouse', 'Blouse', 'Blouse', 'Blouse', 'Blouse']


## Collaborative filtering

Recommends filtering recommends items based on user interactions. Here I'm using surprise library for this purpose

#### Preparing the user-item matrix

Using customer id, item purchased and review rating as the primary features for collaborative filtering

In [21]:
cf_data = data[['Customer ID','Item Purchased','Review Rating']]

cf_data.head()

Unnamed: 0,Customer ID,Item Purchased,Review Rating
0,1,Blouse,3.1
1,2,Sweater,3.1
2,3,Jeans,3.1
3,4,Sandals,3.5
4,5,Blouse,2.7


### Model training and prediction using surprise 

In [22]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [23]:
# Define a reader object with the rating_scale
reader = Reader(rating_scale=(1,5))

#Load the data from the dataframe
data_2 = Dataset.load_from_df(cf_data,reader)

# split into train and tests sets
train, test = train_test_split(data_2, test_size=0.2, random_state=42)

# Intialize the SVD algorithm
algo = SVD()

# train the algorithm on the trainset
algo.fit(train)

# Make predictions on the testset
predictions = algo.test(test)

# evaluate the performance
from surprise import accuracy

rmse = accuracy.rmse(predictions)
print(f"Collaborative Filtering RMSE: {rmse}")

RMSE: 0.7182
Collaborative Filtering RMSE: 0.718226619604839


### Making the recommendations

In [27]:
def get_collaborative_recommendations(user_id, top_n=5):
    #get all items
    all_items = data['Item Purchased'].unique()
    
    # get items already interacted with by the user
    user_items = data[data['Customer ID'] == user_id]['Item Purchased'].unique()
    
    #get items not yet interacted 
    items_to_predict = [item for item in all_items if item not in user_items]
    
    #predict ratings for these items
    predictions = [algo.predict(user_id, item) for item in items_to_predict]
    
    #sort predictions by estimated rating
    predictions.sort(key=lambda x: x.est, reverse = True)
    
    # get the top n recommendations
    top_recommendations = predictions[:top_n]
    
    #return the recommended item names
    return [pred.iid for pred in top_recommendations]


#example usage
user_id = 1
recommended_items_cf = get_collaborative_recommendations(user_id)
print(f"Collaborative Filtering Recommendations for user {user_id}:", recommended_items_cf)

Collaborative Filtering Recommendations for user 1: ['Socks', 'Coat', 'Belt', 'Jacket', 'Sandals']


In [30]:
def get_collaborative_recommendations(user_id, top_n=5):
    #get all items
    all_items = data['Item Purchased'].unique()
    
    # get items already interacted with by the user
    user_items = data[data['Customer ID'] == user_id]['Item Purchased'].unique()
    
    #get items not yet interacted 
    items_to_predict = [item for item in all_items if item not in user_items]
    
    #predict ratings for these items
    predictions = [algo.predict(user_id, item) for item in items_to_predict]
    
    #sort predictions by estimated rating
    predictions.sort(key=lambda x: x.est, reverse = True)
    
    # get the top n recommendations
    top_recommendations = predictions[:top_n]
    
    #return the recommended item names
    return [pred.iid for pred in top_recommendations]


#example usage
user_id = 11
recommended_items_cf = get_collaborative_recommendations(user_id)
print(f"Collaborative Filtering Recommendations for user {user_id}:", recommended_items_cf)

Collaborative Filtering Recommendations for user 11: ['T-shirt', 'Gloves', 'Belt', 'Jeans', 'Dress']
