In [15]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re

In [16]:
# Read in Ecommerce Data
df = pd.read_csv("Online Retail.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [17]:
df = df.dropna()

In [18]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom


## Find Similar Customers

In [19]:
df_pivot = df.pivot_table(index=['CustomerID'],
                columns=['StockCode'],
                values='Quantity',
                fill_value=0).reset_index()

purchase_counts = df_pivot.groupby('CustomerID').sum()
# Ensures no negative values
purchase_counts = np.maximum(purchase_counts, 0)
customer_similarities = pd.DataFrame(cosine_similarity(purchase_counts), index=purchase_counts.index, columns=purchase_counts.index)

In [20]:
def recommend_products(customer_id, n_recommendations=5):
    similar_customers = customer_similarities[customer_id].sort_values(ascending=False).index[1:]
    recommended_products = set()

    for similar_customer in similar_customers:
        products_bought = purchase_counts.loc[similar_customer][purchase_counts.loc[similar_customer] > 0].index
        products_not_bought = products_bought.difference(purchase_counts.loc[customer_id][purchase_counts.loc[customer_id] > 0].index)
        recommended_products.update(products_not_bought)

        if len(recommended_products) >= n_recommendations:
            break

    return list(recommended_products)[:n_recommendations]

In [35]:
def compareRecommendations(cID, prod_rec):
    comparison = {'Products Purchased': [], 'Products Recommended': []}
    comparison['Products Purchased'] = list(df[df['CustomerID'] == cID]['Description'])
    comparison['Products Recommended'] = set([list(df[df['StockCode'] == prodID]['Description'])[0] for prodID in prod_rec])
    return comparison

In [22]:
testID = np.random.choice(df['CustomerID'])
recommend_products(testID, 5)

['21977', '22501', '22667', '23243', '22585']

## Find Similar Products with TF-IDF Vectorizing

In [23]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [24]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join([stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens])

In [25]:
preprocess_vectorize = np.vectorize(preprocess_text)
cleaned_descriptions = preprocess_vectorize(df['Description'].unique())

In [26]:
tfIdf = TfidfVectorizer()
matrix = tfIdf.fit_transform(cleaned_descriptions)
similarities = cosine_similarity(matrix)

In [27]:
def get_similar_products(prod_id, n=5):
    idx = df.index[df['StockCode'] == prod_id][0]
    sim_scores = list(enumerate(similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:n+1]]
    return df.iloc[sim_indices][['StockCode', 'Description']]

similar_products = get_similar_products(prod_id='84029G', n=5)
print(similar_products)

     StockCode                        Description
5076     21260                      FIRST AID TIN
4712     22413    METAL SIGN TAKE IT OR LEAVE IT 
218      22910  PAPER CHAIN KIT VINTAGE CHRISTMAS
176     85099C     JUMBO  BAG BAROQUE BLACK WHITE
274      22726         ALARM CLOCK BAKELIKE GREEN


## Combine Recommendations

In [33]:
def combined_recommendations(customer_id, n_recommendations=5):
    similar_customers = customer_similarities[customer_id].sort_values(ascending=False).index[1:]
    customer_based_recs = set()
    
    for similar_customer in similar_customers:
        products_bought = purchase_counts.loc[similar_customer][purchase_counts.loc[similar_customer] > 0].index
        products_not_bought = products_bought.difference(purchase_counts.loc[customer_id][purchase_counts.loc[customer_id] > 0].index)
        customer_based_recs.update(products_not_bought)
        """if len(customer_based_recs) >= n_recommendations:
            # Stop at n_recommendations
            break"""
    
    product_based_recs = set()
    for product in customer_based_recs:
        try:
            similar_products_df = get_similar_products(prod_id=product, n=n_recommendations)
            product_based_recs.update(similar_products_df['StockCode'].values)
        except:
            continue
    
    final_recommendations = list(customer_based_recs.intersection(product_based_recs))[:n_recommendations]
    
    recommendations_df = df[df['StockCode'].isin(final_recommendations)][['StockCode', 'Description']].drop_duplicates()
    return recommendations_df


test_customer_id = np.random.choice(df['CustomerID'].dropna().unique())
recommendations = combined_recommendations(customer_id=test_customer_id, n_recommendations=5)
print(recommendations)

     StockCode                    Description
473      16237           SLEEPING CAT ERASERS
1015     22501     PICNIC BASKET WICKER LARGE
1096     20973  12 PENCIL SMALL TUBE WOODLAND
4310    15060B     FAIRY CAKE DESIGN UMBRELLA
4595     21026                      SPACE OWL


In [36]:
compareRecommendations(test_customer_id, recommendations['StockCode'])

{'Products Purchased': ['CUTE CATS TAPE',
  'STARS GIFT TAPE ',
  'RETRO PLASTIC POLKA TRAY',
  'BAKING SET 9 PIECE RETROSPOT ',
  'SET/20 RED RETROSPOT PAPER NAPKINS ',
  'SET OF SALT AND PEPPER TOADSTOOLS',
  'RED POLKADOT COFFEE  MUG',
  'SET/2 RED RETROSPOT TEA TOWELS ',
  'EDWARDIAN PARASOL BLACK'],
 'Products Recommended': {'12 PENCIL SMALL TUBE WOODLAND',
  'FAIRY CAKE DESIGN UMBRELLA',
  'PICNIC BASKET WICKER LARGE',
  'SLEEPING CAT ERASERS',
  'SPACE OWL'}}