# Collaborative Filtering for our Transaction Datasets

In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics
import tqdm as notebook_tqdm
from sklearn.preprocessing import MinMaxScaler
import itertools
import implicit

In [2]:
retail_df = pd.read_csv('Data/modified_receipt_data.csv')

In [3]:
retail_df.head(10)

Unnamed: 0,STORE_ID,RECEIPT_NUM,CUSTOMER_ID,STYLE_ID,QUANTITY_SOLD,PRICE,TRANSACTION_DATE,Product_name
0,1,19408728,668082,158499,1.0,10.19,2021-12-18,Mast Store Outfitters Mountain Short Sleeve T-...
1,1,19408728,668082,174492,1.0,104.99,2021-12-18,Women's Talus AT UltraDry Boots
2,1,19408728,668082,183665,1.0,13.19,2021-12-18,Mast Store Outfitters Logo Long Sleeve T-Shirt
3,1,19408730,340737,4123,1.0,10.49,2021-12-18,Essential Burt's Bees Kit
4,1,19408730,340737,115940,1.0,9.09,2021-12-18,Royal Jelly Body Butter - Original - 1.65 Ounce
5,1,19408730,340737,137361,1.0,17.49,2021-12-18,Royal Jelly Body Butter - Tupelo Honey - 6.7 O...
6,1,19408808,0,203554,1.0,10.99,2021-12-18,Max Plush Toy - 7 Inch
7,1,19408844,0,170049,1.0,5.99,2021-12-18,Mast Store Provisioners Spiced Apple Cider
8,1,19408845,0,204697,1.0,8.99,2021-12-18,Boom Bloom Expandable Vase
9,1,19408916,0,69502,2.0,3.99,2021-12-18,Bandana


In [26]:
retail_df.shape

(1562224, 8)

In [4]:
retail_df[retail_df["CUSTOMER_ID"]==0]

Unnamed: 0,STORE_ID,RECEIPT_NUM,CUSTOMER_ID,STYLE_ID,QUANTITY_SOLD,PRICE,TRANSACTION_DATE,Product_name
6,1,19408808,0,203554,1.0,10.99,2021-12-18,Max Plush Toy - 7 Inch
7,1,19408844,0,170049,1.0,5.99,2021-12-18,Mast Store Provisioners Spiced Apple Cider
8,1,19408845,0,204697,1.0,8.99,2021-12-18,Boom Bloom Expandable Vase
9,1,19408916,0,69502,2.0,3.99,2021-12-18,Bandana
10,1,19408916,0,83702,1.0,9.95,2021-12-18,Hummingbirds Notecards
...,...,...,...,...,...,...,...,...
1531218,15,20993515,0,217366,1.0,28.00,2022-12-18,Men's Stretch Jersey Henley Shirt
1535610,97,19682282,0,144670,1.0,209.95,2022-03-01,Super 550 Series Boots
1537635,97,19850382,0,124424,1.0,0.00,2022-04-18,Mast Store Provisioners Traffic Jam
1540482,97,19903497,0,214610,1.0,22.49,2022-05-03,Women's Roanoke Vintage Poncho Short Sleeve Top


In [5]:
retail_df.drop(retail_df[retail_df['CUSTOMER_ID'] == 0].index, inplace = True)

In [6]:
retail_df.columns = retail_df.columns.str.lower()

In [7]:
retail_df.sample(5)

Unnamed: 0,store_id,receipt_num,customer_id,style_id,quantity_sold,price,transaction_date,product_name
923794,8,20010115,23060,25512,1.0,3.59,2022-05-29,Mast Store Provisioners Salted Gourmet Virgini...
1532491,97,19509493,682000,130039,1.0,25.0,2021-12-30,Classic Flask
1539772,97,19902091,1148757,191958,1.0,21.0,2022-05-02,Men's Scout Boot Sock Cushion Socks
1545140,97,20354727,1155698,217784,3.0,11.49,2022-08-11,Women's Weekend Gallery Footsie Socks
773638,7,20147835,116190,110630,1.0,16.99,2022-06-30,Spirograph Design Set in Collectible Tin


In [8]:
retail_df.shape

(81768, 8)

In [10]:
# #to make values in df lowercase
# df = pd.DataFrame(data)
# df['col_1'] = df['col_1'].str.lower()
# print(df)

In [9]:
retail_df.columns

Index(['store_id', 'receipt_num', 'customer_id', 'style_id', 'quantity_sold',
       'price', 'transaction_date', 'product_name'],
      dtype='object')

In [10]:
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81768 entries, 0 to 1562223
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   store_id          81768 non-null  int64  
 1   receipt_num       81768 non-null  int64  
 2   customer_id       81768 non-null  int64  
 3   style_id          81768 non-null  int64  
 4   quantity_sold     81768 non-null  float64
 5   price             81768 non-null  float64
 6   transaction_date  81768 non-null  object 
 7   product_name      81768 non-null  object 
dtypes: float64(2), int64(4), object(2)
memory usage: 5.6+ MB


In [11]:
len(retail_df["customer_id"].unique())

16892

In [35]:
retail_df['quantity_sold'] = retail_df['quantity_sold'].astype("float32")
#retail_df['QUANTITY'] = retail_df['QUANTITY'].astype("float32")
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81768 entries, 0 to 1562223
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   store_id          81768 non-null  int64  
 1   receipt_num       81768 non-null  int64  
 2   customer_id       81768 non-null  int64  
 3   style_id          81768 non-null  int64  
 4   quantity_sold     81768 non-null  float32
 5   price             81768 non-null  float64
 6   transaction_date  81768 non-null  object 
 7   product_name      81768 non-null  object 
dtypes: float32(1), float64(1), int64(4), object(2)
memory usage: 5.3+ MB


There are a lot of “CustomerID” were missing from the data, so we will have to remove those rows.

Group “CustomerID” and “StockCode” then sum the “Quantity”. So that we get each **customer and each item interactions**.
If “Quantity” = 0, we change to one.
Eliminate negative “Quantity”.

In [15]:
#retail_df.drop(retail_df[retail_df['CUSTOMER_ID'] == 0].index, inplace = True)

In [36]:
retail_df['quantity_sold'].unique()

array([  1.  ,   3.  ,   4.  ,   2.  ,  10.  ,   6.  ,  12.  ,   5.  ,
        36.  ,  37.  ,  16.  ,  28.  ,  17.  ,  21.  ,  14.  ,   7.  ,
         8.  , 106.  ,  30.  ,  35.  ,  11.  ,  27.  ,  34.  ,  43.  ,
         9.  ,  18.  ,  39.  ,  15.  , 231.  , 164.  ,  25.  ,  19.  ,
       190.  ,  23.  ,  32.  ,  24.  ,   2.14,  20.  ,  54.  ,  26.  ,
        13.  ,  42.  ,   3.98,  91.  ,  72.  ,  22.  , 134.  ,  31.  ,
        61.  ,  50.  ,  33.  ,  66.  ,  63.  ,  73.  , 161.  ,  41.  ,
        46.  ,  83.  ,  49.  ,  44.  ,  51.  ,  40.  ,   0.5 ,  56.  ,
        62.  ,  60.  ,  45.  ,  29.  ,  48.  ], dtype=float32)

In [12]:
grouped_df = retail_df[['customer_id', 'style_id', 'product_name','quantity_sold',]].groupby(['customer_id', 'style_id', 'product_name']).sum().reset_index()

In [38]:
grouped_df.shape

(63866, 4)

In [13]:
grouped_df.head()

Unnamed: 0,customer_id,style_id,product_name,quantity_sold
0,2,17602,Honees Liquid Honey Cough Drops,1.0
1,2,17703,Necco Wafers Candy - Assorted,1.0
2,2,34053,Little Red Wagon,1.0
3,2,83574,Magic Ball & Vase Trick Toy,1.0
4,2,90813,Mast Store Provisioners Raspberry Jalapeno Jam,1.0


In [20]:
grouped_df.quantity_sold.describe()

count    63866.000000
mean         1.862707
std          5.169692
min          0.500000
25%          1.000000
50%          1.000000
75%          2.000000
max        802.000000
Name: quantity_sold, dtype: float64

In [21]:
#grouped_df.loc[grouped_df['style_id'] == 433]

In [22]:
#grouped_df.loc[grouped_df['customer_id'] == 24137].sort_values('quantity_sold', ascending=False)[['style_id','customer_id', 'product_name','quantity_sold']].head(20)

The vast majority of customers purchased less than 40 pieces of same item in one interaction, very few of them purchased more than 2,000 pieces of same item in one interaction.

In [14]:
print(f'Number of unique customers: {grouped_df.customer_id.nunique()}')
print(f'Number of unique items: {grouped_df.style_id.nunique()}')

print(f'Average purchase quantity per interaction: {int(grouped_df.quantity_sold.mean())}')
print(f'Minimum purchase quantity per interaction: {grouped_df.quantity_sold.min()}')
print(f'Maximum purchase quantity per interaction: {grouped_df.quantity_sold.max()}')

Number of unique customers: 16892
Number of unique items: 7165
Average purchase quantity per interaction: 1
Minimum purchase quantity per interaction: 0.5
Maximum purchase quantity per interaction: 802.0


**Implicit Feedback**

Instead of representing an explicit rating, the “Quantity” can represent a “confidence” in terms of how strong the interaction was. Items with a larger number of “Quantity” by a customer can carry more weight in our ratings matrix of “Quantity”.

In [15]:
unique_customers = grouped_df.customer_id.unique()
customer_ids = dict(zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32)))

unique_items = grouped_df.style_id.unique()
item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))

grouped_df['user_id'] = grouped_df.customer_id.apply(lambda i: customer_ids[i])
grouped_df['item_id'] = grouped_df.style_id.apply(lambda i: item_ids[i])

In [25]:
unique_customers = grouped_df.customer_id.unique()
print(unique_customers.shape[0])
customer_ids = dict(zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32)))
slicedDict = dict(itertools.islice(customer_ids.items(), 0 ,10))
print(slicedDict)
#print(customer_ids.keys())


16892
{2: 0, 14: 1, 15: 2, 61: 3, 78: 4, 81: 5, 87: 6, 108: 7, 139: 8, 168: 9}


In [26]:
unique_items = grouped_df.style_id.unique()
item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))
#print(item_ids.values())
#print(item_ids.keys())
print(unique_items.shape[0])

slicedDict = dict(itertools.islice(item_ids.items(), 0 ,10))

print(slicedDict)

7165
{17602: 0, 17703: 1, 34053: 2, 83574: 3, 90813: 4, 94377: 5, 113925: 6, 120018: 7, 128111: 8, 136526: 9}


In [22]:
grouped_df.head()

Unnamed: 0,customer_id,style_id,products,quantity_sold,user_id,item_id
0,2,17602,honees liquid honey cough drops,1.0,0,0
1,2,17703,necco wafers candy - assorted,1.0,0,1
2,2,34053,little red wagon,1.0,0,2
3,2,83574,magic ball & vase trick toy,1.0,0,3
4,2,90813,mast store provisioners raspberry jalapeno jam,1.0,0,4


We will create numeric “customer_id and “item_id” columns.
Create two matrices, one for fitting the model (item-customer) and another one for recommendations (customer-item).

In [19]:
grouped_df.rename(columns = {'product_name':'products'}, inplace = True)

In [21]:
grouped_df['products'] = grouped_df['products'].apply(lambda x: str(x).lower())

In [23]:
grouped_df.to_pickle("./grouped_data_all_store.pkl")

In [2]:
unpickled_df = pd.read_pickle("./grouped_data_all_store.pkl")
unpickled_df.head()

Unnamed: 0,customer_id,style_id,products,quantity_sold,user_id,item_id
0,2,17602,honees liquid honey cough drops,1.0,0,0
1,2,17703,necco wafers candy - assorted,1.0,0,1
2,2,34053,little red wagon,1.0,0,2
3,2,83574,magic ball & vase trick toy,1.0,0,3
4,2,90813,mast store provisioners raspberry jalapeno jam,1.0,0,4


In [48]:
sparse_item_customer = sparse.csr_matrix((unpickled_df['quantity_sold'].astype(float), (unpickled_df['item_id'], unpickled_df['user_id'])))
sparse_customer_item = sparse.csr_matrix((unpickled_df['quantity_sold'].astype(float), (unpickled_df['user_id'], unpickled_df['item_id'])))

In [8]:
sparse_item_customer

<7165x16892 sparse matrix of type '<class 'numpy.float64'>'
	with 63866 stored elements in Compressed Sparse Row format>

In [49]:
sparse.save_npz('sparse_item_customer_vs.npz', sparse_item_customer)
sparse_item_customer_new = sparse.load_npz('sparse_item_customer_vs.npz')
sparse_item_customer_new

<7165x16892 sparse matrix of type '<class 'numpy.float64'>'
	with 63866 stored elements in Compressed Sparse Row format>

In [50]:
sparse.save_npz('sparse_customer_item_vs.npz', sparse_customer_item)
sparse_customer_item_new = sparse.load_npz('sparse_customer_item_vs.npz')
sparse_customer_item_new

<16892x7165 sparse matrix of type '<class 'numpy.float64'>'
	with 63866 stored elements in Compressed Sparse Row format>

ALS is an iterative optimization process where we for every iteration try to arrive closer and closer to a factorized representation of our original data.
We set the type of our matrix to double for the ALS function to run properly.

In [1]:
model = implicit.als.AlternatingLeastSquares(factors=50)
#AlternatingLeastSquares(factors=100, regularization=0.01, dtype=np.float32, use_native=True, use_cg=True, use_gpu=implicit.cuda.HAS_CUDA, iterations=15, calculate_training_loss=False, num_threads=0, random_state=None)
alpha = 15
data = (sparse_customer_item * alpha).astype('double')

model.fit(data)
#80,300
#use_gpu=True,calculate_training_loss=True,use_cg=True,

NameError: name 'implicit' is not defined

In [12]:
print(model)

<implicit.cpu.als.AlternatingLeastSquares object at 0x00000245D7D1FC70>


In [13]:
# find related items
related = model.similar_items(89)
related

(array([  89,  980, 6868, 1568,  594, 3972, 2589, 3649, 1003, 6479]),
 array([1.        , 0.67287254, 0.6609791 , 0.6221852 , 0.5870612 ,
        0.5838666 , 0.5795336 , 0.5778222 , 0.5747155 , 0.5727518 ],
       dtype=float32))

In [22]:
import pickle
Pkl_Filename = "ALS_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

In [4]:
# Load the Model back from file
import pickle
Pkl_Filename=("ALS_Model.pkl")
with open(Pkl_Filename, 'rb') as file:  
    Pickled_ALS_Model = pickle.load(file)

Pickled_ALS_Model

<implicit.cpu.als.AlternatingLeastSquares at 0x245d7d1f850>

Finding the Similar Items

In [4]:
unpickled_df.loc[unpickled_df['item_id'] == 31].head()

Unnamed: 0,customer_id,style_id,products,quantity_sold,user_id,item_id
31,2,220364,mast general store euro badge t-shirt,1.0,0,31
11705,116190,220364,mast general store euro badge t-shirt,2.0,233,31
15971,272282,220364,mast general store euro badge t-shirt,1.0,390,31
16076,274460,220364,mast general store euro badge t-shirt,1.0,396,31
17278,309035,220364,mast general store euro badge t-shirt,1.0,459,31


In [None]:
grouped_df.loc[grouped_df.index == 9172].head()

Unnamed: 0,customer_id,style_id,product_name,quantity_sold,user_id,item_id
9172,116190,192706,Front Range Leash,2.0,233,4145


Life Gets Better With Grandkids Short Sleeve T-Shirt


In [95]:
unpickled_df.loc[unpickled_df['product_name'] == "Clear Bryant Crossbody Bag"].head()

Unnamed: 0,customer_id,style_id,product_name,quantity_sold,user_id,item_id
11993,116190,225081,Clear Bryant Crossbody Bag,1.0,233,5595
12892,122226,225081,Clear Bryant Crossbody Bag,1.0,250,5595


Finding the 10 most similar items to “SWIVEL QD BRNG BLR 81”.

Get the customer and item vectors from our trained model.
Calculate the vector norms.
Calculate the similarity score.
Get the top 10 items.
Create a list of item-score tuples of most similar items with this item.

In [20]:
item_id = 31
n_similar = 10

item_vecs = model.item_factors
customer_vecs = model.user_factors

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

scores = item_vecs.dot(item_vecs[item_id]) / item_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])

In [15]:
similar    

[(89, 1.0),
 (980, 0.67287254),
 (6868, 0.6609791),
 (1568, 0.6221852),
 (594, 0.5870612),
 (3972, 0.5838666),
 (2589, 0.5795336),
 (3649, 0.5778222),
 (1003, 0.5747155),
 (6479, 0.5727518)]

[  89,  980, 6868, 1568,  594, 3972, 2589, 3649, 1003, 6479]),

In [21]:
for item in similar:
    idx, score = item
    try:
        print(unpickled_df.product_name.loc[unpickled_df.item_id== idx].iloc[0])
    except:
        print(unpickled_df.product_name.loc[unpickled_df.index== idx].iloc[0])

Mast General Store Euro Badge T-Shirt
Mast General Store Est 1883 Short Sleeve T-Shirt
Toddler Mast General Store Stacked Logo T-Shirt
Infant Born To Be Wild Long Sleeve Onesie
Disco String Lights
Mast General Store Take Me to Roanoke Short Sleeve T-Shirt
Youth Boonerang Multi-Color Short Sleeve T-Shirt
Greenville Moon Short Sleeve T-Shirt
Mast Store Outfitters Short Sleeve T-Shirt
Medium Flex Bottle Boot


In [6]:
#item_id = 3
n_similar = 10

def similar_items(item_id):
    item_vecs = Pickled_ALS_Model.item_factors
    customer_vecs = Pickled_ALS_Model.user_factors

    item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

    scores = item_vecs.dot(item_vecs[item_id]) / item_norms
    top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
    similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])
    list_similar=[]
    for item in similar:
        idx, score = item
        list_similar.append(unpickled_df.product_name.loc[unpickled_df.item_id == idx].iloc[0])
    return list_similar

In [7]:
similar_items(item_id=89)

['Mast Store Provisioners Hot Pickled Okra',
 'Model 4 Fixed Blade Knife',
 'Have a Little Pun Notecards',
 'Flat Out Knife',
 'Chogan Woods T-Hawk Leather Sheath',
 'Aegis AT Tanto Knife',
 'Asheville Hanging With The Locals Long Sleeve T-Shirt',
 'Mast Store Provisioners Hot Jalapeno Dill Pickles',
 'Asheville Leather Pint Glass Sleeve',
 'Asheville Mast General Store Tie Dye Hoodie']

Implicit also has built in functions for recommendations and similar items.

**Recommend Items to Customers**

In [158]:
def recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs,num_items=10):
    
    customer_interactions = sparse_customer_item[customer_id,:].toarray()
    customer_interactions = customer_interactions.reshape(-1) + 1
    customer_interactions[customer_interactions > 1] = 0
    
    rec_vector = customer_vecs[customer_id,:].dot(item_vecs.T).toarray()
    
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = customer_interactions * rec_vector_scaled

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    
    descriptions = []
    scores = []

    for idx in item_idx:
        descriptions.append(unpickled_df.product_name.loc[unpickled_df.item_id == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'DESCRIPTION': descriptions, 'Score': scores})

    return recommendations

Create recommendations for customer with id 2.

In [168]:
customer_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for customer with id 2
customer_id = 200

recommendations = recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs)

print(recommendations)

                                         DESCRIPTION     Score
0  Mast Store Provisioners Salted Gourmet Virgini...  0.883017
1         Mast Store Provisioners Sweet Fire Pickles  0.781997
2              Mast Store Provisioners Hot Chow Chow  0.774145
3            Assorted Spiced Gum Drops Candy - 1 lb.  0.770193
4                      Lil' Ranger Rifle Cap Toy Gun  0.748265
5          Old Fashioned Cinnamon Sanded Drops Candy  0.742805
6                          Candy Barrel Reusable Bag  0.696622
7          Yellow Old Fashioned Southern Style Grits  0.686508
8        Old Fashioned Watermelon Sanded Drops Candy  0.675222
9                             Original Potato Chips   0.645393


Now we have top 10 recommendations for customer_id 2. Do they make sense? Let’s get top 20 items this customer has purchased.

In [169]:
grouped_df.loc[grouped_df['user_id'] == 200].sort_values('quantity_sold', ascending=False)[['customer_id', 'product_name', 'quantity_sold']].head(20)


Unnamed: 0,customer_id,product_name,quantity_sold
6217,93645,Mast Store Provisioners Mild Chow Chow,2.0
6215,93645,Pixy Stix Candy - 1 lb.,1.0
6216,93645,Old Fashioned Root Beer Sanded Drops Candy,1.0
6218,93645,Mast Store Provisioners House Blend Whole Bean...,1.0
6219,93645,Sea Salt & Cracked Black Pepper Pork Rinds,1.0


This customer’s top purchases were lip glosses, designed tissues and holiday cake cases, etc things like people purchase when hosting holiday parties. The items we recommended to him (her) includes fruit straws, gift boxes, cocktail parasols, etc. Those are also things people purchase when hosting a party.

Evaluation the RecSys

In [None]:
item="banana"
costs=24
print("the %s cost is rupees %d "%(item,costs))

the banana cost is rupees 24 


In [None]:
dict={"item":"banana","cost":24}
print("the {} cost is rupees {}".format(item,costs))

the banana cost is rupees 24


In [None]:
a=["cat","dog","monkey"]
#print(a)
b=[]
for i in a:
    i.split()
    #print(i)
    
    #b.append(list(i))
    c=list(i)
    #b.split()
    #b.append(i)
    print(c)
    
    
    

['c', 'a', 't']
['d', 'o', 'g']
['m', 'o', 'n', 'k', 'e', 'y']
