In [1]:
import random
import pandas as pd
import numpy as np

import pickle
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
import re
import os
from sklearn.preprocessing import MinMaxScaler

# ------------------------------
# Loading and processing data
# ------------------------------

# Keep only the seller id, the mapped category and the purchase count,
# and give them short working names.
raw_data = pd.read_csv('subcat_378.csv')
raw_data = raw_data.loc[:, ['GLUSR', 'MAPPED_MCAT', 'PURCHASE_CNT']]
raw_data.columns = ['user', 'mcat', 'purchase_cnt']

# Drop rows with a missing value in any of the three columns. The copy keeps
# the column assignments below from writing into a slice of raw_data.
data = raw_data.dropna()
data = data.copy()

# Create a numeric user_id and mcat_id column from the category codes.
data['user'] = data['user'].astype("category")
data['mcat'] = data['mcat'].astype("category")
data['user_id'] = data['user'].cat.codes
data['mcat_id'] = data['mcat'].cat.codes

# Lookup frames so the original mcat / user values can be recovered from the
# numeric ids later. Ids are stored as strings because the lookups further
# down compare against str(idx).
item_lookup = data[['mcat_id', 'mcat']].drop_duplicates()
item_lookup['mcat_id'] = item_lookup.mcat_id.astype(str)

user_lookup = data[['user_id', 'user']].drop_duplicates()
# Fixed: this used to be filled from item_lookup.mcat_id, which overwrote the
# user ids with index-aligned mcat ids.
user_lookup['user_id'] = user_lookup.user_id.astype(str)

data = data.drop(['user', 'mcat'], axis=1)

# The zero-purchase filter below is commented out, so rows with
# purchase_cnt == 0 stay in the data.
#data = data.loc[data.purchase_cnt != 0]

# Sorted unique ids (they define the matrix dimensions) and the raw counts.
users = list(np.sort(data.user_id.unique()))
mcats = list(np.sort(data.mcat_id.unique()))
purchases = list(data.purchase_cnt)

# Row (user) and column (mcat) index of every interaction.
rows = data.user_id.astype(int)
cols = data.mcat_id.astype(int)

# Sparse user-by-mcat matrix of purchase counts. Building a csr_matrix from
# (data, (row, col)) sums repeated (user, mcat) pairs into a single cell.
data_sparse_new = sparse.csr_matrix((purchases, (rows, cols)), shape=(len(users), len(mcats)))

In [5]:
# Read the sample workbook (default sheet) and print it as an example of the
# input layout.
D_sample = pd.read_excel('Sample_head.xlsx')
print(D_sample)

    User_ID                      MCAT  Number of Leads Purchased
0         1       Android POS Machine                        121
1         2                   Scanner                          3
2         3  Electronic Cash Register                          0
3         4            Cash Registers                         56
4         1      Spot Billing Machine                          4
5         1         Bluetooth Printer                          1
6         2            Coaxial Cables                         76
7         3            BNC Connectors                         23
8         3             AC DC Adapter                         10
9         3       Power Over Ethernet                         10
10        4          LED Power Supply                          5


In [10]:
# Read the sheet named 'Sheet2' from the same workbook and print its last
# 16 rows.
D2_sample = pd.read_excel('Sample_head.xlsx',sheet_name='Sheet2')
print(D2_sample.tail(16))

    user_id        mcat_recommended     score
10      460          Coaxial Cables  0.888891
11      460          BNC Connectors  0.900179
12      460           AC DC Adapter  0.909324
13      460     Power Over Ethernet  0.920662
14      460        LED Power Supply  0.920834
15      460             Tower Light  0.928179
16      460          Male Connector  0.940248
17      460           Power Adapter  0.878361
18      460              POE Switch  0.959206
19      460  Thermal Imaging Camera  0.982029
20     1977            Food Warmers  1.024220
21     1977   Decorative Table Lamp  0.997653
22     1977              Brass Lamp  0.988891
23     1977             Lamp Shades  0.987453
24     1977        Decorative Lamps  0.946331
25     1977          Fancy Lanterns  0.943662


In [7]:
# Report how many distinct users / mcats and how many interaction rows were
# loaded, next to the shape of the sparse matrix built from them.
print("Length of Users :",len(users))
print("Length of Mcats :",len(mcats))
print("Length of purchase :",len(purchases))

print("Dimension of the Matrix :",data_sparse_new.shape)

Length of Users : 473
Length of Mcats : 266
Length of purchase : 14532
Dimension of the Matrix : (473, 266)


In [8]:
# Preview the first rows of the processed interaction table.
data.head()

Unnamed: 0,purchase_cnt,user_id,mcat_id
0,0,3,27
1,1,11,92
2,4,11,92
3,7,11,92
4,0,11,18


In [9]:
 
""" 
Implementation of Alternating Least Squares with implicit data. We iteratively
    compute the user (x_u) and item (y_i) vectors using the following formulas:
 
    x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (Y.T * Cu * p(u))
    y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (X.T * Ci * p(i))
 
    Args:
        sparse_data (csr_matrix): Our sparse user-by-item matrix
 
        alpha_val (int): The rate in which we'll increase our confidence
        in a preference with more interactions.
 
        iterations (int): How many times we alternate between fixing and 
        updating our user and item vectors
 
        lambda_val (float): Regularization value
 
        features (int): How many latent features we want to compute.
    
    Returns:     
        X (csr_matrix): user vectors of size users-by-features
        
        Y (csr_matrix): item vectors of size items-by-features
     """

   

def _solve_factor(fixed, fixed_gram, conf_minus_one, reg):
    """Solve the regularised weighted least-squares system for one user or item.

    Args:
        fixed (ndarray): (n, features) factors of the side held constant.
        fixed_gram (ndarray): Precomputed ``fixed.T.dot(fixed)``.
        conf_minus_one (ndarray): 1-D array of length n holding
            ``alpha * interactions``, i.e. the diagonal of ``C - I``.
        reg (ndarray): ``lambda * I`` of size features-by-features.

    Returns:
        ndarray: The updated factor vector of length ``features``.
    """
    # Binary preference: 1 wherever there is any interaction, else 0.
    preference = (conf_minus_one != 0).astype(float)

    # fixed.T * (C - I): broadcasting scales column j of fixed.T by entry j.
    weighted_t = fixed.T * conf_minus_one

    lhs = fixed_gram + weighted_t.dot(fixed) + reg
    # fixed.T * C * p, using C = (C - I) + I.
    rhs = (weighted_t + fixed.T).dot(preference)
    return np.linalg.solve(lhs, rhs)


def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
    """Alternating Least Squares for implicit-feedback data.

    User vectors x_u and item vectors y_i are updated in turn with

        x_u = (Y.T*Y + Y.T*(Cu - I)*Y + lambda*I)^-1 * (Y.T * Cu * p(u))
        y_i = (X.T*X + X.T*(Ci - I)*X + lambda*I)^-1 * (X.T * Ci * p(i))

    where C - I = alpha_val * sparse_data and p is 1 wherever the confidence
    is non-zero. The factors are kept as dense arrays while iterating and are
    converted to CSR on return.

    Args:
        sparse_data (csr_matrix): User-by-item interaction matrix.
        alpha_val (int): Scale applied to the interactions to get confidence.
        iterations (int): Number of user-update / item-update rounds.
        lambda_val (float): Regularization value.
        features (int): Number of latent features.

    Returns:
        X (csr_matrix): user vectors of size users-by-features
        Y (csr_matrix): item vectors of size items-by-features

    Raises:
        numpy.linalg.LinAlgError: If a per-user or per-item system is singular
            (for example with lambda_val=0 and rank-deficient factors).
    """
    # Confidence minus one for every cell; CSC copy gives cheap column slices.
    confidence = sparse.csr_matrix(sparse_data * alpha_val)
    confidence_by_item = confidence.tocsc()

    user_size, item_size = confidence.shape

    # Random initial factors (users first, then items).
    X = np.random.normal(size=(user_size, features))
    Y = np.random.normal(size=(item_size, features))

    lI = lambda_val * np.eye(features)

    for iteration in range(iterations):
        print('iteration %d of %d' % (iteration + 1, iterations))

        # Update every user vector with the item factors held fixed.
        yTy = Y.T.dot(Y)
        for u in range(user_size):
            u_row = confidence[u, :].toarray().ravel()
            X[u] = _solve_factor(Y, yTy, u_row, lI)

        # X has just changed, so its Gram matrix must be computed here and
        # not before the user update (it used to be taken from the old X).
        xTx = X.T.dot(X)
        for item in range(item_size):
            i_col = confidence_by_item[:, item].toarray().ravel()
            Y[item] = _solve_factor(X, xTx, i_col, lI)

    return sparse.csr_matrix(X), sparse.csr_matrix(Y)

In [10]:
#raw2['user'] = raw2['user'].astype("category").cat.codes
# NOTE(review): no visible cell reads `iterations`; the implicit_als call
# further down passes its own iterations value explicitly.
iterations = 10

In [11]:
# Train the hand-written ALS on the user-by-mcat matrix: 20 latent features,
# 5 alternation rounds, confidence scale alpha = 40.
user_vecs1, item_vecs1 = implicit_als(data_sparse_new, iterations=5, features=20, alpha_val=40)

iteration 1 of 5
iteration 2 of 5
iteration 3 of 5
iteration 4 of 5
iteration 5 of 5


In [12]:
# Peek at the mcat_id column to pick an id for the similarity lookup.
data.mcat_id.head()

0    27
1    92
2    92
3    92
4    18
Name: mcat_id, dtype: int16

In [13]:
#------------------------------
# FIND SIMILAR MCATS
#------------------------------

# The mcat whose neighbours we want. The id is a category code, so it will
# change if the input data is sliced or replaced.
mcat_id = 92

# Latent vector of that mcat, transposed into a column.
item_vec = item_vecs1[mcat_id].T

# Dot product of every item vector with the chosen one, flattened to 1-D;
# the ten largest values (highest first) are the most similar mcats.
scores = item_vecs1.dot(item_vec).toarray().ravel()
top_10 = np.argsort(scores)[-10:][::-1]

# Translate each index back to its mcat value and pick up its score.
mcats = [
    item_lookup.loc[item_lookup.mcat_id == str(idx), 'mcat'].iloc[0]
    for idx in top_10
]
mcats_scores = [scores[idx] for idx in top_10]

similar = pd.DataFrame({'mcat': mcats, 'score': mcats_scores})

print(similar)

     mcat      score
0   25557  14.590245
1   95670   4.530865
2    5329   3.488501
3   95673   3.421736
4    9046   3.378059
5   20046   3.276832
6   95566   3.215481
7  180792   3.054726
8   95581   2.658631
9   34776   2.655455


In [15]:
# The user (numeric user_id code) we want to recommend mcats for
user_id = 11

#------------------------------
# GET ITEMS CONSUMED BY USER
#------------------------------

# Print the mcats this user already has a non-zero purchase count for.
# Column indices are cast to str because item_lookup.mcat_id holds strings.
consumed_idx = data_sparse_new[user_id,:].nonzero()[1].astype(str)
consumed_items = item_lookup.loc[item_lookup.mcat_id.isin(consumed_idx)]
print(consumed_items)


#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

def recommend(user_id, data_sparse_new, user_vecs1, item_vecs1, item_lookup, num_items=10):
    """Recommend unseen mcats for a user from trained factor matrices.

    Args:
        user_id (int): Row index of the user in ``data_sparse_new``.

        data_sparse_new (csr_matrix): User-by-item training matrix.

        user_vecs1 (csr_matrix): Trained users-by-features vectors (must be
            sparse: the score product is converted with ``.toarray()``).

        item_vecs1 (csr_matrix): Trained items-by-features vectors.

        item_lookup (pandas.DataFrame): Frame with a string ``mcat_id`` column
            and the matching ``mcat`` value.

        num_items (int): Maximum number of recommendations to return.

    Returns:
        recommendations (pandas.DataFrame): Columns ``mcat`` and ``score``,
        best first. Holds fewer than ``num_items`` rows when the user has
        fewer unseen items than that; consumed items are never returned.
    """
    # Flat array of the user's interactions; anything positive is consumed.
    user_interactions = data_sparse_new[user_id, :].toarray().reshape(-1)
    consumed = user_interactions > 0

    # Predicted preference: dot product of the user vector with every item.
    rec_vector = user_vecs1[user_id, :].dot(item_vecs1.T).toarray()

    # Scale the scores to [0, 1] so they are easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]

    # Consumed items get -inf so they sort last and can be cut off entirely.
    # Multiplying them by 0 (the old approach) made them tie with the
    # lowest-scoring unseen item and leak into the result whenever
    # num_items exceeded the number of unseen items.
    recommend_vector = np.where(consumed, -np.inf, rec_vector_scaled)
    n_candidates = min(max(num_items, 0), int((~consumed).sum()))

    # Indices of the best-scoring unseen items, highest score first.
    item_idx = np.argsort(recommend_vector)[::-1][:n_candidates]

    # item_lookup.mcat_id holds strings, hence str(idx).
    mcats = [
        item_lookup.mcat.loc[item_lookup.mcat_id == str(idx)].iloc[0]
        for idx in item_idx
    ]
    scores = [recommend_vector[idx] for idx in item_idx]

    recommendations = pd.DataFrame({'mcat': mcats, 'score': scores})

    return recommendations

# Generate and print the recommendations for the user_id chosen above,
# using the factors trained by implicit_als.
recommendations = recommend(user_id, data_sparse_new, user_vecs1, item_vecs1, item_lookup)
print(recommendations)

   mcat_id    mcat
1       92   25557
19     224  142414
20       1     210
21       6    2906
44     114   34776
     mcat     score
0    3640  1.000000
1   95596  0.950159
2   95569  0.910121
3   16636  0.885380
4     213  0.848514
5  132912  0.794544
6   16039  0.794363
7   39409  0.790214
8   95619  0.773000
9   13613  0.763179


In [16]:
# A faster approach: the same model trained with the `implicit` library

import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit # The Cython library

# Read the CSV again, keep the three columns we need and rename them.
raw_data = (
    pd.read_csv('subcat_378.csv')
    .loc[:, ['GLUSR', 'MAPPED_MCAT', 'PURCHASE_CNT']]
    .rename(columns={'GLUSR': 'user', 'MAPPED_MCAT': 'mcat', 'PURCHASE_CNT': 'purchase_cnt'})
)

# Discard rows with missing values and work on an independent copy.
data = raw_data.dropna().copy()

# Turn user and mcat into categoricals, then expose their integer codes as
# user_id / mcat_id.
data = data.assign(
    user=data['user'].astype("category"),
    mcat=data['mcat'].astype("category"),
)
data = data.assign(
    user_id=data['user'].cat.codes,
    mcat_id=data['mcat'].cat.codes,
)

# Two orientations of the same purchase counts: item-by-user is used to fit
# the model below, user-by-item is used when producing recommendations.
sparse_item_user = sparse.csr_matrix(
    (data['purchase_cnt'].astype(float), (data['mcat_id'], data['user_id']))
)
sparse_user_item = sparse.csr_matrix(
    (data['purchase_cnt'].astype(float), (data['user_id'], data['mcat_id']))
)

# ALS model with 20 factors, regularization 0.1 and 20 iterations.
model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=20,
)

# Confidence = purchase count scaled by alpha.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

# Train on the item-by-user confidence matrix.
model.fit(data_conf)


#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Look up the n_similar nearest neighbours of this mcat code.
mcat_id = 92
n_similar = 10

# Factor matrices learned by the model.
user_vecs = model.user_factors
item_vecs = model.item_factors

# Euclidean length of every item vector.
item_norms = np.sqrt(np.square(item_vecs).sum(axis=1))

# Dot product with the query vector divided by each item's own norm; the
# query's norm is divided out below, which completes the cosine similarity.
scores = item_vecs.dot(item_vecs[mcat_id]) / item_norms

# argpartition finds the n_similar best indices without a full sort; only
# those few are then ordered by descending similarity.
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(
    zip(top_idx, scores[top_idx] / item_norms[mcat_id]),
    key=lambda pair: -pair[1],
)

# Print the mcat value behind every neighbour index.
for idx, score in similar:
    print(data.mcat.loc[data.mcat_id == idx].iloc[0])


#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_items=10):
    """Recommend unseen mcats for a user from trained factor matrices.

    mcat values are looked up in the notebook-level ``data`` frame, which
    must hold matching ``mcat`` and ``mcat_id`` columns.

    Args:
        user_id (int): Row index of the user in ``sparse_user_item``.
        sparse_user_item (csr_matrix): User-by-item training matrix.
        user_vecs (csr_matrix): Users-by-features vectors (must be sparse:
            the score product is converted with ``.toarray()``).
        item_vecs (csr_matrix): Items-by-features vectors.
        num_items (int): Maximum number of recommendations to return.

    Returns:
        pandas.DataFrame: Columns ``mcat`` and ``score``, best first. Holds
        fewer than ``num_items`` rows when the user has fewer unseen items
        than that; consumed items are never returned.
    """
    # Flat array of the user's interactions; anything positive is consumed.
    user_interactions = sparse_user_item[user_id, :].toarray().reshape(-1)
    consumed = user_interactions > 0

    # Predicted preference for every item, scaled to [0, 1].
    rec_vector = user_vecs[user_id, :].dot(item_vecs.T).toarray()

    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]

    # Consumed items get -inf so they sort last and are cut off entirely.
    # Zeroing them (the old approach) let them leak into the result whenever
    # num_items exceeded the number of unseen items.
    recommend_vector = np.where(consumed, -np.inf, rec_vector_scaled)
    n_candidates = min(max(num_items, 0), int((~consumed).sum()))

    item_idx = np.argsort(recommend_vector)[::-1][:n_candidates]

    mcats = [data.mcat.loc[data.mcat_id == idx].iloc[0] for idx in item_idx]
    scores = [recommend_vector[idx] for idx in item_idx]

    recommendations = pd.DataFrame({'mcat': mcats, 'score': scores})

    return recommendations

# Get the trained user and item vectors. They are converted to csr matrices
# because recommend() calls .toarray() on their product.
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for the user with numeric id 11
user_id = 11

recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs)

print(recommendations)

100%|████████████████████████████████████████████████████████████████████████████████| 20.0/20 [00:00<00:00, 76.55it/s]


25557
9046
11347
34776
95670
145249
74236
101712
3722
184191
    mcat     score
0  53759  0.825571
1  95670  0.789236
2  11845  0.747133
3    212  0.726179
4  10057  0.709587
5   9046  0.696731
6   5329  0.676408
7  16636  0.674436
8   3640  0.642622
9  68632  0.635805


In [19]:
# Load the cat_13 dataset; this replaces the frame previously held in raw_data.
raw_data = pd.read_csv('cat_13.csv')

In [22]:
# Count distinct sellers (GLUSR) and distinct categories (MAPPED_MCAT) in the
# raw frame. Needs the original column names, so it must run before the
# columns are renamed.
print('Total Unique Seller:',len(raw_data.GLUSR.unique()))
print('Total Unique MCAT count :', len(raw_data.MAPPED_MCAT.unique()))

Total Unique Seller: 4851
Total Unique MCAT count : 4066


In [23]:
# Using the implicit library on the cat_13 data

import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

# Build the working frame from the already-loaded cat_13 raw_data.
# raw_data itself is left untouched: overwriting it with the renamed columns
# made this cell fail with a KeyError when run a second time.
#raw_data = pd.read_csv('cat_13.csv')
data = (
    raw_data
    .loc[:, ['GLUSR', 'MAPPED_MCAT', 'PURCHASE_CNT']]
    .rename(columns={'GLUSR': 'user', 'MAPPED_MCAT': 'mcat', 'PURCHASE_CNT': 'purchase_cnt'})
    .dropna()   # drop rows with a missing value in any of the three columns
    .copy()
)

# Create a numeric user_id and mcat_id column from the category codes
data['user'] = data['user'].astype("category")
data['mcat'] = data['mcat'].astype("category")
data['user_id'] = data['user'].cat.codes
data['mcat_id'] = data['mcat'].cat.codes

# Two orientations of the same purchase counts: item-by-user is used to fit
# the model, user-by-item is passed to model.recommend later.
sparse_item_user = sparse.csr_matrix((data['purchase_cnt'].astype(float), (data['mcat_id'], data['user_id'])))
sparse_user_item = sparse.csr_matrix((data['purchase_cnt'].astype(float), (data['user_id'], data['mcat_id'])))

# Initialize the als model and fit it using the sparse item-user matrix
# NOTE(review): which orientation fit() expects depends on the installed
# `implicit` release - confirm against the pinned version.
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Calculate the confidence by multiplying the counts by our alpha value.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

# Fit the model
model.fit(data_conf)


#-----------------------------------------
# RECOMMENDATIONS FOR EVERY USER
#-----------------------------------------

Ids = list(data.user_id.unique())

# user_id / mcat_id are the category codes of data.user / data.mcat, so the
# original value for code k is simply categories[k]. This avoids scanning the
# whole frame once per recommended item.
mcat_names = data.mcat.cat.categories
user_names = data.user.cat.categories

# Collect one small frame per user and concatenate once at the end; growing
# R1 with DataFrame.append inside the loop is quadratic and that method no
# longer exists in pandas 2.x.
user_frames = []

for user_id in Ids:
    # Each element of `recommended` is unpacked as an (item index, score) pair.
    # NOTE(review): the signature and return shape of recommend() depend on
    # the installed `implicit` release - confirm against the pinned version.
    recommended = model.recommend(user_id, sparse_user_item)

    mcats = []
    scores = []

    for idx, score in recommended:
        mcats.append(mcat_names[idx])
        scores.append(score)

    # Note: the 'mcat_id' and 'user_id' columns hold the original mcat / user
    # values here, not the numeric codes.
    recommendations = pd.DataFrame({'mcat_id': mcats, 'score': scores})
    recommendations['user_id'] = user_names[user_id]

    user_frames.append(recommendations)

# Per-user row labels are kept (no ignore_index), as before.
if user_frames:
    R1 = pd.concat(user_frames)
else:
    R1 = pd.DataFrame(columns=['mcat_id','score','user_id'])

print(R1.shape)


100%|████████████████████████████████████████████████████████████████████████████████| 20.0/20 [00:01<00:00, 17.79it/s]


(48510, 3)


In [26]:
# Reorder the columns so that user_id comes first.
R2 = R1[['user_id','mcat_id','score']]

In [32]:
# Order the rows by user_id so each user's recommendations sit together.
R2 = R2.sort_values(by='user_id')

In [35]:
# Write the recommendations to an Excel file in the working directory,
# without the index column.
R2.to_excel('user_to_mcat_recommen_ALS_grp_id13.xlsx',index=False)

In [None]:
# Rebuild R1 from scratch for every user in Ids.
# This cell used to append onto the existing R1 (duplicating every row if it
# was run after the cell that first built R1) and relied on DataFrame.append,
# which no longer exists in pandas 2.x. It is now idempotent.
mcat_names = data.mcat.cat.categories   # category code -> original mcat value
user_names = data.user.cat.categories   # category code -> original user value

user_frames = []

for user_id in Ids:
    # Each element of `recommended` is unpacked as an (item index, score) pair.
    # NOTE(review): the signature and return shape of recommend() depend on
    # the installed `implicit` release - confirm against the pinned version.
    recommended = model.recommend(user_id, sparse_user_item)

    mcats = []
    scores = []

    for idx, score in recommended:
        mcats.append(mcat_names[idx])
        scores.append(score)

    # Note: the 'mcat_id' and 'user_id' columns hold the original mcat / user
    # values here, not the numeric codes.
    recommendations = pd.DataFrame({'mcat_id': mcats, 'score': scores})
    recommendations['user_id'] = user_names[user_id]

    user_frames.append(recommendations)

if user_frames:
    R1 = pd.concat(user_frames)
else:
    R1 = pd.DataFrame(columns=['mcat_id','score','user_id'])

print(R1.shape)


In [37]:
# Check which class the fitted model object is before pickling it.
type(model)

implicit.als.AlternatingLeastSquares

In [18]:
# Show the first ten rows of R1 after sorting by user_id.
R1.sort_values(by='user_id').head(10)

Unnamed: 0,mcat_id,score,user_id
0,15862,1.317767,1977
8,98602,0.740766,1977
7,23434,0.771373,1977
6,23435,0.922006,1977
9,33887,0.725858,1977
4,97,0.988622,1977
3,25564,1.027371,1977
2,10281,1.030788,1977
5,16423,0.971073,1977
1,6241,1.16635,1977


In [38]:
# Saving Model to the file and loading the same for further use:

import pickle

In [40]:
# Save model to the drive. The context manager closes (and flushes) the file
# even if pickling fails; the bare open() used before was never closed.
# NOTE(review): absolute, machine-specific path - consider a configurable
# output directory.
with open('c:/Users/imart/Documents/Collaborative/Model_ALS_grp_13.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)