# Hybrid Filtering 

### LightFM package with Users Features
###### Recent research has demonstrated that hybrid filtering models, combining collaborative and content-based filtering could be more effective in some cases. This method can also overcome some recommendation problems such as the "cold start" or "sparsity" problem. 

In [6]:
import pandas as pd
import numpy as np
import random
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from lightfm import LightFM
from lightfm.evaluation import auc_score
import itertools

In [7]:
#Import Data 
df=pd.read_csv("C:/Users/alorenzodebrionne/Documents/Python/purchased_data.csv")
users_features = pd.read_csv("C:/Users/alorenzodebrionne/Documents/Python/user_data.csv")

#### To ensure statistical significance users with less than 5 items, and items with less than 10 ratings are excluded.

In [28]:
#Filter on Items with have at least 10 users
item_count = df.groupby(['ItemID']).size().reset_index(name='counts').sort_values(['counts'], ascending=False)
list_items = item_count[item_count['counts']>10]

# Filter on Users with at least 5 items
user_count = df.groupby(['userID']).size().reset_index(name='counts').sort_values(['counts'], ascending=False)
list_users = user_count[user_count['counts']>5]

list_items= list_items['ItemID']
list_users= list_users['userID']


df= df[df['ItemID'].isin(list_items)]
df= df[df['userID'].isin(list_users)]

users_features = users_features[users_features['userID'].isin(list_users)]


print("UserID:", df.userID.nunique())
print("ItemID:", df.ItemID.nunique())

UserID: 1112
ItemID: 53


#### We convert our User-Item Matrix to binary matrix. NB: you can create a Weighted Matrix and use it during the LightFM model

In [29]:
df_pivot = df.pivot(index='userID', columns='ItemID').nb_purchased
userID = pd.DataFrame(df_pivot.index)
# Replace all zeros per NaN
df_pivot=df_pivot.fillna(0)
# Binary Interactions 
df_pivot[df_pivot != 0] =1

### Transform your Matrices to a compress a sparse row format
##### Users Features & Items Features needs to contain ONLY NUMERICS

In [30]:
#Count number of Products and Users
n_items= len(df_pivot.columns)
n_users =len(df_pivot)

In [32]:
users_features=pd.get_dummies(users_features)

In [33]:
user_to_item_matrix= sparse.csr_matrix(df_pivot.values)
user_property_matrix = sparse.csr_matrix(users_features.values)

## Create the train set and test set
### Process for Data Masking
##### Randomly assign 0 values to the training set - Test set is a copy of the DataFrame
###### Keep the list of users that were altered

In [34]:
import random
def make_train(ratings, pct_test = 0.25):
    
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    random.seed(123) # Set the random seed to zero for reproducibility
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered  

In [35]:
X_train, X_test, item_users_altered = make_train(user_to_item_matrix, pct_test = 0.25)

## LightFM Models

### Tune Hyperparameters

In [37]:
def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """

    while True:
        yield {
            "no_components": np.random.randint(16, 64),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 50),
        }


def random_search(train, test, num_samples=10, num_threads=1):

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs)

        score = auc_score(model, X_test).mean()

        hyperparams["num_sample_hyperparametersepochs"] = num_epochs

        yield (score, hyperparams, model)


if __name__ == "__main__":

    (score, hyperparams, model) = max(random_search(X_train, X_test, num_threads=2), key=lambda x: x[0])

    print("Best score {} at {}".format(score, hyperparams))

random_search(X_train,X_test,num_samples=10, num_threads=1)

Best score 0.9428374767303467 at {'learning_rate': 0.028546874915723632, 'user_alpha': 9.339008221479138e-10, 'loss': 'warp', 'num_sample_hyperparametersepochs': 45, 'item_alpha': 5.2037162041155225e-09, 'learning_schedule': 'adadelta', 'max_sampled': 5, 'no_components': 59}


<generator object random_search at 0x000001BD2CE7B1A8>

## Predictions 

In [39]:
#Hyperparameters optimized
no_comp, lr, ep, max_sam = 59, 0.01746475831477904,45 , 5
ua, ia = 9.540255960433563e-09, 6.794510538622656e-09

model=LightFM(no_components=no_comp,learning_rate=lr, loss='warp', learning_schedule = 'adadelta', max_sampled = max_sam, user_alpha = ua, item_alpha = ia)

model.fit(X_train,epochs=ep,verbose=True,user_features=user_property_matrix)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44


<lightfm.lightfm.LightFM at 0x1bd2cf14f98>

# Questions : http://lyst.github.io/lightfm/docs/