# Trendyol Bootcamp Capstone Project - Buğra Taksuk
- Hybrid recommender system: Hybrid recommender system is a special type of recommender system that combines both content and collaborative filtering method. Combining collaborative filtering and content-based filtering could be more effective in some cases. Hybrid approaches can be implemented in several ways: by making content-based and collaborative-based predictions separately and then combining them; by adding content-based capabilities to a collaborative-based approach (and vice versa). Several studies empirically compare the performance of the hybrid with pure collaborative and content-based methods and demonstrate that hybrid methods can provide more accurate recommendations than pure approaches. These methods can also be used to overcome some of the common problems in recommender systems such as cold start and the sparsity problem.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:

items = pd.read_csv('articles.csv',dtype={'article_id': str})
customers = pd.read_csv('customers.csv')
transactions = pd.read_csv('transactions.csv',dtype={'article_id': str}, parse_dates=['t_dat'])


# User Clustering 

In [3]:
def make_cluster(transactions):
    import random
    k = 5
    customer_count = transactions.groupby('customer_id', as_index=False)[['price']].count().sort_index()
    not_cold_users = customer_count[customer_count.price > k].index
    customer_last_purchase = transactions[~transactions.customer_id.isin(not_cold_users)].groupby('customer_id', as_index=False)[['t_dat']].last()
    cold_inactive_users = customer_last_purchase[customer_last_purchase.t_dat < "2020-06-01"].index
    cold_active_users = customer_last_purchase[customer_last_purchase.t_dat > "2020-06-01"].index
    customer_last_purchase[customer_last_purchase.t_dat < "2020-06-01"]
    
    return cold_inactive_users, cold_active_users , not_cold_users

# Creating Features and R matrix

In [4]:
def create_features(customers, transactions, items):
    from scipy.sparse import csr_matrix
    from scipy.sparse import coo_matrix

    # find the count of which user has bought which item
    

    
    transactions = transactions[transactions.customer_id.isin(customers.customer_id)]
    
    transactions['count'] = 1
    item_features = transactions.groupby(['customer_id', 'article_id'])['count'].sum().reset_index()



    # add sales_chaneel_id, price and t_dat to the transactions_grouped dataframe from transactions dataframe
    item_features = item_features.merge(transactions, on=['customer_id', 'article_id'], how='left')

    #make t_dat to datetime
    item_features['t_dat'] = pd.to_datetime(item_features['t_dat'])

    # find time difference last transaction date and t_dat
    item_features['time_diff'] = item_features['t_dat'].max() - item_features['t_dat']
    item_features['time_diff'] = item_features['time_diff'].dt.days
    # add  colour_group_code  ,index_code, index_group_no, section_no, garment_group_no to the transactions_grouped dataframe from items dataframe
    item_features = item_features.merge(items[['article_id','colour_group_code','index_code','index_group_no', 'section_no', 'garment_group_no']])



    cust_features = item_features.drop(columns = ['colour_group_code','index_code', 'index_group_no', 'section_no', 'garment_group_no',])
    cust_features = cust_features.merge(customers, on = 'customer_id')
    
    item_features = pd.get_dummies(item_features, columns=['sales_channel_id', 'colour_group_code'
    , 'index_code', 'index_group_no', 'section_no', 'garment_group_no' ],drop_first = True) 
    
    cust_features = pd.get_dummies(cust_features, columns = ['club_member_status', 'fashion_news_frequency','sales_channel_id'], drop_first = True)
    cust_features['age'].fillna(cust_features['age'].mean(), inplace = True)
    
    cust_features = cust_features.drop(columns = ['article_id']).drop_duplicates(subset=['customer_id'])
    item_features = item_features.drop(columns = ['customer_id']).drop_duplicates(subset=['article_id'])
    
    #--------------------------------------------------

    
    ALL_USERS = cust_features['customer_id'].unique().tolist()
    ALL_ITEMS = item_features['article_id'].unique().tolist()

    user_ids = dict(list(enumerate(ALL_USERS)))
    item_ids = dict(list(enumerate(ALL_ITEMS)))

    user_map = {u: uidx for uidx, u in user_ids.items()}
    item_map = {i: iidx for iidx, i in item_ids.items()}
    

    

    transactions['user_id'] = transactions['customer_id'].map(user_map)
    transactions['item_id'] = transactions['article_id'].map(item_map)
    
    
    
    
    #----------------------------------------------------------
    
    

    
    validation_cut = '2020-09-15'
    cust_features_train = cust_features[cust_features['t_dat'] < validation_cut]
    cust_features_train = cust_features[cust_features['t_dat'] > '2019-09-15']
    cust_features_test = cust_features[cust_features['t_dat'] >= validation_cut]
    
    validation_cut = '2020-09-15'
    item_features_train = item_features[item_features['t_dat'] < validation_cut]
    item_features_train = item_features[item_features['t_dat'] > '2019-09-15']
    item_features_test = item_features[item_features['t_dat'] >= validation_cut]
    
    validation_cut = '2020-09-15'
    transactions_t = transactions[transactions['t_dat'] < validation_cut]
    transactions_t = transactions[transactions['t_dat'] > '2019-09-15']
    transactions_v = transactions[transactions['t_dat'] >= validation_cut]
    
    #-------------------------------------------------------------------------
    
    
    row = transactions_t['user_id'].values
    col = transactions_t['item_id'].values
    data = np.ones(transactions_t.shape[0])

    coo_t = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    
    row = transactions_v['user_id'].values
    col = transactions_v['item_id'].values
    data = np.ones(transactions_v.shape[0])
    coo_v = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    
    

    cust_features_train = cust_features_train.drop_duplicates(subset=['customer_id'])
    cust_features_test = cust_features_test.drop_duplicates(subset=['customer_id'])
    
    item_features_train = item_features_train.drop_duplicates(subset=['article_id'])
    item_features_test = item_features_test.drop_duplicates(subset=['article_id'])
    
    

    item_features_csr_train =  csr_matrix(item_features_train.drop(columns = ['count_x', 'count_y','t_dat','article_id']).values.astype(np.float))
    item_features_csr_test =  csr_matrix(item_features_test.drop(columns = ['count_x', 'count_y','t_dat','article_id']).values.astype(np.float))
    
    cust_features_csr_train = csr_matrix(cust_features_train.drop(columns = ['count_x', 'count_y', 'customer_id','t_dat']).values.astype(np.float))
    cust_features_csr_test = csr_matrix(cust_features_test.drop(columns = ['count_x', 'count_y', 'customer_id','t_dat']).values.astype(np.float))
    
    cust_features_csr = coo_matrix(cust_features.drop(columns = ['count_x', 'count_y', 'customer_id','t_dat']).values.astype(np.float))
    item_features_csr =  coo_matrix(item_features.drop(columns = ['count_x', 'count_y','t_dat','article_id']).values.astype(np.float))
    

    return  item_features, cust_features, item_features_csr, cust_features_csr, coo_t, coo_v , transactions_t,transactions_v
    

# Partioning Data

In [5]:
def generate_dataset(rate, transactions, customers):
    import random
    new_not_cold_users = random.sample(not_cold_users.to_list(), round(len(not_cold_users) * rate))
    new_cold_inactive_users = random.sample(cold_inactive_users.to_list(), round(len(cold_inactive_users) * rate))
    new_cold_active_users = random.sample(cold_active_users.to_list(), round(len(cold_active_users) * rate))
    
    new_users = new_not_cold_users + new_cold_inactive_users + new_cold_active_users


    new_customer = customers.iloc[new_users]
    
    return  new_customer

In [6]:
cold_inactive_users, cold_active_users , not_cold_users = make_cluster(transactions)

In [7]:
small_customers = generate_dataset(0.01, transactions, customers)

In [8]:
import warnings
warnings.filterwarnings("ignore")
item_features, cust_features, item_features_csr, cust_features_csr, coo_t, coo_v , trans_t,trans_v = create_features(small_customers, transactions, items)

In [9]:
coo_t.shape

(498061, 98484)

In [10]:
coo_v.shape

(498061, 98484)

In [11]:
item_features_csr.shape, cust_features_csr.shape

((98484, 141), (498061, 9))

# Model Creation 

 LightFM is a hybrid matrix factorisation model representing users and items as linear combinations of their content features’ latent factors. The model outperforms both collaborative and content-based models in cold-start or sparse interaction data scenarios (using both user and item metadata), and performs at least as well as a pure collaborative matrix factorisation model where interaction data is abundant.

In LightFM, like in a collaborative filtering model, users and items are represented as latent vectors (embeddings). However, just as in a CB model, these are entirely defined by functions (in this case, linear combinations) of embeddings of the content features that describe each product or user.

For example, if the movie ‘Wizard of Oz’ is described by the following features: ‘musical fantasy’, ‘Judy Garland’, and ‘Wizard of Oz’, then its latent representation will be given by the sum of these features’ latent representations. In doing so, LightFM unites the advantages of contentbased and collaborative recommenders. [6]

How LightFM works: The LightFM paper describes beautifully how lightFM works. To put it simply in words, lightFM model learns embeddings (latent representations in a high-dimensional space) for users and items in a way that encodes user preferences over items. When multiplied together, these representations produce scores for every item for a given user; items scored highly are more likely to be interesting to the user [5].

The user and item representations are expressed in terms of representations of their features: an embedding is estimated for every feature, and these features are then summed together to arrive at representations for users and items [5].

The latent representation of user u is given by the sum of its features’ latent vectors:  qu=∑j∈fu .

And same for the items:  pi=∑j∈fi 
The model’s prediction for user u and item i is then given by the dot product of user and item representations, adjusted by user and item feature biases:  rui^=f(qu⋅pi+bu+bi) 
This is just a general idea of the model. Please read the lightFM model paper more in depth knowledge.

Why LightFM:

In both cold-start and low density scenarios, LightFM performs at least as well as pure content-based models, substantially outperforming them when either (1) collaborative information is available in the training set or (2) user features are included in the model. This is really useful for our CareerVillage recommendation system beacause we will have a lot of new questions and students that makes a very good environment for cold start problem.

When collaborative data is abundant (warm-start, dense user-item matrix), LightFM performs at least as well as the MF model.

Embeddings produced by LightFM encode important semantic information about features, and can be used for related recommendation tasks such as tag recommendations. This is also very important for our problem. Because there are useful for finding similar tags so that model can recommend questions that has similiar tags to professionals tags.

In [12]:
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM

model = LightFM(loss='warp',
                random_state=2022,
                learning_rate=0.90,
                user_alpha=0.0005,
                item_alpha=0.0005)
model = model.fit(coo_t,
                  user_features=cust_features_csr,
                  item_features=item_features_csr,
                  epochs=10,
                  num_threads=16, verbose=False)

- WARP: Weighted Approximate-Rank Pairwise 2 loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.
- Measure the precision at k metric for a model: the fraction of known positives in the first k positions of the ranked list of results. A perfect score is 1.0.

In [20]:

user_id = '0015f37f752a41a75c3be6f3f92deedc4c87d039f1758ec41f54f8c7f4729793'

idx = trans_v[trans_v['customer_id'] =='0015f37f752a41a75c3be6f3f92deedc4c87d039f1758ec41f54f8c7f4729793']['user_id'].values[0]
idx = int(idx)
n_users, n_items = coo_v.shape

scores = pd.Series(model.predict(idx,np.arange(n_items), user_features=cust_features_csr, item_features=item_features_csr))
scores.index = item_features.article_id.values


In [21]:
scores[:5]

176209023    1.738753e+19
568601006   -5.680094e+19
568601043   -5.158685e+20
607642008   -3.261786e+20
625548001    4.971098e+20
dtype: float32

In [22]:
items[items['article_id'].isin(scores[:5].index.values)]

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
99,176209023,176209,Mr Harrington w/hood,308,Hoodie,Garment Upper body,1010016,Solid,9,Black,...,Jacket Street,F,Menswear,3,Menswear,31,Mens Outerwear,1007,Outdoor,"Short, padded jacket with a jersey-lined hood ..."
16003,568601006,568601,Mariette Blazer,264,Blazer,Garment Upper body,1010016,Solid,9,Black,...,Suit,A,Ladieswear,1,Ladieswear,11,Womens Tailoring,1008,Dressed,Fitted jacket in woven fabric with notch lapel...
16023,568601043,568601,Mariette Blazer,264,Blazer,Garment Upper body,1010026,Other structure,93,Dark Green,...,Suit,A,Ladieswear,1,Ladieswear,11,Womens Tailoring,1008,Dressed,Fitted jacket in woven fabric with notch lapel...
23996,607642008,607642,The Firm (1),259,Shirt,Garment Upper body,1010017,Stripe,9,Black,...,Blouse,A,Ladieswear,1,Ladieswear,11,Womens Tailoring,1010,Blouses,Top in a crêpe weave with a V-shaped opening a...
29516,625548001,625548,BB Chris puff jkt TP,262,Jacket,Garment Upper body,1010016,Solid,73,Dark Blue,...,Young Boy Outdoor,I,Children Sizes 134-170,4,Baby/Children,45,Kids Outerwear,1007,Outdoor,"Padded jacket with a detachable hood, stand-up..."
