In [2]:
from fastai.collab import *
from fastai.tabular import *

In [3]:
from lightfm import LightFM



In [4]:
import pandas as pd

In [5]:
df_train = pd.read_csv('train.csv')
df_train.head(13)

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933
5,4576_6,4576,6,CI25135
6,4576_7,4576,7,CI23975
7,4576_8,4576,8,CI25126
8,4576_9,4576,9,CI24915
9,4576_10,4576,10,CI24957


In [6]:
df_train.shape

(903916, 4)

In [7]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,CI23855
1,4577_2,4577,2,CI23933
2,4577_3,4577,3,CI24917
3,4577_4,4577,4,CI24915
4,4577_5,4577,5,CI23714


In [8]:
df_test.shape

(397320, 4)

In [9]:
train = df_train.append(df_test)
train.shape

(1301236, 4)

In [10]:
train['rating']=1
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge,rating
0,4576_1,4576,1,CI23714,1
1,4576_2,4576,2,CI23855,1
2,4576_3,4576,3,CI24917,1
3,4576_4,4576,4,CI23663,1
4,4576_5,4576,5,CI23933,1


In [11]:
challange = pd.read_csv('challenge_data.csv')
challange.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [55]:
challange['category_id'] = challange['category_id'].astype('str')
challange.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5606 entries, 0 to 5605
Data columns (total 9 columns):
challenge_ID            5606 non-null object
programming_language    5606 non-null int64
challenge_series_ID     5594 non-null object
total_submissions       5254 non-null float64
publish_date            5606 non-null object
author_ID               5567 non-null object
author_gender           5509 non-null object
author_org_ID           5358 non-null object
category_id             5606 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 394.2+ KB


In [56]:
challange.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, KBinsDiscretizer

numeric_features = ['total_submissions']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('bin', KBinsDiscretizer(strategy='quantile',encode='ordinal'))])

categorical_features = ['challenge_series_ID', 'author_ID', 'author_org_ID','category_id','author_gender']
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing'))])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor)])
challange_transform = clf.fit_transform(challange)

In [75]:
new_cols = ['total_submissions_bin', 'challenge_series_ID', 'author_ID', 'author_org_ID', 'category_id','author_gender']
cols = ['challenge_ID','programming_language']
transformed_df = pd.DataFrame(challange_transform, columns=new_cols)
challange_df = pd.concat([challange[cols], transformed_df], axis = 1)
challange_df.head()

Unnamed: 0,challenge_ID,programming_language,total_submissions_bin,challenge_series_ID,author_ID,author_org_ID,category_id,author_gender
0,CI23478,2,0,SI2445,AI563576,AOI100001,,M
1,CI23479,2,0,SI2435,AI563577,AOI100002,32.0,M
2,CI23480,1,0,SI2435,AI563578,AOI100003,,M
3,CI23481,1,3,SI2710,AI563579,AOI100004,70.0,M
4,CI23482,2,2,SI2440,AI563580,AOI100005,,M


In [76]:
dict_a = {0:'bin_1', 1:'bin_2',  2:'bin_3',  3:'bin_4', 4:'bin_5'}
dict_b = {1:'lang_1', 2:'lang_2',  3:'lang_3'}

challange_df['total_submissions_bin'] = challange_df['total_submissions_bin'].map(dict_a) 
challange_df['programming_language'] = challange_df['programming_language'].map(dict_b) 
challange_df.head()

Unnamed: 0,challenge_ID,programming_language,total_submissions_bin,challenge_series_ID,author_ID,author_org_ID,category_id,author_gender
0,CI23478,lang_2,bin_1,SI2445,AI563576,AOI100001,,M
1,CI23479,lang_2,bin_1,SI2435,AI563577,AOI100002,32.0,M
2,CI23480,lang_1,bin_1,SI2435,AI563578,AOI100003,,M
3,CI23481,lang_1,bin_4,SI2710,AI563579,AOI100004,70.0,M
4,CI23482,lang_2,bin_3,SI2440,AI563580,AOI100005,,M


In [261]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity

def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing interaction_index as key and user_id as value
    '''
    user_id = list(interactions.index)
    user_dict = {}
    counter = 0 
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict
    
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item 
    Expected Output - 
        - item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_dict ={}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i,id_col])] = df.loc[i,name_col]
    return item_dict

def create_feature_dict(dataframe, features_name):
    """
    Generate features list for mapping 

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    features_name : List
        List of feature columns name avaiable in dataframe. 
        
    Returns
    -------
    List of all features for mapping 
    """
    features_dict ={}
    features = dataframe[features_name].apply(lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True).unique()
    for feature_index, feature_id in enumerate(features):
        features_dict[feature_index] = feature_id
    return features_dict

def create_feature_interaction(dataframe, features_name, product_id):
    """
    Generate features list for mapping 

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    features_name : List
        List of feature columns name avaiable in dataframe. 
        
    Returns
    -------
    List of all features for mapping 
    """
    features = dataframe[features_name].apply(lambda x: ','.join(x.map(str)), axis=1)
    features_prod = pd.DataFrame(zip(dataframe[product_id],features),columns=[product_id,'feature'])
    features_prod = pd.DataFrame(features_prod['feature'].str.split(',').tolist(), index=dataframe[product_id]).stack().reset_index([0, product_id])
    features_prod.columns = [product_id, 'feature']
    features_prod['feature_count'] = 1
    return features_prod

def runMF(interactions, user_features=None, item_features=None, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
    Expected Output  - 
        - Model - Trained model
    '''
    
    #x = sparse.csr_matrix(interactions.values)
    
    #if isinstance(user_features, pd.DataFrame):
        #user_features = sparse.csr_matrix(user_features.values)
        
    #if isinstance(item_features, pd.DataFrame):
        #item_features = sparse.csr_matrix(item_features.values)
        
    model = LightFM(no_components= n_components, loss=loss,k=k, random_state=2019)
    model.fit(interactions,user_features=user_features,item_features=item_features,epochs=epoch,num_threads = n_jobs)
    return model

def sample_recommendation_user(model, interactions, user_id, user_dict, item_dict, user_features=None, item_features=None, 
                               threshold = 0, nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed Expected Output - 
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items),user_features=user_features,item_features=item_features))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index) \
								 .sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1
    return return_score_list
    

def sample_recommendation_item(model,interactions,item_id,user_dict,item_dict,number_of_user):
    '''
    Funnction to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - item_id = item ID for which we need to generate recommended users
        - user_dict =  Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - number_of_user = Number of users needed as an output
    Expected Output -
        - user_list = List of recommended users 
    '''
    n_users, n_items = interactions.shape
    x = np.array(interactions.columns)
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(x.searchsorted(item_id),n_users)))
    user_list = list(interactions.index[scores.sort_values(ascending=False).head(number_of_user).index])
    return user_list 


def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Function to create item-item distance embedding matrix
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
    '''
    df_item_norm_sparse = sparse.csr_matrix(model.item_embeddings)
    similarities = cosine_similarity(df_item_norm_sparse)
    item_emdedding_distance_matrix = pd.DataFrame(similarities)
    item_emdedding_distance_matrix.columns = interactions.columns
    item_emdedding_distance_matrix.index = interactions.columns
    return item_emdedding_distance_matrix

def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
        - item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
    Expected Output -
        - recommended_items = List of recommended items
    '''
    recommended_items = list(pd.Series(item_emdedding_distance_matrix.loc[item_id,:]. \
                                  sort_values(ascending = False).head(n_items+1). \
                                  index[1:n_items+1]))
    if show == True:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        counter = 1
        for i in recommended_items:
            print(str(counter) + '- ' +  item_dict[i])
            counter+=1
    return recommended_items


In [163]:
cols = ['programming_language',
        'total_submissions_bin',
        'challenge_series_ID',
        'author_ID',
        'author_org_ID',
        'category_id',
        'author_gender']
features_dict = create_feature_dict(challange_df,cols)

In [181]:
col = 'challenge_ID'
prod_feature_interaction = create_feature_interaction(challange_df,cols,col)
prod_feature_interaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39242 entries, 0 to 39241
Data columns (total 3 columns):
challenge_ID     39242 non-null object
feature          39242 non-null object
feature_count    39242 non-null int64
dtypes: int64(1), object(2)
memory usage: 919.8+ KB


In [88]:
interactions = create_interaction_matrix(df = train,
                                         user_col = 'user_id',
                                         item_col = 'challenge',
                                         rating_col = 'rating')
interactions.shape

(109264, 5502)

In [183]:
prod_feature_ineraction_m = create_interaction_matrix(df = prod_feature_interaction,
                                         user_col = 'challenge_ID',
                                         item_col = 'feature',
                                         rating_col = 'feature_count')
prod_feature_ineraction_m.shape

(5606, 5842)

In [96]:
user_dict = create_user_dict(interactions=interactions)

In [95]:
movies_dict = create_item_dict(df = challange, id_col = 'challenge_ID', name_col = 'challenge_ID')

In [238]:
mf_model = runMF(interactions = sparse.csr_matrix(interactions.values),
                 user_features=None, 
                 item_features=sparse.csr_matrix(prod_feature_ineraction_m.values),
                 #item_features=None,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)

In [241]:
rec_list = sample_recommendation_user(model = mf_model, 
                                      interactions = interactions, 
                                      user_features=None,
                                      #item_features=None,
                                      item_features=sparse.csr_matrix(prod_feature_ineraction_m),
                                      user_id = 4576, 
                                      user_dict = user_dict,
                                      item_dict = movies_dict, 
                                      threshold = 0,
                                      nrec_items = 3)

Known Likes:
1- CI25135
2- CI25126
3- CI24958
4- CI24957
5- CI24917
6- CI24915
7- CI23975
8- CI23933
9- CI23855
10- CI23714
11- CI23691
12- CI23667
13- CI23663

 Recommended Items:
1- CI24530
2- CI23848
3- CI23746


In [242]:
rec_list

['CI24530', 'CI23848', 'CI23746']

In [243]:
test_user = df_test.iloc[:,1].unique()
len(test_user)

39732

In [244]:
predictions = []

for i in range(0,len(test_user)):
    u = test_user[i]
    a = sample_recommendation_user(model = mf_model,
                                   interactions = interactions, 
                                   user_id = u, 
                                   user_dict = user_dict,
                                   item_dict = movies_dict, 
                                   threshold = 0, 
                                   nrec_items = 3,
                                   show = False)
    for j in range(0,len(a)):
        predictions.append(dict({'user_id' : u, 'challenge': a[j]}))
        
test_preds = pd.DataFrame(predictions)

In [245]:
test_preds.head()

Unnamed: 0,challenge,user_id
0,CI23970,4577
1,CI24843,4577
2,CI24050,4577
3,CI23970,4578
4,CI24843,4578


In [246]:
test_preds.to_csv('preds_lfm_4.csv')

In [255]:
from lightfm import LightFM
from lightfm import cross_validation
train_interactions, test_interactions = cross_validation.random_train_test_split(sparse.csr_matrix(interactions.values), 
                                                                                 random_state=np.random.RandomState(2019))

In [263]:
mf_model_1 = runMF(interactions = train_interactions,
                 user_features=None, 
                 #item_features=prod_feature_ineraction_m,
                 item_features=None,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)

In [265]:
from lightfm.evaluation import auc_score
score = auc_score(mf_model_1, 
                  train_interactions, 
                  item_features=None, 
                  user_features=None, 
                  num_threads=4).mean()

In [267]:
score_test = auc_score(mf_model_1, 
                  test_interactions, 
                  item_features=None, 
                  user_features=None, 
                  num_threads=4).mean()

In [268]:
print(score,score_test)

0.99792606 0.984387


In [270]:
mf_model_2 = runMF(interactions = train_interactions,
                 user_features=None, 
                 item_features=sparse.csr_matrix(prod_feature_ineraction_m.values),
                 #item_features=None,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)

In [272]:
from lightfm.evaluation import auc_score
score = auc_score(mf_model_2, 
                  train_interactions, 
                  item_features=sparse.csr_matrix(prod_feature_ineraction_m.values), 
                  user_features=None, 
                  num_threads=4).mean()

In [273]:
score_test = auc_score(mf_model_2, 
                  test_interactions, 
                  item_features=sparse.csr_matrix(prod_feature_ineraction_m.values),
                  user_features=None, 
                  num_threads=4).mean()

In [274]:
print(score,score_test)

0.9962991 0.9721913
