In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings("ignore")

In [2]:
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall, diversity

### Preprocessing

##### Getting Datas

In [3]:
data = pd.read_csv("C:/Users/HOME/문서/한양대/3-2/산업공학연구실현장실습2/datas/data_new.csv",index_col=0)
data.head()

Unnamed: 0,AUTH_CUSTOMER_ID,PRODUCT_CODE,ORDER_DATE,BIRTH_YEAR,user_total_fq,item_total_fq,GENDER,total_fq,C,target,DEPTH1,DEPTH2,DEPTH3,DEPTH4
38495,7,1472250,1634565000.0,1994.0,230,97090,0,5582675,3.459891e-05,1,14,1472,147225,1472250
148981,7,1212466,1642638000.0,1994.0,230,594,0,34155,2.116774e-07,1,12,1212,121246,1212466
480601,7,1212466,1654022000.0,1994.0,230,594,0,34155,2.116774e-07,1,12,1212,121246,1212466
160576,7,1212466,1643224000.0,1994.0,230,594,0,34155,2.116774e-07,1,12,1212,121246,1212466
160543,7,1212466,1643220000.0,1994.0,230,594,0,34155,2.116774e-07,1,12,1212,121246,1212466


In [9]:
interactions_df = data[['AUTH_CUSTOMER_ID',"PRODUCT_CODE"]]
interactions_df

Unnamed: 0,AUTH_CUSTOMER_ID,PRODUCT_CODE
38495,7,1472250
148981,7,1212466
480601,7,1212466
160576,7,1212466
160543,7,1212466
...,...,...
1758592,1173285,1472823
1758593,1173285,1472823
1758594,1173285,1472823
1758543,1173285,1509837


In [None]:
from rankfm.utils import get_data

In [11]:
interactions_df = pd.DataFrame((get_data(interactions_df)), columns=['user_id', 'item_id'])
interactions_df

Unnamed: 0,user_id,item_id
0,7,1472250
1,7,1212466
2,7,1212466
3,7,1212466
4,7,1212466
...,...,...
1168410,1173285,1472823
1168411,1173285,1472823
1168412,1173285,1472823
1168413,1173285,1509837


In [13]:
N = interactions_df.shape[0]
shuffle_index = np.arange(N, dtype=np.int32)

In [16]:
for r in range(100):
    row = shuffle_index[r]
    u = interactions_df[row,0]
    i = interactions_df[row,1]

KeyError: (0, 0)

### Check Matrix/Vector Dimensions
데이터 구조 확인

In [5]:
unique_users = interactions_df.AUTH_CUSTOMER_ID.nunique()
unique_items = interactions_df.PRODUCT_CODE.nunique()

print("interaction shape: {}".format(interactions_df.shape))
print("interactions unique users: {}".format(unique_users))
print("interactions unique items: {}".format(unique_items))

interaction shape: (1168415, 2)
interactions unique users: 113578
interactions unique items: 8379


Evaluate Interaction Matrix Sparsity

In [6]:
sparsity = 1-len(interactions_df)/(unique_users * unique_items)
print("interaction matrix sparsity: {}%".format(round(100*sparsity, 1)))

interaction matrix sparsity: 99.9%


### Splliting Data into (Train, Valid) for Model Evaluation

In [7]:
np.random.seed(3)
interactions_df['random'] = np.random.random(size=len(interactions_df))
test_pct_1=0.25
test_pct_2=0.3
test_pct_3=0.1

In [8]:
class train_valid_generator:
    def __init__(self, test_pct):
        self.test_pct = test_pct
        self.train_mask = interactions_df['random']<1-(test_pct)
        self.valid_mask = interactions_df['random'] >= (1-test_pct)
        self.interactions_train = interactions_df[self.train_mask][['AUTH_CUSTOMER_ID','PRODUCT_CODE']]
        self.interactions_valid = interactions_df[self.valid_mask][['AUTH_CUSTOMER_ID','PRODUCT_CODE']]

        self.train_users = np.sort(self.interactions_train.AUTH_CUSTOMER_ID.unique())
        self.valid_users = np.sort(self.interactions_valid.AUTH_CUSTOMER_ID.unique())
        self.cold_start_users = set(self.valid_users) - set(self.train_users)

        self.train_items = np.sort(self.interactions_train.PRODUCT_CODE.unique())
        self.valid_items = np.sort(self.interactions_valid.PRODUCT_CODE.unique())
        self.cold_start_items = set(self.valid_items) - set(self.train_items)

In [9]:
data_1 = train_valid_generator(test_pct_1)
data_2 = train_valid_generator(test_pct_2)
data_3 = train_valid_generator(test_pct_3)

### Fit the Model on the Training Data
* each epoch represents one full pass through all observered user/item ineractions in the training data
* verbose : current training epoch와 penalized log-likelihood 출력
* algorithm attempts to maximize model's log-likelihood : user preference for observed items over unobserved item

In [10]:
model_1 = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
model_2 = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
model_3 = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')

In [11]:
model_1.fit(data_1.interactions_train, epochs=10, verbose=True)


training epoch: 0
log likelihood: -349014.65625

training epoch: 1
log likelihood: -345466.09375

training epoch: 2
log likelihood: -344863.96875

training epoch: 3
log likelihood: -343856.34375

training epoch: 4
log likelihood: -342791.71875

training epoch: 5
log likelihood: -340500.9375

training epoch: 6
log likelihood: -338542.40625

training epoch: 7
log likelihood: -335913.1875

training epoch: 8
log likelihood: -332329.75

training epoch: 9
log likelihood: -328900.46875


In [12]:
model_2.fit(data_2.interactions_train, epochs=10, verbose=True)


training epoch: 0
log likelihood: -326010.5625

training epoch: 1
log likelihood: -322542.15625

training epoch: 2
log likelihood: -321596.96875

training epoch: 3
log likelihood: -320667.53125

training epoch: 4
log likelihood: -319760.1875

training epoch: 5
log likelihood: -318341.15625

training epoch: 6
log likelihood: -315553.0625

training epoch: 7
log likelihood: -314046.90625

training epoch: 8
log likelihood: -310880.46875

training epoch: 9
log likelihood: -307888.90625


In [13]:
model_3.fit(data_3.interactions_train, epochs=10, verbose=True)


training epoch: 0
log likelihood: -418336.625

training epoch: 1
log likelihood: -414544.375

training epoch: 2
log likelihood: -413826.625

training epoch: 3
log likelihood: -412937.90625

training epoch: 4
log likelihood: -410650.4375

training epoch: 5
log likelihood: -407951.34375

training epoch: 6
log likelihood: -403925.53125

training epoch: 7
log likelihood: -399896.71875

training epoch: 8
log likelihood: -394732.59375

training epoch: 9
log likelihood: -388852.625


### Generate Model Scores for Validation Interactions
* `predict()` : user/item pair을 위한 실수 model scores 생성
* [user_id, item_id]와 같은 방식으로 specify scoring interactions
* `cold_start` : to either generate missing values for unseen user/itmes or drop them for scoring output

In [14]:
# valid_scores_1 = model_1.predict(data_1.interactions_valid, cold_start='nan')
# valid_scores_2 = model_2.predict(data_2.interactions_valid, cold_start='nan')
# valid_scores_3 = model_3.predict(data_3.interactions_valid, cold_start='nan')
# print(valid_scores_1.shape, valid_scores_2.shape, valid_scores_3.shape)
# pd.Series(valid_scores_1).describe()

In [15]:
# valid_recs_1 = model_1.recommend(data_1.valid_users, n_items=10, filter_previous=True, cold_start="drop")
# valid_recs_2 = model_2.recommend(data_2.valid_users, n_items=10, filter_previous=True, cold_start="drop")
# valid_recs_2 = model_3.recommend(data_3.valid_users, n_items=10, filter_previous=True, cold_start="drop")

### Evaluate Model Performance with Validation Data

In [16]:
class evaluative_metrics:
    def __init__(self, model, data):
        self.valid_hit_rate = hit_rate(model, data.interactions_valid, k=10)
        self.reciprocal_rank = reciprocal_rank(model, data.interactions_valid, k=10)
        self.valid_dcg = discounted_cumulative_gain(model, data.interactions_valid, k=10)
        self.valid_precision = precision(model, data.interactions_valid, k=10)
        self.recall = recall(model, data.interactions_valid, k=10)
    
    def print_metrics(self):
        print("hit rate: {:.4f}".format(self.valid_hit_rate))
        print("reciprocal_rank: {:.4f}".format(self.reciprocal_rank))
        print("dcg: {:.4f}".format(self.valid_dcg))
        print("precision: {:.4f}".format(self.valid_precision))
        print("recall: {}".format(self.recall))

In [17]:
eval_1 = evaluative_metrics(model_1, data_1)
eval_1.print_metrics()

hit rate: 0.3437
reciprocal_rank: 0.1672
dcg: 0.2326
precision: 0.0407
recall: 0.19114823320135513


In [18]:
eval_2 = evaluative_metrics(model_2, data_2)
eval_2.print_metrics()

hit rate: 0.3649
reciprocal_rank: 0.1730
dcg: 0.2452
precision: 0.0437
recall: 0.18775074234878947


In [19]:
eval_3 = evaluative_metrics(model_3, data_3)
eval_3.print_metrics()

hit rate: 0.2858
reciprocal_rank: 0.1522
dcg: 0.1947
precision: 0.0314
recall: 0.20842983116648403
