### Importing Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings("ignore")

In [2]:
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall, diversity

### Preprocessing Datas
##### Getting Data

In [None]:
ratings = pd.io.parsers.read_csv("C:/Users/HOME/문서/한양대/3-2/산업공학연구실현장실습2/datas/MovieLens/ratings.dat", names=['userId','movieId','rating','time'], delimiter="::", encoding='ISO-8859-1')

In [49]:
users = pd.io.parsers.read_csv("C:/Users/HOME/문서/한양대/3-2/산업공학연구실현장실습2/datas/MovieLens/users.dat", names=['userId','gender','age','occupation','zipcode'], delimiter="::", encoding='ISO-8859-1')

In [56]:
movies = pd.io.parsers.read_csv("C:/Users/HOME/문서/한양대/3-2/산업공학연구실현장실습2/datas/MovieLens/movies.dat", names=['movieId','title','genre'], delimiter="::", encoding='ISO-8859-1')

In [19]:
ratings.head()

Unnamed: 0,userId,movieId,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [18]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [20]:
movies.head()

Unnamed: 0,movieId,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


##### Converting to Categorical

In [50]:
user_features = pd.get_dummies(users, columns=['gender','age','occupation'])
user_features.drop(columns=['zipcode'],inplace=True)
user_features = user_features+0
user_features

Unnamed: 0,userId,gender_F,gender_M,age_1,age_18,age_25,age_35,age_45,age_50,age_56,...,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20
0,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,3,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6036,6037,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,6038,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6038,6039,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
movie_features =  movies['genre'].str.get_dummies("|")
movie_features = movie_features+0
movie_features = pd.concat([movies['movieId'],movie_features],axis=1)
movie_features

In [32]:
movie_titles = movies
movie_titles

Unnamed: 0,movieId,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


##### Checking Matrix & Vector dimensions

In [33]:
unique_users = ratings.userId.nunique()
unique_items = ratings.movieId.nunique()

print("ratings shape: {}".format(ratings.shape))
print("unique users: {}".format(unique_users))
print("unique items: {}".format(unique_items))

ratings shape: (1000209, 4)
unique users: 6040
unique items: 3706


##### Matrix Sparsity

In [34]:
sparsity = 1-(len(ratings) / (unique_users*unique_users))
print("Matrix Sparsity : {}%".format(round(100*sparsity, 1)))

Matrix Sparsity : 97.3%


### Splitting Data
splitting data for train and validation

In [35]:
np.random.seed(3)
ratings['random'] = np.random.random(size=len(ratings))
test_pct=0.25

In [36]:
train_mask = ratings['random'] < (1-test_pct)
valid_mask = ratings['random'] >= (1-test_pct)

ratings_train = ratings[train_mask][['userId','movieId']]
ratings_valid = ratings[valid_mask][['userId','movieId']]

train_users = np.sort(ratings_train.userId.unique())
valid_users = np.sort(ratings_valid.userId.unique())
cold_start_users = set(valid_users) - set(train_users)

train_movies = np.sort(ratings_train.movieId.unique())
valid_movies = np.sort(ratings_valid.movieId.unique())
cold_start_movies = set(valid_movies) - set(train_movies)

print("train shape :{}".format(ratings_train.shape))
print("valid shape :{}".format(ratings_valid.shape))

print("train users :{}".format(len(train_users)))
print("valid users :{}".format(len(valid_users)))
print("cold-start users :{}".format(cold_start_users))

print("train movies :{}".format(len(train_movies)))
print("valid movies :{}".format(len(valid_movies)))
print("cold-start movies :{}".format(cold_start_movies))

train shape :(749523, 2)
valid shape :(250686, 2)
train users :6040
valid users :6040
cold-start users :set()
train movies :3668
valid movies :3505
cold-start movies :{3202, 3458, 133, 3209, 3337, 139, 2703, 658, 3220, 790, 2592, 2213, 1830, 3881, 2218, 3376, 1714, 2226, 826, 1852, 576, 843, 717, 2254, 3151, 3413, 1115, 3164, 989, 1630, 3295, 226, 3172, 872, 1386, 3312, 758, 2556}


### Using Model

In [41]:
# Initializing Model
model = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
model

<rankfm.rankfm.RankFM at 0x1c3d39d44d0>

In [43]:
help(model.fit)

Help on method fit in module rankfm.rankfm:

fit(interactions, user_features=None, item_features=None, sample_weight=None, epochs=1, verbose=False) method of rankfm.rankfm.RankFM instance
    clear previous model state and learn new model weights using the input data
    
    :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
    :param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
    :param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
    :param sample_weight: vector of importance weights for each observed interaction
    :param epochs: number of training epochs (full passes through observed interactions)
    :param verbose: whether to print epoch number and log-likelihood during training
    :return: self



In [67]:
model.fit(ratings_train, epochs=1, verbose=True)

KeyError: 'the items in [item_features] do not match the items in [interactions]'

##### Generate Model Scores for Validation Interaction

In [76]:
valid_scores = model.predict(ratings_valid, cold_start='nan')
print(valid_scores.shape)
pd.Series(valid_scores).describe()

(250686,)


count    250645.000000
mean          0.738618
std           0.923667
min          -3.765935
25%           0.094899
50%           0.730985
75%           1.385614
max           4.065475
dtype: float64

* `predict` : generates real-valued model scores for user/movie pairs

##### Generating Top n Recommendations for `valid_users`

In [80]:
valid_recommendations = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='nan')
valid_recommendations.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,364,318,593,1225,2081,1196,2858,1210,588,356
2,527,457,3418,858,1233,480,377,260,1580,590
3,2918,2000,1073,2797,2716,1220,2028,110,592,1387
4,858,1200,1214,1221,589,1291,1304,1197,2571,3703
5,2858,1617,2324,2396,2336,2710,223,2712,1060,778


##### Evaluating Model Performance on `ratings_valid`

In [81]:
k = 10

In [83]:
most_popular = ratings_train.groupby('movieId')['userId'].count().sort_values(ascending=False)[:k]
most_popular

movieId
2858    2577
1196    2210
260     2207
1210    2176
2028    2005
589     1972
480     1959
2571    1950
1270    1938
593     1918
Name: userId, dtype: int64

In [84]:
test_user_items = ratings_valid.groupby('userId')['movieId'].apply(set).to_dict()
test_user_items = {key: val for key, val in test_user_items.items() if key in set(train_users)}

base_hrt = np.mean([int(len(set(most_popular.index) & set(val)) > 0)                       for key, val in test_user_items.items()])
base_pre = np.mean([len(set(most_popular.index) & set(val)) / len(set(most_popular.index)) for key, val in test_user_items.items()])
base_rec = np.mean([len(set(most_popular.index) & set(val)) / len(set(val))                for key, val in test_user_items.items()])

print("number of test users: {}".format(len(test_user_items)))
print("baseline hit rate: {:.3f}".format(base_hrt))
print("baseline precision: {:.3f}".format(base_pre))
print("baseline recall: {:.3f}".format(base_rec))

number of test users: 6040
baseline hit rate: 0.633
baseline precision: 0.118
baseline recall: 0.045


* `hit_rate` : proportion of users with any relevant recommended item
* `precision` : the number of relevant recommended items / number of recommended items
* `recall` : the number of relevant recommended items / number of relevant items