In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat,  get_feature_names
from deepctr_torch.models import *
from keras.preprocessing.sequence import pad_sequences

import torch

2022-04-14 00:42:30.939354: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-04-14 00:42:30.939422: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Load data

In [2]:
rating_data = "/opt/ml/input/data/train/train_ratings.csv"

raw_rating_df = pd.read_csv(rating_data)
raw_rating_df
raw_rating_df['rating'] = 1.0
raw_rating_df.drop(['time'],axis=1,inplace=True)

users = set(raw_rating_df.loc[:, 'user'])
items = set(raw_rating_df.loc[:, 'item'])


raw_rating_df

Unnamed: 0,user,item,rating
0,11,4643,1.0
1,11,170,1.0
2,11,531,1.0
3,11,616,1.0
4,11,2140,1.0
...,...,...,...
5154466,138493,44022,1.0
5154467,138493,4958,1.0
5154468,138493,68319,1.0
5154469,138493,40819,1.0


## Negative sampling

In [3]:
print("Create Nagetive instances")
num_negative = 50
user_group_dfs = list(raw_rating_df.groupby('user')['item'])
first_row = True
user_neg_dfs = pd.DataFrame()

for u, u_items in tqdm(user_group_dfs):
    u_items = set(u_items)
    i_user_neg_item = np.random.choice(list(items - u_items), num_negative, replace=False)
    
    i_user_neg_df = pd.DataFrame({'user': [u]*num_negative, 'item': i_user_neg_item, 'rating': [0]*num_negative})
    if first_row == True:
        user_neg_dfs = i_user_neg_df
        first_row = False
    else:
        user_neg_dfs = pd.concat([user_neg_dfs, i_user_neg_df], axis = 0, sort=False)

raw_rating_df = pd.concat([raw_rating_df, user_neg_dfs], axis = 0, sort=False)

Create Nagetive instances


100%|██████████| 31360/31360 [06:04<00:00, 85.94it/s] 


## Label encoder

In [4]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

raw_rating_df['user'] = user_encoder.fit_transform(raw_rating_df['user'])
raw_rating_df['item'] = item_encoder.fit_transform(raw_rating_df['item'])

## Load_genres

In [5]:
genres = pd.read_csv("/opt/ml/input/data/train/genres.tsv", sep="\t")
genres['item'] = item_encoder.transform(genres['item'])

In [6]:
years = pd.read_csv("/opt/ml/input/data/train/years_new.tsv", sep="\t")
years['item'] = item_encoder.transform(years['item'])
years['year'] = (years['year'] - min(years['year'])) / max(years['year'])

In [7]:
directors = pd.read_csv("/opt/ml/input/data/train/directors.tsv", sep="\t")
directors['item'] = item_encoder.transform(directors['item'])

In [8]:
genre_dict = dict()
for _, (item, genre) in genres.iterrows() :
    if genre_dict.get(item) : genre_dict[item] = genre_dict[item] + '|' + genre
    else : genre_dict[item] = genre

In [11]:
genres = pd.DataFrame(list(genre_dict.items()), columns=['item', 'genres'])

In [12]:
temp = pd.merge(raw_rating_df, genres, how='inner', left_on='item', right_on='item')
temp = pd.merge(temp, years, how='inner', left_on='item', right_on='item')
data = temp

In [13]:
data

Unnamed: 0,user,item,rating,genres,year
0,0,2505,1.0,Action|Adventure|Drama|Sci-Fi,0.049132
1,39,2505,1.0,Action|Adventure|Drama|Sci-Fi,0.049132
2,66,2505,1.0,Action|Adventure|Drama|Sci-Fi,0.049132
3,85,2505,1.0,Action|Adventure|Drama|Sci-Fi,0.049132
4,95,2505,1.0,Action|Adventure|Drama|Sci-Fi,0.049132
...,...,...,...,...,...
6722466,30889,6630,0.0,Action|Adventure|Sci-Fi,0.055087
6722467,31008,6630,0.0,Action|Adventure|Sci-Fi,0.055087
6722468,31057,6630,0.0,Action|Adventure|Sci-Fi,0.055087
6722469,31297,6630,0.0,Action|Adventure|Sci-Fi,0.055087


In [14]:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

In [15]:
sparse_features = ["user", "item"]
dense_features = ["year"]
target = ['rating']

# 1.Label Encoding for sparse features,and process sequence features

key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

In [16]:
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features] + [DenseFeat(feat, 1, ) for feat in dense_features]

varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean')]  # Notice : value 0 is for padding for sequence input feature

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

model_input = {name: data[name] for name in feature_names} 
model_input["genres"] = genres_list

In [25]:
model_input['user']

0              0
1             39
2             66
3             85
4             95
           ...  
6722466    30889
6722467    31008
6722468    31057
6722469    31297
6722470    31359
Name: user, Length: 6722471, dtype: int64

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', l2_reg_embedding=1e-5, device=device)

model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
history = model.fit(model_input,data[target].values,batch_size=256,epochs=10,verbose=2,validation_split=0.2)

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

xmodel = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary', l2_reg_embedding=1e-5, device=device)

xmodel.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
history = xmodel.fit(model_input,data[target].values,batch_size=256,epochs=10,verbose=2,validation_split=0.2)

## NMF(Best)

In [17]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

xmodel = NFM(linear_feature_columns, dnn_feature_columns, task='binary', l2_reg_embedding=1e-5, device=device)

xmodel.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
# history = xmodel.fit(model_input,data[target].values,batch_size=256,epochs=7,verbose=2,validation_split=0.2)

cuda ready...


## Inference

In [18]:
xmodel.load_state_dict(torch.load('./NFM_state_dict.pt'))

<All keys matched successfully>

In [19]:
table = raw_rating_df.pivot_table(values='rating', index='item', columns='user').fillna(0.0).unstack()

In [20]:
data = pd.DataFrame(table)
data.reset_index(inplace=True)
data = data.rename(columns={0:'rating'})

In [21]:
data.drop(data[data['rating']==1.0].index, inplace=True)

In [27]:
temp = pd.merge(data, genres, how='inner', left_on='item', right_on='item')
temp = pd.merge(temp, years, how='inner', left_on='item', right_on='item')
data = temp

In [30]:
data = data.sort_values(['user', 'item'])

In [31]:
data

Unnamed: 0,user,item,rating,genres,year
0,0,1,0.0,Adventure|Children|Fantasy,0.046154
27996,0,2,0.0,Comedy|Romance,0.046154
58622,0,3,0.0,Comedy|Drama|Romance,0.046154
89939,0,4,0.0,Comedy,0.046154
120709,0,5,0.0,Action|Crime|Thriller,0.046154
...,...,...,...,...,...
197795036,31359,6802,0.0,Drama,0.055583
197826336,31359,6803,0.0,Drama,0.055583
197857644,31359,6804,0.0,Children|Comedy|Fantasy|Musical,0.055583
197888882,31359,6805,0.0,Action|Comedy,0.055583


In [32]:
sparse_predicttures = ["user", "item"]
dense_features = ["year"]
target = ['rating']

# 1.Label Encoding for sparse features,and process sequence features

key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

In [33]:
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features] + [DenseFeat(feat, 1, ) for feat in dense_features]

varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean')]  # Notice : value 0 is for padding for sequence input feature

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

model_input = {name: data[name] for name in feature_names} 
model_input["genres"] = genres_list

In [None]:
model_input

In [34]:
result = xmodel.predict(model_input)

In [38]:
result.squeeze()

array([0.99073982, 0.9718892 , 0.80510414, ..., 0.70711005, 0.71789944,
       0.7088058 ])

In [39]:
data['result'] = result

In [42]:
score = data.drop(['rating', 'genres', 'year'], axis=1)

In [44]:
score.to_csv('./NMF.csv', index=False)

In [46]:
score = score.sort_values('result',ascending=False)

In [67]:
result = score.groupby('user').head(10)

In [68]:
result['user'] = user_encoder.inverse_transform(result['user'])
result['item'] = item_encoder.inverse_transform(result['item'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['user'] = user_encoder.inverse_transform(result['user'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['item'] = item_encoder.inverse_transform(result['item'])


In [73]:
result = result.sort_values(['user', 'result'], ascending=[True,False])

In [74]:
result.to_csv('NMF_result.csv', index=False)

In [75]:
result

Unnamed: 0,user,item,result
22799162,11,1485,0.999223
113192305,11,8528,0.999162
41356681,11,2694,0.998915
75128675,11,4963,0.998910
60514660,11,3948,0.998832
...,...,...,...
60541363,138493,3948,0.993244
22827135,138493,1485,0.992416
151742497,138493,54286,0.991279
2105188,138493,104,0.990595
