In [42]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
from keras.preprocessing.sequence import pad_sequences

# Load Data

In [5]:
rating_data = "/opt/ml/input/data/train/train_ratings.csv"

raw_rating_df = pd.read_csv(rating_data)
raw_rating_df
raw_rating_df['rating'] = 1.0
raw_rating_df.drop(['time'],axis=1,inplace=True)

users = set(raw_rating_df.loc[:, 'user'])
items = set(raw_rating_df.loc[:, 'item'])


raw_rating_df

Unnamed: 0,user,item,rating
0,11,4643,1.0
1,11,170,1.0
2,11,531,1.0
3,11,616,1.0
4,11,2140,1.0
...,...,...,...
5154466,138493,44022,1.0
5154467,138493,4958,1.0
5154468,138493,68319,1.0
5154469,138493,40819,1.0


In [6]:
print("Create Nagetive instances")
num_negative = 50
user_group_dfs = list(raw_rating_df.groupby('user')['item'])
first_row = True
user_neg_dfs = pd.DataFrame()

for u, u_items in tqdm(user_group_dfs):
    u_items = set(u_items)
    i_user_neg_item = np.random.choice(list(items - u_items), num_negative, replace=False)
    
    i_user_neg_df = pd.DataFrame({'user': [u]*num_negative, 'item': i_user_neg_item, 'rating': [0]*num_negative})
    if first_row == True:
        user_neg_dfs = i_user_neg_df
        first_row = False
    else:
        user_neg_dfs = pd.concat([user_neg_dfs, i_user_neg_df], axis = 0, sort=False)

raw_rating_df = pd.concat([raw_rating_df, user_neg_dfs], axis = 0, sort=False)

Create Nagetive instances


100%|██████████| 31360/31360 [05:51<00:00, 89.20it/s] 


In [7]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

raw_rating_df['user'] = user_encoder.fit_transform(raw_rating_df['user'])
raw_rating_df['item'] = item_encoder.fit_transform(raw_rating_df['item'])

In [83]:
genres = pd.read_csv("/opt/ml/input/data/train/genres.tsv", sep="\t")

genres['item'] = item_encoder.transform(genres['item'])
item_len = len(genres['item'].unique())
genre_len = len(genres['genre'].unique())          
genre_matrix = pd.DataFrame(np.zeros((item_len, genre_len)), columns=list(genres['genre'].unique()))
for _, (i, g) in genres.iterrows() :
    genre_matrix.loc[i,g] = 1.0

In [9]:
years = pd.read_csv("/opt/ml/input/data/train/years_new.tsv", sep="\t")
years
years['item'] = item_encoder.transform(years['item'])
years['year'] = (years['year'] - min(years['year'])) / max(years['year'])
years.sort_values('item', inplace=True)

In [10]:
directors = pd.read_csv("/opt/ml/input/data/train/directors.tsv", sep="\t")
directors['item'] = item_encoder.transform(directors['item'])
directors_len = len(directors['director'].unique())
directors_matrix = pd.DataFrame(np.zeros((item_len, directors_len)), columns=list(directors['director'].unique()))
for _, (i, d) in directors.iterrows() :
    directors_matrix.loc[i,d] = 1.0

In [84]:
genres = genres.sort_values('item')

In [85]:
def split(key_ans):
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

In [86]:
genre_dict = dict()
for i , (item, genre) in genres.iterrows() :
    if genre_dict.get(item) : genre_dict[item].append(genre)
    else : genre_dict[item] = [genre]

In [87]:
genres_list = []
key2index = {}
for item, genre in genre_dict.items() :
    genres_list.append(split(genre))

In [88]:
genres_length = np.array(list(map(len, genres_list)))

In [89]:
max_len = max(genres_length)

In [90]:
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [101]:
genre_matrix = pd.DataFrame(genres_list)

In [103]:
data = pd.merge(left=raw_rating_df, right=genre_matrix, how='inner', left_on='item', right_on=genre_matrix.index)

In [104]:
data

Unnamed: 0,user,item,rating,0,1,2,3,4,5,6,7,8,9
0,0,2505,1.0,13,3,7,10,0,0,0,0,0,0
1,39,2505,1.0,13,3,7,10,0,0,0,0,0,0
2,66,2505,1.0,13,3,7,10,0,0,0,0,0,0
3,85,2505,1.0,13,3,7,10,0,0,0,0,0,0
4,95,2505,1.0,13,3,7,10,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6722466,30999,6630,0.0,13,3,10,0,0,0,0,0,0,0
6722467,31247,6630,0.0,13,3,10,0,0,0,0,0,0,0
6722468,31279,6630,0.0,13,3,10,0,0,0,0,0,0,0
6722469,31281,6630,0.0,13,3,10,0,0,0,0,0,0,0


In [14]:
temp = pd.merge(left=raw_rating_df, right=genre_matrix, how='inner', left_on='item', right_on=genre_matrix.index)

In [18]:
data = pd.merge(left=temp, right=years, how='inner', left_on='item', right_on='item')

In [19]:
sparseFeatures = list(genre_matrix.columns)
denseFeatures = ['year']

In [21]:
fixlen_feature_columns= [SparseFeat(feat, data[feat].nunique()) 
 for feat in sparseFeatures] + [DenseFeat(feat, 1,) for feat in denseFeatures]

In [22]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)

In [23]:
train, test = train_test_split(data, test_size=0.2)

train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [26]:
import torch

In [28]:
target = data['rating']

In [29]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
               task='binary',
               l2_reg_embedding=1e-5, device=device)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
model.fit(train_model_input,train[target].values,batch_size=32,epochs=10,verbose=2,validation_split=0.0)

pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cuda ready...


KeyError: "None of [Float64Index([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,\n              ...\n              0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],\n             dtype='float64', length=6722471)] are in the [columns]"

In [None]:
x