In [1]:
import pandas as pd
import numpy as np

BASE_PATH = '/opt/ml/input/data/train/'
ratings = pd.read_csv(BASE_PATH + 'train_ratings.csv')
titles = pd.read_csv(BASE_PATH + 'titles.tsv', sep='\t')
genres = pd.read_csv(BASE_PATH + 'genres.tsv', sep='\t')
directors = pd.read_csv(BASE_PATH + 'directors.tsv', sep='\t')
years = pd.read_csv(BASE_PATH + 'years.tsv', sep='\t')
ratings = ratings.drop('time', axis=1)
ratings['rate'] = 1.0

In [2]:
from sklearn.model_selection import train_test_split

ratings_train, ratings_test = train_test_split(ratings,
                                              stratify=ratings['user'],
                                              random_state=42,
                                              test_size=0.2)

print("N of Train : " , len(ratings_train))
print("N of Test : " , len(ratings_test))

N of Train :  4123576
N of Test :  1030895


In [3]:
ratings_train

Unnamed: 0,user,item,rate
1762082,46685,5954,1.0
4082227,109245,5952,1.0
4784205,128351,1208,1.0
1586419,42033,1721,1.0
589952,15703,357,1.0
...,...,...,...
4216027,112903,91542,1.0
282323,7482,2502,1.0
2167888,57473,1204,1.0
2112878,56034,6807,1.0


In [15]:
def rating_splitter(data) :
    gr_user_like = data.groupby(['rate', 'user'])
    return ([gr_user_like.get_group(gr)['item'].tolist() for gr in gr_user_like.groups])

In [16]:
splitted_movies = rating_splitter(ratings_train)

In [17]:
import random
for movie_list in splitted_movies:
    random.shuffle(movie_list)

In [20]:
from gensim.models import Word2Vec
import datetime

In [23]:
start = datetime.datetime.now()

model = Word2Vec(sentences = splitted_movies, # We will supply the pre-processed list of moive lists to this parameter
                 epochs = 5, # epoch
                 min_count = 10, # a movie has to appear more than 10 times to be keeped
                 vector_size = 200, # size of the hidden layer
                 workers = 4, # specify the number of threads to be used for training
                 sg = 1, # Defines the training algorithm. We will use skip-gram so 1 is chosen.
                 hs = 0, # Set to 0, as we are applying negative sampling.
                 negative = 5, # If > 0, negative sampling will be used. We will use a value of 5.
                 window = 9999999)

print("Time passed: " + str(datetime.datetime.now()-start))
model.save('item2vec_20200908')
del model

Time passed: 0:48:50.295844


In [24]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import Word2Vec
model = Word2Vec.load('item2vec_20200908')
word_vectors = model.wv

In [28]:
word_vectors.cosine_similarities(5954,5952)

AxisError: axis 1 is out of bounds for array of dimension 0

In [29]:
def produce_list_of_movieId(list_of_movieName, useRefineSearch=False):
    """
    Turn a list of movie name into a list of movie ids. The movie names has to be exactly the same as they are in the dataset.
    Ambiguous movie names can be supplied if useRefineSearch is set to True
    
    Args:
        list_of_movieName (List): A list of movie names.
        useRefineSearch (boolean): Ambiguous movie names can be supplied if useRefineSearch is set to True

    Returns:
        list_of_movie_id (List of strings): A list of movie ids.
    """
    list_of_movie_id = []
    for movieName in list_of_movieName:
        if useRefineSearch:
            movieName = refine_search(movieName)
            print(movieName)
            print("Refined Name: "+movieName)
        if movieName in name_to_movieId.keys():
            list_of_movie_id.append(str(name_to_movieId[movieName]))
    return list_of_movie_id

def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
    recommend_movie_ls = []
    if positive_list:
        positive_list = produce_list_of_movieId(positive_list, useRefineSearch)
    if negative_list:
        negative_list = produce_list_of_movieId(negative_list, useRefineSearch)
    for movieId, prob in model.wv.most_similar_cosmul(positive=positive_list, negative=negative_list, topn=topn):
        recommend_movie_ls.append(movieId)
    return recommend_movie_ls