Reading required files. 

In [0]:
mv genreproper3.h5 gdrive/My\ Drive

In [2]:
import pandas as pd
import numpy as np
from keras.models import load_model

links = pd.read_csv('gdrive/My Drive/links.csv')
movies = pd.read_csv('gdrive/My Drive/movies.csv')
ratings = pd.read_csv('gdrive/My Drive/ratings.csv')
tags = pd.read_csv('gdrive/My Drive/tags.csv')

Using TensorFlow backend.


Cleaning genres string. Replacing "|" with " "

In [0]:
tags_1 = pd.DataFrame(tags.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))
df = pd.merge(movies, tags_1,on='movieId', how='left')
df['genres'] = df['genres'].str.replace('|',' ')
df.fillna("",inplace=True)

Label Encoding "movieId" column so that the movieId's start with 1 and end with 9742 (max number of movies in the dataset).

In [0]:
from sklearn.preprocessing import LabelEncoder
item_enc = LabelEncoder()

df['movieId'] = item_enc.fit_transform(df['movieId'].values)

df['movieId'] = df['movieId']+1

In [7]:
df.head(3)

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old


Converting upper case characters to lower case. To avoid duplicates when the words are tokenized.

In [0]:
df.genres = df.genres.apply(lambda x: x.lower())

In [0]:
import string
df.genres = df.genres.apply(lambda x: x.translate(string.punctuation))


In [0]:
ratings['movieId'] = item_enc.fit_transform(ratings['movieId'].values)

ratings['movieId'] = ratings['movieId']+1


Grouping movies by userId.

In [0]:
user_movies_watched = pd.DataFrame(ratings.groupby(['userId']).apply(lambda x: [list(x['movieId'])]).apply(pd.Series))

In [12]:
user_movies_watched.head(3)

Unnamed: 0_level_0,0
userId,Unnamed: 1_level_1
1,"[1, 3, 6, 44, 47, 63, 90, 98, 125, 131, 137, 1..."
2,"[278, 292, 1284, 2671, 4608, 5295, 6237, 6299,..."
3,"[31, 462, 546, 566, 586, 657, 697, 832, 853, 9..."


Obtaining all the genres of the respective movies.

In [0]:
user_movies_watched['genres']= df['genres'].astype(str)

def genres(j,row):
    a = []
    for i in row:
        a.append(df.loc[df['movieId'] == i, 'genres'].iloc[0])
    if(j<610):
        user_movies_watched.at[j+1, 'genres'] = ' '.join(word for word in a)
        
for i in range(0,610):
    genres(i,(user_movies_watched.iloc[i:i+1,0].iloc[0]))

In [0]:
user_movies_watched.columns = ['movies', 'genres']

In [15]:
user_movies_watched.head(3)

Unnamed: 0_level_0,movies,genres
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[1, 3, 6, 44, 47, 63, 90, 98, 125, 131, 137, 1...",adventure animation children comedy fantasy co...
2,"[278, 292, 1284, 2671, 4608, 5295, 6237, 6299,...",crime drama comedy comedy horror sci-fi crime ...
3,"[31, 462, 546, 566, 586, 657, 697, 832, 853, 9...",drama drama war action crime drama war action ...


"Multi hot encoding" the output "y" labels. 

In [0]:
y = np.zeros((610, 9743))

In [0]:
def y_s(j,row):
    a = []
    for i in row:
        y[j,i]=1

for i in range(0,610):
    y_s(i,(user_movies_watched.iloc[i:i+1,0].iloc[0]))

In [18]:
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

Maximum number of movies seen by any user? This will be the movieId input size.

In [19]:
# Knowing the max no. of movies a user has seen.
max1 = 0 
for i in range(0,610):
    if (max1 < len(user_movies_watched.iloc[i:i+1,0].iloc[0])):
        max1 = len(user_movies_watched.iloc[i:i+1,0].iloc[0])
        print(max1)

232
314
502
703
1260
1346
2698


Padding other users to the required size.

In [0]:
def padd_movies_seen(j, row):
    if(len(row))<max1:
        N = max1 - len(row)
        a = np.pad(np.array(row), (0, N), 'constant')
        user_movies_watched.at[j+1, 'movies']=a
        
for i in range(0,610):
    padd_movies_seen(i,(user_movies_watched.iloc[i:i+1,0].iloc[0]))

In [22]:
user_movies_watched.head(3)

Unnamed: 0_level_0,movies,genres
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[1, 3, 6, 44, 47, 63, 90, 98, 125, 131, 137, 1...",adventure animation children comedy fantasy co...
2,"[278, 292, 1284, 2671, 4608, 5295, 6237, 6299,...",crime drama comedy comedy horror sci-fi crime ...
3,"[31, 462, 546, 566, 586, 657, 697, 832, 853, 9...",drama drama war action crime drama war action ...


Tokenizing all the genres. Every word will have a unique number associated with it.

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

vocabulary_size = 24
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(user_movies_watched['genres'])
sequences = tokenizer.texts_to_sequences(user_movies_watched['genres'])
genres_map = dict(map(reversed, tokenizer.word_index.items()))
genres_array = pad_sequences(sequences,maxlen=6189, padding='post')

movies_array = user_movies_watched.movies.values
movies_array = np.stack(movies_array, axis = 0)

In [24]:
y.shape, genres_array.shape, movies_array.shape

((610, 9743), (610, 6189), (610, 2698))

The deep learning model. 

In [0]:
from keras.layers import Concatenate, Dense, Dropout, Flatten, LSTM
from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.layers import Add, Activation, Lambda
#from keras.backend import K

n_factors = 50

def rec(n_factors):
    
    movies = Input(shape=(max1,))
    m = Embedding(9743, n_factors, embeddings_initializer='he_normal')(movies)
    m = Flatten()(m)
    
    
    genres = Input(shape=(genre_size,))
    g = Embedding(25, n_factors)(genres)
    g = Flatten()(g)
    #g = Reshape((n_factors,))(g)
    
    x = Concatenate()([m, g])
    x = Dropout(0.05)(x)
    x = Reshape((1,444350))(x)
#     x = Dense(100, kernel_initializer='he_normal')(x)
#     x = Activation('sigmoid')(x)
#     x = Dropout(0.5)(x)
#     x = Reshape((1,100))(x)
    
    x = LSTM(100, dropout=0.2, recurrent_dropout=0.2) (x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    
    x = Dense(9743, kernel_initializer='he_normal')(x)
    x = Activation('softmax')(x)
    
    model = Model(inputs=[movies, genres], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model


In [0]:
model = rec(50)
model.summary()






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 2698)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6189)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 2698, 50)     487150      input_1[0][0]                    
__________________________________________________________________________

Train.

In [25]:
#2000+2000+2500+1500+3000
history = model.fit(x=[movies_array, genres_array], y=y, batch_size=8, epochs=2000,
                    verbose=2)

Epoch 1/2000
 - 15s - loss: 0.1074 - acc: 0.9830
Epoch 2/2000
 - 10s - loss: 0.1075 - acc: 0.9830
Epoch 3/2000
 - 10s - loss: 0.1076 - acc: 0.9830
Epoch 4/2000
 - 10s - loss: 0.1074 - acc: 0.9830
Epoch 5/2000
 - 10s - loss: 0.1074 - acc: 0.9830
Epoch 6/2000
 - 10s - loss: 0.1076 - acc: 0.9830
Epoch 7/2000
 - 10s - loss: 0.1073 - acc: 0.9830
Epoch 8/2000
 - 10s - loss: 0.1074 - acc: 0.9830
Epoch 9/2000
 - 10s - loss: 0.1074 - acc: 0.9830
Epoch 10/2000
 - 10s - loss: 0.1076 - acc: 0.9830
Epoch 11/2000
 - 10s - loss: 0.1072 - acc: 0.9830
Epoch 12/2000
 - 10s - loss: 0.1074 - acc: 0.9830
Epoch 13/2000
 - 10s - loss: 0.1074 - acc: 0.9830
Epoch 14/2000
 - 10s - loss: 0.1072 - acc: 0.9830
Epoch 15/2000
 - 10s - loss: 0.1073 - acc: 0.9830
Epoch 16/2000
 - 10s - loss: 0.1072 - acc: 0.9830
Epoch 17/2000
 - 10s - loss: 0.1072 - acc: 0.9830
Epoch 18/2000
 - 10s - loss: 0.1076 - acc: 0.9830
Epoch 19/2000
 - 10s - loss: 0.1073 - acc: 0.9830
Epoch 20/2000
 - 10s - loss: 0.1074 - acc: 0.9830
Epoch 21/

In [0]:
model.save('gdrive/My Drive/genreproper6.h5')

To provide custom inputs.

In [0]:
def prepare_inputs(movie_input):
    """
    With only MovieID's as the input to our model, we are required to 
    obtain the required genre's and tag's and bring our raw input to
    the required shape.
    """
    
    movies_copy = movie_input[:]

    genres = (DF.genres[DF["movieId"].isin(movie_input)].values).tolist()
    genres = [word for line in genres for word in line.split()]

    genres = [
        list(GENRES_MAP.keys())[list(GENRES_MAP.values()).index(i)] for i in genres
    ]
    
    genres = pad(genres, INPUT_LENGTH["genre_len"])    
    movie_input = pad(movie_input, INPUT_LENGTH["movie_len"])
    
    return movie_input, genres, movies_copy

    
#     for i in r1:
#         g_input.append(list(genres_map.keys())[list(genres_map.values()).index(i.split())])
#     for i in r2:
#         t_input.append(list(genres_map.keys())[list(genres_map.values()).index(i.split())])
        
movies, g_input, movie_copy = prepare_inputs([508,597])


Predict:

In [0]:
preds = model.predict([np.array([movies,]),np.array([g_input,])])

Recommended movies.

In [44]:
most_similar = preds[0].argsort()[-(10 + len(movie_copy)) :][::-1]
rec_movies = DF.set_index("movieId").loc[most_similar].reset_index()    
blankIndex=[''] * len(rec_movies)
rec_movies.index=blankIndex
rec_movies

Unnamed: 0,movieId,Title,Genres,Tags
0,4351,Kiss Me Kate (1953),comedy musical romance,
1,909,To Kill a Mockingbird (1962),drama,harper lee racism
2,2282,"Taming of the Shrew, The (1967)",comedy,shakespeare
3,514,Pinocchio (1940),animation children fantasy musical,disney
4,3669,Italian for Beginners (Italiensk for begyndere...,comedy drama romance,
5,1054,Die Hard 2 (1990),action adventure thriller,
6,2587,Buddy Boy (1999),drama thriller,
7,926,Henry V (1989),action drama romance war,shakespeare
8,1063,Young Guns II (1990),action western,
9,1971,My Science Project (1985),adventure sci-fi,
