In [12]:
import json
import pandas as pd
import numpy as np
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error

import stellargraph as sg
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_regression
from tensorflow.keras import Model, optimizers, losses, metrics

import multiprocessing
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
dataset = datasets.MovieLens()
display(HTML(dataset.description))
G, edges_with_ratings = dataset.load()

In [7]:
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 2625, Edges: 100000

 Node types:
  movie: [1682]
    Features: float32 vector, length 19
    Edge types: movie-rating->user
  user: [943]
    Features: float32 vector, length 24
    Edge types: user-rating->movie

 Edge types:
    movie-rating->user: [100000]
        Weights: all 1 (default)
        Features: none


In [4]:
set(edges_with_ratings)

{'movie_id', 'rating', 'user_id'}

In [21]:
batch_size = 200
epochs = 20
# Use 70% of edges for training, the rest for testing:
train_size = 0.7
test_size = 0.3

In [9]:
edges_train, edges_test = model_selection.train_test_split(
    edges_with_ratings, train_size=train_size, test_size=test_size
)

edgelist_train = list(edges_train[["user_id", "movie_id"]].itertuples(index=False))
edgelist_test = list(edges_test[["user_id", "movie_id"]].itertuples(index=False))

labels_train = edges_train["rating"]
labels_test = edges_test["rating"]

In [10]:
num_samples = [8, 4]

In [13]:
generator = HinSAGELinkGenerator(
    G, batch_size, num_samples, head_node_types=["user", "movie"]
)
train_gen = generator.flow(edgelist_train, labels_train, shuffle=True)
test_gen = generator.flow(edgelist_test, labels_test)

In [14]:
generator.schema.type_adjacency_list(generator.head_node_types, len(num_samples))

[('user', [2]),
 ('movie', [3]),
 ('movie', [4]),
 ('user', [5]),
 ('user', []),
 ('movie', [])]

In [15]:
generator.schema.schema

{'user': [EdgeType(n1='user', rel='rating', n2='movie')],
 'movie': [EdgeType(n1='movie', rel='rating', n2='user')]}

In [17]:
hinsage_layer_sizes = [32, 32]
assert len(hinsage_layer_sizes) == len(num_samples)

hinsage = HinSAGE(
    layer_sizes=hinsage_layer_sizes, generator=generator, bias=True, dropout=0.0
)

In [18]:
# Expose input and output sockets of hinsage:
x_inp, x_out = hinsage.in_out_tensors()

In [19]:
# Final estimator layer
score_prediction = link_regression(edge_embedding_method="concat")(x_out)

link_regression: using 'concat' method to combine node embeddings into edge embeddings


In [20]:
import tensorflow.keras.backend as K


def root_mean_square_error(s_true, s_pred):
    return K.sqrt(K.mean(K.pow(s_true - s_pred, 2)))


model = Model(inputs=x_inp, outputs=score_prediction)
model.compile(
    optimizer=optimizers.Adam(lr=1e-2),
    loss=losses.mean_squared_error,
    metrics=[root_mean_square_error, metrics.mae],
)

  super(Adam, self).__init__(name, **kwargs)


In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 8, 19)]      0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 32, 24)]     0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 32, 19)]     0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 1, 24)]      0           []                               
                                                                                              

In [23]:
# Specify the number of workers to use for model training
num_workers = 4

In [24]:
test_metrics = model.evaluate(
    test_gen, verbose=1, use_multiprocessing=False, workers=num_workers
)

print("Untrained model's Test Evaluation:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))

Untrained model's Test Evaluation:
	loss: 17.0501
	root_mean_square_error: 4.1286
	mean_absolute_error: 3.9687


In [25]:
history = model.fit(
    train_gen,
    validation_data=test_gen,
    epochs=10,
    verbose=1,
    shuffle=False,
    use_multiprocessing=False,
    workers=num_workers,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 32/350 [=>............................] - ETA: 36s - loss: 1.0953 - root_mean_square_error: 1.0455 - mean_absolute_error: 0.8432

KeyboardInterrupt: 

In [39]:
tags = pd.read_csv('D:/DOWNLOAD/ml-25m/tags.csv')
movies = pd.read_csv('D:/DOWNLOAD/ml-25m/movies.csv')

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [35]:
tags = tags[:5]
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tags['tag'] = tags['tag'].fillna('')
print(tags)
tag_matrix = tfidf_vectorizer.fit_transform(tags['tag'])
print("")
print(tag_matrix)
tag_similarity = linear_kernel(tag_matrix, tag_matrix)
print(tag_similarity)

   userId  movieId               tag   timestamp
0       3      260           classic  1439472355
1       3      260            sci-fi  1439472256
2       4     1732       dark comedy  1573943598
3       4     1732    great dialogue  1573943604
4       4     7569  so bad it's good  1573943455

  (0, 1)	1.0
  (1, 5)	0.7071067811865475
  (1, 8)	0.7071067811865475
  (2, 2)	0.7071067811865475
  (2, 3)	0.7071067811865475
  (3, 4)	0.7071067811865475
  (3, 7)	0.7071067811865475
  (4, 6)	0.7071067811865475
  (4, 0)	0.7071067811865475
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [46]:
movie = movies[:5]
# print(movie)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
movie['title'] = movie['title'].fillna('')
print(movie)
tag_matrix = tfidf_vectorizer.fit_transform(movie['title'])
print("")
print(tag_matrix)
tag_similarity = linear_kernel(tag_matrix, tag_matrix)
print('tag_similarity', tag_similarity)

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

  (0, 0)	0.3193023297639811
  (0, 9)	0.6700917930430479
  (0, 10)	0.6700917930430479
  (1, 6)	0.9027501480103624
  (1, 0)	0.430165282498796
  (2, 7)	0.5566685141652766
  (2, 8)	0.5566685141652766
  (2, 4)	0.5566685141652766
  (2, 0)	0.26525552965220073
  (3, 2)	0.6700917930430479
  (3, 11)	0.6700917930430479
  (3, 0)	0.3193023297639811
  (4, 5)	0.5566685141652766
  (4, 1)	0.5566685141652766
  (4, 3)	0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['title'] = movie['title'].fillna('')
