In [77]:
import networkx as nx
import matplotlib.pyplot as plt
import csv
import numpy as np

In [6]:
class KnowledgeGraph():
    """
    Class to contain all the knowledge graph related code.
    """
    def similar_movies(self, language="Hindi", year="2020"):
        """
        Method to plot knowledge graph of 1000 Movies.
        """
        G = nx.MultiDiGraph()
        genres_read = []
        genres_colors = ['#5013ED', '#42853C', '#D4E907', '#2A257D', '#EF093B', '#8CA030', '#35B1DA', '#3F4F33', '#CAA341', '#B69BAE', '#E77FE2', '#9483F4', '#77DF5D', '#F3902F', '#E88182', '#713338', '#5CEFAB', '#863771', '#53EF26', '#FF80FF', '#6FF6FF']
        genres_color = {}
        color_map = []
        movies_genres = {}

        with open('C://Users//LG//PycharmProjects//pythonProject//Purdue//CNIT_483//The_Entertainment_Knowledge_Graph//Code//final_dataset_imdb.csv',encoding="utf8") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                if line_count == 0:  # Do not include the header from the data
                    line_count = 1
                    continue
                if row[3]!= year and row[8]!= language:
                    continue
                G.add_node(row[1])
                title = row[1]
                genres = list(row[5].split(", "))
                movies_genres[title]=genres
                for x in genres:
                    if x not in G:
                        G.add_node(x)
                        genres_read.append(x)
                        genres_color[x]=genres_colors[len(genres_color)]
                    G.add_edge(title, x)
                
                if line_count == 1000:
                    break
                line_count += 1

        edge_colors = [genres_color[e[1]] for e in G.edges]
        for node in G:
            if node in genres_read:
                color_map.append('blue')
            else: 
                hex_ = [genres_color[x] for x in movies_genres[str(node)]]
                avg = sum(list(map(lambda x: int(x[1:], 16), hex_)))//len(hex_)
                avg_color = f'#{avg:06x}'
                color_map.append(avg_color)

        plt.figure(figsize=(150,150))
        pos = nx.spring_layout(G,k=0.10,iterations=20)
        nx.draw(G, with_labels=True, node_color=color_map, edge_color=edge_colors, node_size = 4500, prog="dot", edge_cmap=plt.cm.Blues, font_size=16, pos=pos)
        plt.savefig("my_graph.pdf")
        print("\nPlease Check my_graph.pdf in the current code directory\n")
        
        
    def movie_details(self, title):
        
        """
        Method to plot detailed KG of a single movie.
        """
        G = nx.MultiDiGraph()
        color_map = []
        node_sizes = []
        colors = ['#5013ED', '#42853C', '#D4E907', '#2A257D', '#EF093B', '#8CA030', '#35B1DA', '#3F4F33', '#CAA341', '#B69BAE', '#E77FE2', '#9483F4', '#77DF5D', '#F3902F', '#E88182', '#713338', '#5CEFAB', '#863771', '#53EF26', '#FF80FF', '#6FF6FF']
        with open('C:\\Users\\LG\\PycharmProjects\\pythonProject\\Purdue\\CNIT_483\\The_Entertainment_Knowledge_Graph\\Code\\final_dataset_imdb.csv',encoding="utf8") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                if row[2]==title:
                    row[1]=row[2]
                    a = row[1]
                    G.add_node(row[1])
                    color_map.append('red')
                    node_sizes.append(20000)
                    G.add_node(row[4])
                    color_map.append(colors[1])
                    node_sizes.append(7000)
                    G.add_edge(row[1],row[4], movie='Released on')
                    G.add_node("Genres")
                    color_map.append(colors[2])
                    node_sizes.append(7000)
                    G.add_edge(row[1],"Genres", movie='Genres include')
                    for i in list(row[5].split(", ")):
                        G.add_node(i)
                        color_map.append(colors[4])
                        node_sizes.append(7000)
                        G.add_edge("Genres", i)
                    G.add_node(row[6])
                    color_map.append(colors[5])
                    node_sizes.append(7000)
                    G.add_edge(row[1],row[6], movie='Duration(Mins)')
                    G.add_node(row[7])
                    color_map.append(colors[6])
                    node_sizes.append(7000)
                    G.add_edge(row[1],row[7], movie='Country released in')
                    G.add_node("Languages")
                    color_map.append(colors[7])
                    node_sizes.append(7000)
                    G.add_edge(row[1],"Languages", movie='languages released in')
                    count=0
                    for i in list(row[8].split(", ")):
                        G.add_node(i)
                        color_map.append(colors[18])
                        node_sizes.append(5000)
                        G.add_edge("Languages", i)
                        if count>4:
                            break
                        count+=1
                    G.add_node(row[9])
                    color_map.append(colors[8])
                    node_sizes.append(7000)
                    G.add_edge(row[1],row[9], movie='Directed by')
                    G.add_node("Cast")
                    color_map.append(colors[9])
                    node_sizes.append(7000)
                    G.add_edge(row[1],"Cast", movie='cast includes')
                    count=0
                    for i in list(row[12].split(", ")):
                        G.add_node(i)
                        color_map.append(colors[10])
                        node_sizes.append(5000)
                        G.add_edge("Cast", i)
                        if count>4:
                            break
                        count+=1
                    description = row[13]
                    G.add_node(row[14])
                    color_map.append(colors[11])
                    node_sizes.append(7000)
                    G.add_edge(row[1],row[14], movie='Rating')
                    break

        plt.figure(figsize=(25,25))
        pos = nx.shell_layout(G)
        pos[a] = np.array([0, 0])
        nx.draw(G, with_labels=True, node_color=color_map, node_size = node_sizes, prog="dot", edge_cmap=plt.cm.Blues, font_size=20, pos=pos)
        edge_labels = nx.get_edge_attributes(G, 'movie')
        nx.draw_networkx_edge_labels(G, pos, labels=edge_labels, font_size=20)
        plt.savefig("movie_detail.pdf")
        print("Description of movie: ", description)
        print("\nPlease Check movie_detail.pdf in the current code directory\n")
    

In [9]:
movie_name = input("Enter Movie Name(Case Sensitive): ")


Enter Movie Name(Case Sensitive):  Cleopatra


In [52]:
def movie_details(title):    
    """
    Method to plot detailed KG of a single movie.
    """
    G = nx.MultiDiGraph()
    color_map = []
    node_sizes = []
    colors = ['#5013ED', '#42853C', '#D4E907', '#2A257D', '#EF093B', '#8CA030', '#35B1DA', '#3F4F33', '#CAA341', '#B69BAE', '#E77FE2', '#9483F4', '#77DF5D', '#F3902F', '#E88182', '#713338', '#5CEFAB', '#863771', '#53EF26', '#FF80FF', '#6FF6FF']
    with open('C:\\Users\\LG\\PycharmProjects\\pythonProject\\Purdue\\CNIT_483\\The_Entertainment_Knowledge_Graph\\Code\\final_dataset_imdb.csv', encoding="utf8") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            if row[2]==title:
                row[1]=row[2]
                a = row[1]
                G.add_node(row[1])
                color_map.append('red')
                node_sizes.append(20000)
                G.add_node(row[4])
                color_map.append(colors[1])
                node_sizes.append(7000)
                G.add_edge(row[1],row[4], movie='Released on')
                G.add_node("Genres")
                color_map.append(colors[2])
                node_sizes.append(7000)
                G.add_edge(row[1],"Genres", movie='Genres include')
                for i in list(row[5].split(", ")):
                    G.add_node(i)
                    color_map.append(colors[4])
                    node_sizes.append(7000)
                    G.add_edge("Genres", i)
                G.add_node(row[6])
                color_map.append(colors[5])
                node_sizes.append(7000)
                G.add_edge(row[1],row[6], movie='Duration(Mins)')
                G.add_node(row[7])
                color_map.append(colors[6])
                node_sizes.append(7000)
                G.add_edge(row[1],row[7], movie='Country released in')
                G.add_node("Languages")
                color_map.append(colors[7])
                node_sizes.append(7000)
                G.add_edge(row[1],"Languages", movie='languages released in')
                count=0
                for i in list(row[8].split(", ")):
                    G.add_node(i)
                    color_map.append(colors[18])
                    node_sizes.append(5000)
                    G.add_edge("Languages", i)
                    if count>4:
                        break
                    count+=1
                G.add_node(row[9])
                color_map.append(colors[8])
                node_sizes.append(7000)
                G.add_edge(row[1],row[9], movie='Directed by')
                G.add_node("Cast")
                color_map.append(colors[9])
                node_sizes.append(7000)
                G.add_edge(row[1],"Cast", movie='cast includes')
                count=0
                for i in list(row[12].split(", ")):
                    G.add_node(i)
                    color_map.append(colors[10])
                    node_sizes.append(5000)
                    G.add_edge("Cast", i)
                    if count>4:
                        break
                    count+=1
                description = row[13]
                G.add_node(row[14])
                color_map.append(colors[11])
                node_sizes.append(7000)
                G.add_edge(row[1],row[14], movie='Rating')
                break

    print(len(G.edges()))
    print(G.edges())

    print("")
    print(G.edges(data=True))
    print("")
    print(G.edges(data='relation'))
    
    print("g_nodes")
    print(G.nodes())
    
    print(G.info())

#         plt.figure(figsize=(25,25))
#         pos = nx.shell_layout(G)
#         pos[a] = np.array([0, 0])
#         nx.draw(G, with_labels=True, node_color=color_map, node_size = node_sizes, prog="dot", edge_cmap=plt.cm.Blues, font_size=20, pos=pos)
#         edge_labels = nx.get_edge_attributes(G, 'movie')
#         nx.draw_networkx_edge_labels(G, pos, labels=edge_labels, font_size=20)
#         plt.savefig("movie_detail.pdf")
#         print("Description of movie: ", description)
#         print("\nPlease Check movie_detail.pdf in the current code directory\n")

In [53]:
movie_details('Cleopatra')

17
[('Cleopatra', '1912-11-13'), ('Cleopatra', 'Genres'), ('Cleopatra', '100'), ('Cleopatra', 'USA'), ('Cleopatra', 'Languages'), ('Cleopatra', 'Charles L. Gaskill'), ('Cleopatra', 'Cast'), ('Cleopatra', '5.2'), ('Genres', 'Drama'), ('Genres', 'History'), ('Languages', 'English'), ('Cast', 'Helen Gardner'), ('Cast', 'Pearl Sindelar'), ('Cast', 'Miss Fielding'), ('Cast', 'Miss Robson'), ('Cast', 'Helene Costello'), ('Cast', 'Charles Sindelar')]

[('Cleopatra', '1912-11-13', {'movie': 'Released on'}), ('Cleopatra', 'Genres', {'movie': 'Genres include'}), ('Cleopatra', '100', {'movie': 'Duration(Mins)'}), ('Cleopatra', 'USA', {'movie': 'Country released in'}), ('Cleopatra', 'Languages', {'movie': 'languages released in'}), ('Cleopatra', 'Charles L. Gaskill', {'movie': 'Directed by'}), ('Cleopatra', 'Cast', {'movie': 'cast includes'}), ('Cleopatra', '5.2', {'movie': 'Rating'}), ('Genres', 'Drama', {}), ('Genres', 'History', {}), ('Languages', 'English', {}), ('Cast', 'Helen Gardner', {}), 

AttributeError: 'MultiDiGraph' object has no attribute 'info'

In [89]:
KG = KnowledgeGraph()
KG.movie_details(movie_name)

NameError: name 'KnowledgeGraph' is not defined

In [116]:
import inspect
import random
print(inspect.getfile(random))

C:\Users\LG\anaconda3\lib\random.py


In [1]:
import sys
print(sys.path)

['C:\\Users\\LG\\Desktop\\github\\PURDUE-2023-FALL\\CNIT_483\\GroupProject', 'C:\\Users\\LG\\anaconda3\\python38.zip', 'C:\\Users\\LG\\anaconda3\\DLLs', 'C:\\Users\\LG\\anaconda3\\lib', 'C:\\Users\\LG\\anaconda3', '', 'C:\\Users\\LG\\anaconda3\\lib\\site-packages', 'C:\\Users\\LG\\anaconda3\\lib\\site-packages\\locket-0.2.1-py3.8.egg', 'C:\\Users\\LG\\anaconda3\\lib\\site-packages\\win32', 'C:\\Users\\LG\\anaconda3\\lib\\site-packages\\win32\\lib', 'C:\\Users\\LG\\anaconda3\\lib\\site-packages\\Pythonwin', 'C:\\Users\\LG\\anaconda3\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\LG\\.ipython']


In [1]:
import networkx as nx
import pandas as pd
import os

import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GAT

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = datasets.MovieLens()
display(HTML(dataset.description))
G, edges_with_ratings = dataset.load()

In [3]:
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 2625, Edges: 100000

 Node types:
  movie: [1682]
    Features: float32 vector, length 19
    Edge types: movie-rating->user
  user: [943]
    Features: float32 vector, length 24
    Edge types: user-rating->movie

 Edge types:
    movie-rating->user: [100000]
        Weights: all 1 (default)
        Features: none


In [4]:
set(edges_with_ratings)

{'movie_id', 'rating', 'user_id'}

In [None]:
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=140, test_size=None, stratify=node_subjects
)
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=500, test_size=None, stratify=test_subjects
)

In [14]:
batch_size = 200
epochs = 20
# Use 70% of edges for training, the rest for testing:
train_size = 0.7
test_size = 0.3

In [15]:
edges_train, edges_test = model_selection.train_test_split(
    edges_with_ratings, train_size=train_size, test_size=test_size
)

edgelist_train = list(edges_train[["user_id", "movie_id"]].itertuples(index=False))
edgelist_test = list(edges_test[["user_id", "movie_id"]].itertuples(index=False))

labels_train = edges_train["rating"]
labels_test = edges_test["rating"]

In [16]:
num_samples = [8, 4]

In [None]:
from collections import Counter

Counter(train_subjects)

In [11]:
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)

In [12]:
generator = FullBatchNodeGenerator(G, method="gat")

In [13]:
train_gen = generator.flow(train_subjects.index, train_targets)

In [14]:
gat = GAT(
    layer_sizes=[8, train_targets.shape[1]],
    activations=["elu", "softmax"],
    attn_heads=8,
    generator=generator,
    in_dropout=0.5,
    attn_dropout=0.5,
    normalize=None,
)

In [15]:
x_inp, predictions = gat.in_out_tensors()

In [16]:
model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

  super(Adam, self).__init__(name, **kwargs)


In [17]:
val_gen = generator.flow(val_subjects.index, val_targets)

In [18]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

if not os.path.isdir("logs"):
    os.makedirs("logs")
es_callback = EarlyStopping(
    monitor="val_acc", patience=20
)  # patience is the number of epochs to wait before early stopping in case of no further improvement
mc_callback = ModelCheckpoint(
    "logs/best_model.h5", monitor="val_acc", save_best_only=True, save_weights_only=True
)

In [20]:
history = model.fit(
    train_gen,
    epochs=50,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
    callbacks=[es_callback, mc_callback],
)

Epoch 1/50
1/1 - 6s - loss: 1.9300 - acc: 0.1714 - val_loss: 1.8390 - val_acc: 0.3760 - 6s/epoch - 6s/step
Epoch 2/50
1/1 - 0s - loss: 1.8351 - acc: 0.3571 - val_loss: 1.7337 - val_acc: 0.3900 - 424ms/epoch - 424ms/step
Epoch 3/50
1/1 - 0s - loss: 1.7236 - acc: 0.4429 - val_loss: 1.6442 - val_acc: 0.3940 - 286ms/epoch - 286ms/step
Epoch 4/50
1/1 - 0s - loss: 1.6200 - acc: 0.4071 - val_loss: 1.5662 - val_acc: 0.4300 - 369ms/epoch - 369ms/step
Epoch 5/50
1/1 - 0s - loss: 1.5342 - acc: 0.4857 - val_loss: 1.4960 - val_acc: 0.4700 - 499ms/epoch - 499ms/step
Epoch 6/50
1/1 - 0s - loss: 1.4674 - acc: 0.4786 - val_loss: 1.4312 - val_acc: 0.5100 - 353ms/epoch - 353ms/step
Epoch 7/50
1/1 - 0s - loss: 1.3657 - acc: 0.5643 - val_loss: 1.3694 - val_acc: 0.5640 - 299ms/epoch - 299ms/step
Epoch 8/50
1/1 - 0s - loss: 1.3659 - acc: 0.5286 - val_loss: 1.3122 - val_acc: 0.6220 - 283ms/epoch - 283ms/step
Epoch 9/50
1/1 - 0s - loss: 1.1523 - acc: 0.6714 - val_loss: 1.2584 - val_acc: 0.6660 - 317ms/epoch - 