In [0]:
# Mount Google Drive into the Colab runtime at /content/gdrive. The DATA_PATH
# variable defined in a later cell must point at a folder below this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
DATA_PATH = '/content/gdrive/My Drive/Capstone Project/'
!ls '/content/gdrive/My Drive/Capstone Project/'

 buildgraph.ipynb		      'Project Report.gdoc'
 Capstone_Video.mp4		       questions.txt
 KnowledgeGraph_presentation.gslides   README.txt.gdoc
 KnowledgeGraph_presentation.pdf      'Team Member Contributions.gdoc'
 movie_metadata.csv


In [0]:
import csv
import networkx as nx
import matplotlib as plt
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
from nltk.stem.snowball import SnowballStemmer
from nltk.tree import Tree
import re
import json
import urllib


In [0]:
# Fetch the NLTK resources used by this notebook. Packages that are already
# present are reported as up to date rather than downloaded again.
# NOTE(review): presumably 'punkt' backs word_tokenize, the perceptron tagger
# backs pos_tag, and 'maxent_ne_chunker' + 'words' back ne_chunk -- confirm
# against the NLTK data index.
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [0]:
# Graph that holds the whole knowledge base. The loading cell adds one node per
# title, director, actor, genre and language, and links each of them to its
# title with an edge carrying a "relation" attribute.
G=nx.Graph() #Creating the graph

In [0]:
row_index=0 #1-based index of the CSV row being processed; row 1 is the header.
counter=0 #used to assign a unique node ID to each node.

#Each of the dictionaries below maps a cleaned name to the ID of the node created for it.
#They maintain state across rows: every actor (director, genre, ...) gets at most one node,
#so when the same name occurs again we must reuse the existing node instead of creating another.
actors={}
directors={}
genres={}
titles={}
languages={}


def _get_or_create_node(registry,kind,name):
  """Return the node ID registered for `name`, creating the node on first sight.

  registry -- one of the name->node ID dictionaries above.
  kind     -- attribute under which the name is stored on the node ('title', 'actor', ...).
  name     -- cleaned cell value; an empty value yields None and creates nothing, so
              rows with a missing field do not all get linked to one shared blank node.
  """
  global counter
  if not name:
    return None
  if name not in registry:
    counter+=1
    G.add_node(counter,**{kind:name})
    registry[name]=counter
  return registry[name]


with open(DATA_PATH+'movie_metadata.csv') as csv_file:
  csv_reader = csv.reader(csv_file, delimiter=',')
  for row_index,row in enumerate(csv_reader,1): #iterates through each row of the dataset
    #Skip the header, and skip blank/truncated rows that would otherwise raise IndexError.
    if row_index==1 or len(row)<7:
      continue

    #Columns used, by position: director, actor, genres, actor, title, actor, language.
    director,actor_1,genre_field,actor_2,title,actor_3,language=[re.sub("[:!',.&]","",s) for s in row[:7]]

    #Nodes are created in a fixed order (title, director, first actor, genres,
    #remaining actors, language) so node IDs are assigned deterministically.
    title_id=_get_or_create_node(titles,'title',title)
    if title_id is None: #nothing to attach the other entities to
      continue
    director_id=_get_or_create_node(directors,'director',director)
    actor_1_id=_get_or_create_node(actors,'actor',actor_1)
    #genres are listed as so- Action|Adventure|Thriller; we split them up into a list
    genre_ids=[_get_or_create_node(genres,'genre',genre) for genre in genre_field.split("|")]
    actor_2_id=_get_or_create_node(actors,'actor',actor_2)
    actor_3_id=_get_or_create_node(actors,'actor',actor_3)
    language_id=_get_or_create_node(languages,'language',language)

    #Attributed edges between the movie and each of its entities.
    links=[(director_id,"directs"),(actor_1_id,"acts"),(actor_2_id,"acts"),(actor_3_id,"acts")]
    links+=[(genre_id,"genre") for genre_id in genre_ids]
    links.append((language_id,"language"))
    for node_id,relation_name in links:
      if node_id is not None:
        G.add_edge(node_id,title_id,relation=relation_name)
    


In [0]:
#The below code draws the graph. Since we have nodes in the scale of thousands, this takes a lot of time and hence we do not run it.
#pos = nx.spring_layout(G,scale=4)
#nx.draw(G, with_labels=True, node_size=200,font_size=8)

In [0]:
#https://stackoverflow.com/questions/48660547/how-can-i-extract-gpelocation-using-nltk-ne-chunk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import Tree

def get_continuous_chunks(text, label):
    """Return the distinct named entities of type `label` found in `text`.

    The text is tokenised, POS-tagged and chunked with ne_chunk. Adjacent
    chunks carrying `label` (e.g. 'PERSON') are merged into one
    space-separated entity string.

    Args:
        text: the sentence to analyse.
        label: the chunk label to collect, e.g. 'PERSON'.

    Returns:
        A list of entity strings in order of first appearance, without
        duplicates.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    # The trailing None acts as a sentinel so that an entity which ends the
    # text is flushed as well (it used to be dropped silently).
    for subtree in list(chunked) + [None]:
        if isinstance(subtree, Tree) and subtree.label() == label:
            current_chunk.append(" ".join(token for token, pos in subtree.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always start afresh, even when the entity was a repeat; otherwise
            # the repeat would be glued onto the next entity.
            current_chunk = []

    return continuous_chunk

In [0]:
#We parse questions from our question dataset for entities and relations. We then query the
#graph generated above with the parsed entity and relation, and compare the result to the
#one obtained by querying the Google Knowledge Graph API.
#Shared English stemmer: create_entity_tuple uses it to reduce each question word to its
#stem before looking it up in the relation keyword sets.
stemmer = SnowballStemmer("english")

def fn_preprocess(art):
    """Tokenise the sentence `art` and return its tokens tagged with parts of speech."""
    tokens = nltk.word_tokenize(art)  # split the sentence into tokens
    return nltk.pos_tag(tokens)  # pair every token with its part-of-speech tag
  

#Take the token that is a verb(acts,directs,stars, etc) and convert it to its stemmed form. This will be the relation part of the query to the 
#Entity relation graph.

def create_entity_tuple(query):
  """Parse a question into the arguments of get_entity_relation_from_graph.

  The question is scanned for a relation keyword (its stem is looked up in the
  keyword sets below). The tokens that follow the keyword, starting at the first
  token whose POS tag is in `nltk_names` and excluding the final token of the
  question (normally the '?'), form the entity name. When the relation is
  'actor' or 'director' and the question names one or two PERSON entities, the
  first person becomes the entity instead and the titles linked to them are
  requested.

  Args:
    query: the question text, e.g. "Who directed The Avengers?".

  Returns:
    A tuple (function_map, entity_name, relation): the node attribute to match
    on, the lower-cased value to match, and the attribute to read from the
    neighbouring nodes. If no relation keyword is found the result is
    (None, '', None) instead of an UnboundLocalError.
  """
  query_processed = fn_preprocess(query)
  NE_list=get_continuous_chunks(query,'PERSON')
  iob_tagged = tree2conlltags(ne_chunk(query_processed)) #IOB tagging the words

  #Below are the words and synonyms (stemmed forms included) that we are concerned with
  #with respect to the relations in our graph such as acts, directs.
  function_mapper={'director':'title','actor':'title','genre':'title', 'language':'title'}
  relation_stems=(
    ('actor',{'star','act','play','actor','lead','appear'}),
    ('director',{'direct','director'}),
    ('genre',{'genre','type','genr','typ'}),
    ('language',{'language','languag'}),
  )
  #POS tags that may open the entity phrase. 'NNPS' (plural proper noun) was missing because
  #of the 'NPS' typo; 'NPS' is kept so nothing that matched before stops matching.
  nltk_names={'PRP','PRP$','VB','VBD','VBG','VBN','VBP','NN','NNS','NNP','NNPS','NPS','DT'}

  relation=None
  relation_token_seen=False #a relation keyword was seen and the entity phrase has not started yet
  next_token_seen=False     #the entity phrase has started; every further token is collected
  rest_of_sentence=[]

  for x in iob_tagged:
    token,tag=x[0],x[1]
    #We get the stemmed version of the word making it compatible with the keyword sets
    #eg: acted,acting->act
    stemmed_x=stemmer.stem(token)
    matched=next((name for name,stems in relation_stems if stemmed_x in stems),None)

    if matched is not None:
      #NOTE(review): a keyword occurring later in the question (e.g. inside a title)
      #overwrites the relation found earlier -- existing behaviour, kept as is.
      relation=matched
      relation_token_seen=True
    elif relation_token_seen and tag in nltk_names:
      #First token of the entity phrase. It is appended here and again just below;
      #the [1:-1] slice after the loop drops this duplicate.
      rest_of_sentence.append(token)
      next_token_seen=True
      relation_token_seen=False

    if next_token_seen:
      rest_of_sentence.append(token)

  #Drop the duplicated first token and the final token of the question.
  entity_name=" ".join(rest_of_sentence[1:-1])

  if relation is None:
    #No relation keyword at all: report "nothing to look up" rather than crashing.
    return(None,entity_name.lower(),None)

  if relation in ('actor','director') and NE_list and len(NE_list)<=2:
    #The question is about a person: match the person node and ask for their titles.
    entity_name=NE_list[0]
    function_map=relation
    relation='title'
  else:
    function_map=function_mapper[relation]

  return(function_map,entity_name.lower(),relation)

In [0]:
def get_entity_relation_from_graph(entity,entity_name,relation):
  """Answer a parsed question by walking one hop through the graph G.

  For a question such as "Who directed The Avengers?" the expected arguments are
  entity='title', entity_name='the avengers', relation='director'.

  Every node whose `entity` attribute equals `entity_name` is located; for each
  of its neighbours that carries a `relation` attribute, that attribute's value
  is collected.

  Returns the list of collected values (empty when nothing matches).
  """
  #IDs of all nodes whose `entity` attribute holds the requested name.
  matching_ids=[node_id for node_id,value in list(G.nodes(data=entity)) if value==entity_name]

  found=[]
  for node_id in matching_ids:
    #Follow every edge of the matched node and keep the neighbours of the wanted kind.
    for _,neighbour in G.edges(node_id):
      attributes=G.nodes[neighbour]
      if relation in attributes:
        found.append(attributes[relation])

  return found

In [0]:
#Worked example: the snippet below shows the entity tuple created for a question of your
#choice and the answer that tuple retrieves from the graph. We make certain assumptions.
#1)We handle only binary questions: questions such as "Who directed I am Legend" rather than "Who directs and acts in I am legend"
#2)We do not handle questions in passive voice: questions such as "I am legend was directed by whom?"
#3)We do not handle movies that are named after an entity, such as "Who directed Robin Hood?"
#4)We do not handle very vague terms from which a relation would have to be inferred, such as "Who is IN I am legend?"

entity_tuple=create_entity_tuple("Who directed The Avengers?")
print(entity_tuple)
#The tuple is (attribute to match, value to match, attribute to return), in argument order.
answer_list=get_entity_relation_from_graph(*entity_tuple)
print(answer_list)

('title', 'the avengers', 'director')
['joss whedon']


In [0]:
#We open the dataset and read line by line. We send this information to the function and collect the outputs. This can then be compared
#to the equivalent results from Google's Knowledge Graph API. We calculate accuracy in terms of the number of questions we get a non-empty response
#from the function.

import io

#io.open decodes the file as UTF-8 on both Python 2 and Python 3, so the lines arrive as
#text and need no per-line .decode() call (which does not exist on Python 3 strings).
with io.open(DATA_PATH+'questions.txt',encoding='utf-8') as question_file:
  questions=question_file.readlines()

query_counter=0 #questions asked
right_counter=0 #questions for which the graph returned a non-empty answer
for question in questions:
  if not question.strip(): #a blank line is not a question
    continue
  query_counter+=1
  #Same punctuation clean-up as applied to the dataset when the graph was built.
  question=re.sub("[:!',.&]","",question)
  entity_tuple=create_entity_tuple(question)
  answer_list=get_entity_relation_from_graph(entity_tuple[0],entity_tuple[1],entity_tuple[2])
  if(answer_list):
      right_counter+=1

#Guard against an empty question file instead of dividing by zero.
accuracy=((1.0*right_counter)/(1.0*query_counter))*100 if query_counter else 0.0
print("Correct/Overall: "+str(right_counter)+"/"+str(query_counter))
print("Accuracy: "+str(accuracy))
  

  # Remove the CWD from sys.path while we load stuff.


Correct/Overall: 158/262
Accuracy: 60.3053435115


In [0]:
#The Google Knowledge Graph API key is read from the KG_API_KEY environment variable so the
#secret never has to be saved inside the notebook. It falls back to an empty string, which
#is the value this cell previously hard-coded.
import os
API_KEY = os.environ.get('KG_API_KEY', '')

In [0]:
    # urlencode/urlopen live in different modules on Python 2 and Python 3;
    # import them from whichever location exists.
    try:
        from urllib.parse import urlencode
        from urllib.request import urlopen
    except ImportError:  # Python 2
        from urllib import urlencode, urlopen
    from contextlib import closing

    api_key = API_KEY
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'

    # Look up one example query in the Google Knowledge Graph Search API.
    query = 'i am legend'
    params = {
        'query': query,
        'limit': 4,
        'indent': True,
        'key': api_key,
    }
    url = service_url + '?' + urlencode(params)
    # closing() releases the HTTP connection once the body has been read.
    with closing(urlopen(url)) as http_response:
        response = json.loads(http_response.read().decode('utf-8'))
    mov_details=[]
    names=[]
    # Tolerate a response without 'itemListElement' (or an element without
    # 'result') rather than failing with a KeyError.
    for element in response.get('itemListElement', []):
      result = element.get('result', {})
      if 'detailedDescription' in result and 'name' in result:
          print (result['name'],"--", result['detailedDescription'])



(u'I Am Legend', '--', {u'url': u'https://en.wikipedia.org/wiki/I_Am_Legend_(film)', u'articleBody': u'I Am Legend is a 2007 American post-apocalyptic science fiction horror film based on the novel of the same name, directed by Francis Lawrence and starring Will Smith, who plays US Army virologist Robert Neville. ', u'license': u'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'})
(u'I Am Legend', '--', {u'url': u'https://en.wikipedia.org/wiki/I_Am_Legend_(novel)', u'articleBody': u'I Am Legend is a 1954 science fiction horror novel by American writer Richard Matheson. It was influential in the development of the zombie-vampire genre and in popularizing the concept of a worldwide apocalypse due to disease. ', u'license': u'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'})
(u'I Am Legend', '--', {u'url': u'https://en.wikipedia.org/wiki/I_Am_Legend_(TV_series)', u'art