# LELA32051 Computational Linguistics Week 12

This week we are going to complete our journey through the NLP pipeline by looking at computational semantics, and in particular the creation and use of knowledge graphs.

In [None]:
!wget https://raw.githubusercontent.com/cbannard/compling23/main/CL_Week_12_Materials/ie_tools.py
import ie_tools as ie
import nltk
nltk.download('punkt')
import spacy
import pandas as pd
from spacy import displacy
from spacy.matcher import Matcher
import networkx as nx
import matplotlib.pyplot as plt

nlp = spacy.load("en_core_web_sm")



spaCy (https://spacy.io/) is a Python natural language toolkit, much like NLTK.

It has a single function that applies all steps of the NLP pipeline (up to parsing) to input text.


In [None]:
# Run the loaded spaCy pipeline on a single sentence; the result is iterated
# token by token and rendered as a dependency parse in the following cells.
sent = nlp("John ate the cake")


In [None]:
# One output line per token: text, lemma, the two part-of-speech attributes
# (pos_ and tag_), dependency label, shape, and the is_alpha / is_stop flags.
for token in sent:
    fields = (
        token.text, token.lemma_, token.pos_, token.tag_,
        token.dep_, token.shape_, token.is_alpha, token.is_stop,
    )
    print(*fields)

We can visualise the dependency parse like this

In [None]:
# Draw the dependency parse of `sent` inline in the notebook (jupyter=True).
displacy.render(sent, style='dep', jupyter=True, options={'distance': 90})

We can use the dependency parse to generate a knowledge graph as follows

In [None]:
# Build a one-row knowledge graph from the parsed sentence `sent`: the ie
# helpers supply an entity pair and a relation, which are tabulated as
# subject / object / predicate columns.
entity_pairs = [ie.get_entities(sent, nlp)]
relations = [ie.get_relation(sent, nlp)]

# First element of each pair is used as the subject ...
subject = [pair[0] for pair in entity_pairs]

# ... and the second as the object. The list is called `objects` (not
# `object`) so that the builtin `object` is not shadowed for the rest of the
# notebook session.
objects = [pair[1] for pair in entity_pairs]

df = pd.DataFrame({'subject': subject, 'object': objects, 'predicate': relations})
df

This gets more interesting when we start looking at multiple sentences.

In [None]:
# Three toy facts that all share the same object ("the cake").
sentences = [
    "John ate the cake",
    "Sam bought the cake",
    "Robert made the cake",
]

In [None]:
# Build the toy "world": one subject / object / predicate row per sentence.
entity_pairs = []
relations = []

# The loop variable is `sentence` rather than `sent`, so the parsed Doc held
# in `sent` by the earlier cells is not overwritten with a plain string.
# NOTE(review): raw strings are passed to the ie helpers here, whereas the
# single-sentence cell passed a parsed Doc - presumably both are accepted;
# confirm against ie_tools.
for sentence in sentences:
    entity_pairs.append(ie.get_entities(sentence, nlp))
    relations.append(ie.get_relation(sentence, nlp))

# First element of each pair is used as the subject ...
subject = [pair[0] for pair in entity_pairs]

# ... and the second as the object. Named `objects` (not `object`) so the
# builtin `object` is not shadowed for the rest of the notebook session.
objects = [pair[1] for pair in entity_pairs]

world = pd.DataFrame({'subject': subject, 'object': objects, 'predicate': relations})
world

This simple knowledge graph can be considered to be a model of the world of which we can ask questions. For example, the following simple function checks whether a fact is true in this world.

In [None]:
def checktrue(world, decl):
    """Return whether the fact described by *decl* holds in *world*.

    Args:
        world: The knowledge-graph table to search.
        decl: The fact to look up, as produced by ``ie.get_kg``.

    Returns:
        True when ``ie.row_contains`` flags at least one row of *world*,
        False otherwise.
    """
    # The parameter must be named `decl`: the body reads `decl`, and with any
    # other parameter name Python would fall back to a global of that name
    # and ignore the argument that was actually passed in.
    return ie.row_contains(world, decl).astype(int).sum() > 0

In [None]:
# Convert a declarative sentence with ie.get_kg and test it against `world`.
# This sentence is one of the three that `world` was built from.
decl=ie.get_kg("Sam bought the cake",nlp)
checktrue(world,decl)

In [None]:
# Same check for a sentence that is NOT among those `world` was built from
# (there it was John, not Sam, who ate the cake).
decl=ie.get_kg("Sam ate the cake",nlp)
checktrue(world,decl)

The following simple function returns an answer to a simple Who question.

In [None]:
def who_agent_Q(world,Q):
    """Answer a "Who ...?" question by looking up the matching subject.

    The question *Q* is converted with ``ie.get_kg``; the predicate and
    object of its first row are then matched against *world*.

    Args:
        world: Table with 'subject', 'object' and 'predicate' columns.
        Q: The question text.

    Returns:
        The 'subject' value of the first row of *world* whose object and
        predicate both equal those of the question.

    Raises:
        IndexError: If no row of *world* matches (indexing an empty result).
    """
    question_kg = ie.get_kg(Q, nlp)
    wanted_pred = question_kg['predicate'].values[0]
    wanted_obj = question_kg['object'].values[0]

    same_obj = world['object'] == wanted_obj
    same_pred = world['predicate'] == wanted_pred
    matching_subjects = world.loc[same_obj & same_pred, 'subject']
    return matching_subjects.values[0]


In [None]:
# Ask the toy world who bought the cake; the answer is the subject of the
# first row of `world` matching the question's predicate and object.
who_agent_Q(world,"Who bought the cake")

This process of knowledge extraction becomes more interesting when we apply it to longer texts. For example, the by now very familiar first chapter of Crime and Punishment.

In [None]:
# Download the Project Gutenberg text file 2554-0.txt into the working
# directory; the next cell opens it by that name.
!wget https://www.gutenberg.org/files/2554/2554-0.txt

In [None]:
# Read the downloaded text. The context manager closes the file handle when
# the read is done, and the encoding is pinned so that the character offsets
# below do not depend on the platform's default encoding.
# NOTE(review): assumes the Gutenberg "-0.txt" download is UTF-8 - confirm.
with open('2554-0.txt', encoding='utf-8') as f:
    raw = f.read()

# Fixed character offsets into this particular download, presumably the
# bounds of chapter one; newlines are flattened to spaces so hard line
# wraps do not end up inside sentences.
chapter_one = raw[5464:23725].replace("\n", " ")

# Split the chapter into a list of sentence strings.
C_and_P_sentences = list(nltk.sent_tokenize(chapter_one))

In [None]:
# Parse the first sentence of the chapter with the spaCy pipeline.
sent=nlp(C_and_P_sentences[0])

In [None]:
# One output line per token: text, lemma, the two part-of-speech attributes
# (pos_ and tag_), dependency label, shape, and the is_alpha / is_stop flags.
for token in sent:
    fields = (
        token.text, token.lemma_, token.pos_, token.tag_,
        token.dep_, token.shape_, token.is_alpha, token.is_stop,
    )
    print(*fields)

In [None]:
# Draw the dependency parse of `sent` inline in the notebook (jupyter=True).
displacy.render(sent, style='dep', jupyter=True, options={'distance': 90})

Proper knowledge graph extraction from this would require more powerful functions than our "get_entities" and "get_relation", but let's give it a go.

In [None]:
# Build a knowledge graph from every sentence of the chapter.
entity_pairs = []
relations = []

for s in C_and_P_sentences:
    entity_pairs.append(ie.get_entities(s, nlp))
    relations.append(ie.get_relation(s, nlp))

# Some sentences yield no entity pair (None). Keep only the positions that
# produced one, and filter `relations` by the same positions so the two
# lists stay aligned row for row. `is not None` is the identity test for the
# None singleton and cannot be affected by a custom `__ne__`.
indices = [i for i, pair in enumerate(entity_pairs) if pair is not None]
entity_pairs = [entity_pairs[i] for i in indices]
relations = [relations[i] for i in indices]

# First element of each pair is used as the subject ...
subject = [pair[0] for pair in entity_pairs]

# ... and the second as the object. Named `objects` (not `object`) so the
# builtin `object` is not shadowed for the rest of the notebook session.
objects = [pair[1] for pair in entity_pairs]

world = pd.DataFrame({'subject': subject, 'object': objects, 'predicate': relations})

In [None]:
# Show the first 20 extracted rows.
world.head(20)

In [None]:
# Restrict the graph to rows whose subject is "He"; each remaining row
# becomes a directed edge from its predicate to its object, with the other
# columns attached as edge attributes (edge_attr=True).
he_rows = world[world['subject'] == "He"]
G = nx.from_pandas_edgelist(
    he_rows,
    source="predicate",
    target="object",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, k=0.5)  # k regulates the distance between nodes
nx.draw(
    G,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=1500,
    edge_cmap=plt.cm.Blues,
)
plt.show()