# LELA32051 Computational Linguistics Week 12

This week we are going to look at computational semantics, and in particular at the creation and use of knowledge graphs.

In [3]:
!wget https://raw.githubusercontent.com/cbannard/compling23/main/CL_Week_12_Materials/ie_tools.py
import ie_tools as ie
import nltk
nltk.download('punkt')
import spacy
import pandas as pd
from spacy import displacy
from spacy.matcher import Matcher
import networkx as nx
import matplotlib.pyplot as plt

# Load spaCy's small English pipeline once; calling `nlp(text)` returns the processed document.
nlp = spacy.load("en_core_web_sm")



--2023-12-14 20:44:15--  https://raw.githubusercontent.com/cbannard/compling23/main/CL_Week_12_Materials/ie_tools.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2982 (2.9K) [text/plain]
Saving to: ‘ie_tools.py’


2023-12-14 20:44:15 (28.9 MB/s) - ‘ie_tools.py’ saved [2982/2982]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


spaCy (https://spacy.io/) is a Python natural language toolkit, much like NLTK.

It has a single function that applies all steps of the NLP pipeline (up to parsing) to input text.


In [None]:
# Run the whole spaCy pipeline on one example sentence; the following cells inspect the result.
sent = nlp("John ate the cake")


In [None]:
# One output line per token: text, lemma, coarse and fine-grained tag, dependency
# label, shape, and the is_alpha / is_stop flags, in that order.
for token in sent:
    print(*(getattr(token, attr) for attr in (
        "text", "lemma_", "pos_", "tag_", "dep_",
        "shape_", "is_alpha", "is_stop",
    )))

We can visualise the dependency parse like this

In [None]:
# Draw the dependency arcs inline in the notebook; 'distance' controls the spacing between words.
displacy.render(sent, style='dep', jupyter=True, options={'distance': 90})

We can use the dependency parse to generate a knowledge graph as follows

In [None]:
# Extract one (subject, object) pair and one relation from the parsed sentence
# and tabulate them as a single-row knowledge graph.
entity_pairs = []
relations = []

entity_pairs.append(ie.get_entities(sent, nlp))
relations.append(ie.get_relation(sent, nlp))

# the first element of each pair fills the 'subject' column
subject = [pair[0] for pair in entity_pairs]

# the second element fills the 'object' column; the list is called `object_`
# so that Python's builtin `object` is not shadowed for the rest of the session
object_ = [pair[1] for pair in entity_pairs]

df = pd.DataFrame({'subject': subject, 'object': object_, 'predicate': relations})
df

This gets more interesting when we start looking at multiple sentences.

In [4]:
# Three statements about the same object ("the cake"), each with a different subject and verb.
sentences = ["John ate the cake","Sam bought the cake", "Robert made the cake"]

In [5]:
# Build the knowledge graph `world`: one (subject, object, predicate) row per sentence.
entity_pairs = []
relations = []

# The loop variable is `sentence`, not `sent`, so the spaCy document stored in
# `sent` by the earlier cell is not overwritten with a plain string.
for sentence in sentences:
  entity_pairs.append(ie.get_entities(sentence, nlp))
  relations.append(ie.get_relation(sentence, nlp))

# the first element of each pair fills the 'subject' column
subject = [pair[0] for pair in entity_pairs]

# the second element fills the 'object' column; the list is called `object_`
# so that Python's builtin `object` is not shadowed for the rest of the session
object_ = [pair[1] for pair in entity_pairs]

world = pd.DataFrame({'subject': subject, 'object': object_, 'predicate': relations})
world

Unnamed: 0,subject,object,predicate
0,John,cake,ate
1,Sam,cake,bought
2,Robert,cake,made


This simple knowledge graph can be considered to be a model of the world of which we can ask questions. For example, the following simple function checks whether a fact is true in this world.

In [6]:
def checktrue(world,delc):
  """Return whether the declarative statement `delc` holds in `world`.

  The result is True when `ie.row_contains(world, delc)` flags at least one
  entry (presumably one flag per row of `world` -- confirm in ie_tools).

  world: the knowledge graph to query.
  delc:  the statement to look up, e.g. the value returned by `ie.get_kg`.
  """
  # Query with the argument itself. Reading the notebook-global `decl` here
  # would ignore whatever the caller passed in.
  return ie.row_contains(world,delc).astype(int).sum() > 0

In [9]:
# Convert a candidate statement into the form that checktrue() compares against the world
# (presumably the same subject/object/predicate layout as `world` -- confirm in ie_tools).
decl=ie.get_kg("Sam ate the cake",nlp)

In [10]:
# "Sam ate the cake" is not one of the stored facts (Sam bought it), so this evaluates to False.
checktrue(world,decl)

False

In [None]:
# Who ate the cake? Select the rows whose object is "cake" and whose predicate is "ate".
# Each comparison is parenthesised because `&` binds more tightly than `==`.
world[(world['object'] == "cake") & (world['predicate'] == "ate")]

In [None]:
def who_agent_Q(world,Q):
    """Return the rows of `world` that record who ate the cake.

    Parameters
    ----------
    world : pandas.DataFrame
        Knowledge graph with 'object' and 'predicate' columns.
    Q :
        The question being asked. It is not consulted yet: the object
        ("cake") and predicate ("ate") are still hard-coded.
        TODO: derive both from Q.

    Returns
    -------
    pandas.DataFrame
        The rows whose object is "cake" and whose predicate is "ate".
    """
    # Parenthesise each comparison: `&` binds more tightly than `==`.
    return world[(world['object'] == "cake") & (world['predicate'] == "ate")]
  

In [None]:
# Draw the world as a directed multigraph with one edge from each subject to its object.
G=nx.from_pandas_edgelist(world, "subject", "object",
                          edge_attr=True, create_using=nx.MultiDiGraph())
plt.figure(figsize=(12,12))

pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)

# On a multigraph, iterating the bare `G.edges` view yields (u, v, key) triples,
# which cannot be unpacked into two names. Calling `G.edges()` yields (u, v) pairs.
edge_labels = {(n1, n2): f'{n1}->{n2}' for n1, n2 in G.edges()}

nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

plt.show()

In [None]:
# Alternative view of the same triples: one edge from each predicate to its subject.
G = nx.from_pandas_edgelist(
    world,
    source="predicate",
    target="subject",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

plt.figure(figsize=(12, 12))

# k regulates the distance between nodes
pos = nx.spring_layout(G, k=0.5)
nx.draw(
    G,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=1500,
    edge_cmap=plt.cm.Blues,
)

plt.show()

In [None]:
# Same predicate -> subject view, restricted to the facts whose object is "cake".
G = nx.from_pandas_edgelist(
    world[world['object'] == "cake"],
    source="predicate",
    target="subject",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

plt.figure(figsize=(12, 12))

# k regulates the distance between nodes
pos = nx.spring_layout(G, k=0.5)
nx.draw(
    G,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=1500,
    edge_cmap=plt.cm.Blues,
)

plt.show()

In [None]:
# Download the Project Gutenberg plain-text file 2554-0.txt into the working directory
# (the text the later cells call "C_and_P").
!wget https://www.gutenberg.org/files/2554/2554-0.txt

In [None]:
# Read the downloaded text and split its first chapter into sentences.
# `with` closes the file handle once the text has been read. The explicit encoding
# (UTF-8, the usual Linux default) keeps the character offsets below the same on
# platforms whose default encoding differs.
with open('2554-0.txt', encoding='utf-8') as f:
    raw = f.read()

# Hard-coded character offsets -- presumably the span of the first chapter in this
# edition of the file; re-check them if the downloaded file ever changes.
chapter_one = raw[5464:23725]

# Join wrapped lines so that sentences are not broken at line ends.
chapter_one = chapter_one.replace("\n", " ")

# sent_tokenize already returns a list, so no append loop is needed. That also
# avoids rebinding `sent`, which holds the spaCy document from an earlier cell.
C_and_P_sentences = nltk.sent_tokenize(chapter_one)

In [None]:
# Extract an entity pair and a relation from every sentence of the chapter.
entity_pairs = []
relations = []

for s in C_and_P_sentences:
  entity_pairs.append(ie.get_entities(s,nlp))
  relations.append(ie.get_relation(s,nlp))

# Keep only the sentences for which an entity pair was found, and drop the relation
# at the same position so the two lists stay aligned. `is not None` is an identity
# test, so it does not depend on how the extracted value implements `!=`.
indices = [i for i, pair in enumerate(entity_pairs) if pair is not None]
entity_pairs = [entity_pairs[i] for i in indices]
relations = [relations[i] for i in indices]

# the first element of each pair becomes the edge source
source = [pair[0] for pair in entity_pairs]

# the second element becomes the edge target
target = [pair[1] for pair in entity_pairs]

# NOTE: this rebinds `world` with the columns source/target/edge, replacing the
# subject/object/predicate frame built from the three example sentences.
world = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

In [None]:
# Inspect the extracted (source, target, edge) triples.
world

In [None]:
# Plot every relation whose source entity is "he": one edge from the relation to its target.
# The frame built from the novel names its columns source/target/edge, so the filter
# uses 'source'; there is no 'subject' column in it.
G=nx.from_pandas_edgelist(world[world['source']=="he"], "edge", "target",
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5) # k regulates the distance between nodes
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
# Pick one row of the world at random to serve as a fact known to be true.
# No random_state is given, so a different row is drawn on every run.
samp=world.sample(1)

In [None]:
# Show the sampled row.
samp

In [None]:
# True if at least one cell of `world` is matched by `samp`. When `isin` is given a
# DataFrame, a cell matches only where the index label and column label also agree.
world.isin(samp).astype(int).sum().sum() > 0

In [None]:
def checktrue(world,delc):
  """Return whether any cell of `world` is matched by `delc`.

  world: pandas DataFrame holding the knowledge graph.
  delc:  the values to look for; anything `DataFrame.isin` accepts
         (iterable, Series, dict or DataFrame).

  The result is True as soon as a single cell matches, so this is a looser
  test than requiring a whole row to match.
  """
  # Query with the argument itself. Reading the notebook-global `decl` here
  # would ignore whatever the caller passed in.
  return world.isin(delc).astype(int).sum().sum() > 0

In [None]:
# Re-run the truth check, now with the isin-based checktrue, against the current world.
checktrue(world,decl)

In [None]:
# Show the statement that was checked.
decl