In [None]:
!pip install allennlp==2.1.0 allennlp-models==2.1.0
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install stanza

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import stanza
stanza.install_corenlp()
from stanza.server import CoreNLPClient

In [None]:
import math
import re
from collections import Counter
import spacy
import json
import os
import collections
import string

# Tokenizer pattern used by text_to_vector: each match is one run of word characters.
WORD = re.compile(r"\w+")
# Coreference predictor loaded from the archive at the URL below; used later via
# coref.coref_resolved(document=...) before the text is sent to CoreNLP.
coref = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")
# spaCy pipeline shared by filter_triple (part-of-speech tags) and clean_text (lemmas).
nlp = spacy.load('en_core_web_sm')

In [None]:
# from google.colab import drive
# drive.mount('/content/GoogleDrive/')
# data_dir = '/content/GoogleDrive/MyDrive/results/meta-transcript-readable-pymupdf.txt'
!unzip cleaned_json.zip
data_dir = '/content/cleaned_json'

In [None]:
# Cosine similarity between two vectors
def get_cosine(vec1, vec2):
  """Return the cosine similarity of two sparse vectors.

  Args:
    vec1, vec2: mappings from key to numeric weight (e.g. ``Counter`` objects).

  Returns:
    A float: the dot product over the shared keys divided by the product of
    the two Euclidean norms, or 0.0 when either norm is zero.
  """
  shared_keys = set(vec1.keys()) & set(vec2.keys())
  dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

  squares1 = sum(vec1[key] ** 2 for key in vec1.keys())
  squares2 = sum(vec2[key] ** 2 for key in vec2.keys())
  norm_product = math.sqrt(squares1) * math.sqrt(squares2)

  # An empty (or all-zero) vector has no direction; report no similarity.
  if not norm_product:
    return 0.0
  return float(dot_product) / norm_product

# Convert string to a vector
def text_to_vector(text):
    """Turn a string into a bag-of-words vector.

    Returns a ``Counter`` mapping every token matched by the module-level
    ``WORD`` pattern to the number of times it occurs in ``text``.
    """
    return Counter(WORD.findall(text))

# Drop near-duplicate triples: when the joined text of two triples has cosine similarity >= simScore, only the one with the longer text is kept
def subset_phrase(triples, simScore):
  """Remove near-duplicate triples, keeping the one with the longer text.

  Args:
    triples: list of 4-tuples of strings; the four fields are joined with
      spaces to form the text that is compared.
    simScore: cosine-similarity threshold; a pair scoring at or above it is
      treated as duplicates.

  Returns:
    A new list with the losers removed. ``triples`` itself is not modified.
    When both texts have the same length the later triple of the pair wins.
  """
  survivors = triples[:]
  for position, left in enumerate(triples):
    for right in triples[position + 1:]:
      left_text = " ".join((left[0], left[1], left[2], left[3]))
      right_text = " ".join((right[0], right[1], right[2], right[3]))
      # Comparing the whole joined text covers the cases of identical
      # subject, predicate or object without handling them separately.
      score = get_cosine(text_to_vector(left_text), text_to_vector(right_text))
      if not (score >= simScore):
        continue
      winner = left if len(left_text) > len(right_text) else right
      loser = left if winner == right else right
      # The loser may already have been dropped by an earlier pair.
      if loser in survivors:
        survivors.remove(loser)
  return survivors


def min_char_count(triple):
    """Return True when subject, predicate and object are each at least 2 characters long.

    Only the first three elements of ``triple`` are inspected.
    """
    parts = (triple[0], triple[1], triple[2])
    return all(len(part) >= 2 for part in parts)

def duplicate(triple):
    """Return True when no part of the triple repeats a word.

    Subject, predicate and object are each split on whitespace; the triple
    is rejected (False) as soon as any one of them contains the same word
    more than once. Words are compared case-sensitively.
    """
    def has_repeated_word(phrase):
        tokens = phrase.split()
        # A set drops repeats, so a size mismatch means some word recurs.
        return len(set(tokens)) != len(tokens)

    repeated_flags = [has_repeated_word(part) for part in (triple[0], triple[1], triple[2])]
    return not any(repeated_flags)

def special_characters(triple):
    """Return True when the triple is free of punctuation characters.

    Args:
        triple: sequence whose first three items are the subject, predicate
            and object strings.

    Returns:
        False if any of the three strings contains a character from
        ``string.punctuation``; True otherwise. Letters, digits, whitespace
        and any other non-punctuation characters are accepted.
    """
    def find_sc(string_input):
        # True as soon as one punctuation character is seen, else False.
        return any(char in string.punctuation for char in string_input)

    parts = (triple[0], triple[1], triple[2])
    return not any(find_sc(part) for part in parts)

# return true or false if triple is a valid triple
def filter_triple(triple):
  """Decide whether an extracted (subject, predicate, object) triple is worth keeping.

  Args:
    triple: sequence whose first three items are the subject, predicate and
      object strings.

  Returns:
    True only if every one of these holds, False otherwise:
      * subject and object differ;
      * no word of the subject is a day of the week (case-insensitive);
      * the triple passes special_characters, duplicate and min_char_count;
      * the subject contains a NOUN or PROPN token;
      * neither subject nor object contains a VERB token;
      * no part contains a PRON token;
      * subject and object each have at most 3 non-punctuation tokens.
  """
  DAY_OF_THE_WEEK = {'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'}
  subject = triple[0]
  predicate = triple[1]
  object_ = triple[2]

  # Cheap string-only checks run first so obviously bad triples never reach spaCy.
  if subject == object_:
    return False
  if any(word.lower() in DAY_OF_THE_WEEK for word in subject.split()):
    return False
  # These helpers must receive the triple under test, not some outer variable.
  if not special_characters(triple):
    return False
  if not duplicate(triple):
    return False
  if not min_char_count(triple):
    return False

  # Part-of-speech tags for every token of each part.
  subject_pos = [token.pos_ for token in nlp(subject)]
  object_pos = [token.pos_ for token in nlp(object_)]
  predicate_pos = [token.pos_ for token in nlp(predicate)]

  if 'NOUN' not in subject_pos and 'PROPN' not in subject_pos:
    return False
  if 'VERB' in subject_pos or 'VERB' in object_pos:
    return False
  if 'PRON' in subject_pos or 'PRON' in predicate_pos or 'PRON' in object_pos:
    return False
  # Long subjects/objects are rejected; punctuation tokens do not count.
  if sum(1 for pos in subject_pos if pos != 'PUNCT') > 3:
    return False
  if sum(1 for pos in object_pos if pos != 'PUNCT') > 3:
    return False
  return True

filter_triple(("Tuesday", "is", "hard"))

False

In [None]:
def addTriple(document, simThresh):
  """Collect filtered triples from an annotated document.

  Args:
    document: dict with a 'sentences' list; each sentence is read through
      its 'entitymentions' (items with 'ner' and 'text') and 'openie'
      (items with 'subject', 'relation', 'object') entries.
    simThresh: similarity threshold forwarded to subset_phrase.

  Returns:
    A list of 3-tuples. Every surviving triple contributes
    (subject, relation, object); when its sentence has a DATE mention it is
    preceded by (object, relation, date_text), using the longest DATE text
    of that sentence.
  """
  kept = []
  for sentence in document['sentences']:
    # Temporal context: the longest DATE mention (first one wins on ties).
    date_texts = [mention['text'] for mention in sentence['entitymentions'] if mention['ner'] == "DATE"]
    sentence_date = max(date_texts, key=len, default="")

    candidates = [
      (item['subject'], item['relation'], item['object'], sentence_date)
      for item in sentence['openie']
      if filter_triple((item['subject'], item['relation'], item['object']))
    ]
    # A higher threshold keeps more triples: only very close pairs are merged.
    kept.extend(subset_phrase(candidates, simThresh))

  allTriples = []
  for subj, relation, obj, when in kept:
    if len(when) > 0:
      allTriples.append((obj, relation, when))
    allTriples.append((subj, relation, obj))
  return allTriples

In [None]:
# PREPROCESS TEXT
def clean_text(sentence):
  """Lemmatize a raw text and normalize its whitespace.

  Args:
    sentence: raw input string (may span several sentences).

  Returns:
    The text rebuilt from spaCy lemmas joined by single spaces, with
      (i)   newlines, tabs, literal "\\n" sequences and backslashes turned
            into spaces,
      (ii)  '?' and ')' guaranteed to be followed by a space so they are
            never glued to the next word,
      (iii) runs of whitespace collapsed to one space and leading/trailing
            whitespace removed.
    Tokens whose lemma is the "-PRON-" placeholder keep their surface form
    (NOTE(review): that placeholder looks like the spaCy v2 convention --
    confirm against the installed spaCy version).
  """
  doc = nlp(sentence)
  # Lemmatization: one lemma per token, pronoun placeholders replaced by the
  # original token text.
  lemmas = [str(token) if token.lemma_ == "-PRON-" else token.lemma_ for token in doc]
  sentence = " ".join(lemmas)

  # Escaped and real line breaks, tabs and stray backslashes become spaces.
  sentence = sentence.replace('\\n', ' ').replace('\n', ' ').replace('\t', ' ').replace('\\', ' ')
  # Pad '?' and ')' BEFORE collapsing whitespace, otherwise the padding
  # itself re-introduces double and trailing spaces.
  sentence = sentence.replace('?', ' ? ').replace(')', ') ')
  return re.sub(r'\s+', ' ', sentence).strip()


text = "Ridwan is not a person. He is nice."
print(clean_text(text))

Ridwan be not a person . He be nice .


In [None]:
# Load files from source document
allTriples = []
files = sorted(os.listdir(data_dir))
# Start the CoreNLP server once for the whole run; launching it again for
# every document is very slow and the client is designed to be reused
# across annotate() calls.
with CoreNLPClient(timeout=150000000, be_quiet=False, annotators=['ner', 'openie'], memory='25G', endpoint='http://localhost:9001') as client:
  for file in files:
    with open(os.path.join(data_dir, file)) as f:
      d = json.load(f)
    text = d['text']
    # 'auhtors' (sic) is the key name read from the input JSON files.
    all_authors = ", ".join(d['auhtors'])
    allTriples.append((all_authors, 'author(s)', d['title']))
    if len(text) > 0:
      text = clean_text(text)
      # Resolve coreferences so extracted triples name entities, not pronouns.
      coref_text = coref.coref_resolved(document=text)
      document = client.annotate(coref_text, output_format='json', properties={"openie.triple.all_nominals": True})
      allTriples.extend(addTriple(document, 0.5))

In [None]:
# Serialize every collected triple under a single "results" key and save it.
dr = dict(results=allTriples)
json_object = json.dumps(dr)
with open("stanfordopenie.json", "w") as out_file:
  out_file.write(json_object)

In [None]:
!pip install rdflib
from rdflib import Graph
import pprint
g = Graph()
g.parse("/content/demo.nt")

In [None]:
for stmt in g:
    pprint.pprint(stmt)

(rdflib.term.URIRef('http://example.com/drewp'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://xmlns.com/foaf/0.1/Person'))
(rdflib.term.URIRef('http://example.com/drewp'),
 rdflib.term.URIRef('http://example.com/says'),
 rdflib.term.Literal('Hello World'))
