In [1]:
# Mount Google Drive at /content/gdrive so that files stored on Drive can be
# opened by path from this Colab runtime.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [7]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import re
import spacy
# Load the 'en_core_web_sm' spaCy pipeline once at module level; clean_text()
# below calls this shared `nlp` object for every sentence.
nlp = spacy.load('en_core_web_sm')

def check_generic(doc):
  """
  Describe the first named entity of a parsed document.

  doc: object exposing `.ents`, a sequence of entities that each provide
       `text`, `start_char`, `end_char` and `label_`.

  Returns the string
      '<text> - <start_char> - <end_char> - <label> - <label explanation>'
  for the FIRST entity only (the loop returns on its first iteration), or
  'No named entities found' when `doc.ents` is empty.
  """
  entities = doc.ents
  if not entities:
      return 'No named entities found'

  for entity in entities:
      label = entity.label_
      # spacy.explain may return None for an unknown label; str() keeps the
      # result a plain string in that case.
      parts = (
          entity.text,
          str(entity.start_char),
          str(entity.end_char),
          label,
          str(spacy.explain(label)),
      )
      return ' - '.join(parts)

def clean_text(sentence):
  """
  Normalise and lemmatize a single raw sentence.

  Input sentence: Raw sentence (str)
  Output sentence: cleaned sentence with -
                  (i) no extra whitespaces, no new lines, no tabs
                      (also no leading/trailing blanks)
                  (ii) lemmatized tokens joined by single spaces
                  (iii) generic sentences with no NER are returned as empty string

  Relies on the module-level spaCy pipeline `nlp` and on `check_generic`.
  """

  # Turn literal "\n" sequences, real newlines, tabs and stray backslashes
  # into plain spaces.
  sentence = sentence.replace('\\n', ' ').replace('\n', ' ').replace('\t', ' ').replace('\\', ' ')

  # There are some instances where there is no space after '?' & ')',
  # so pad them to keep two words from being read as one token.
  sentence = sentence.replace('?', ' ? ').replace(')', ') ')

  # Collapse whitespace AFTER the padding above (which can create double
  # spaces) and trim the ends, so the documented "no extra whitespaces"
  # guarantee actually holds.
  sentence = re.sub(r'\s+', ' ', sentence).strip()
  if not sentence:
    # Nothing left to analyse; an empty sentence has no entities anyway.
    return ""

  doc = nlp(sentence)

  # check for generic sentences: no named entity -> drop the sentence
  if check_generic(doc) == "No named entities found":
    return ""

  # lemmatization
  # spaCy 2.x uses the placeholder lemma "-PRON-" for pronouns; keep the
  # surface form (token.text) in that case instead of the placeholder.
  lemmas = [
    token.text if token.lemma_ == "-PRON-" else token.lemma_
    for token in doc
  ]
  return " ".join(lemmas)

# Usage example: the cells below load a paper's JSON file and apply clean_text to its abstract, sentence by sentence.

In [None]:
# Load research papers
import json

# Papers are stored under "/content/gdrive/MyDrive/Capstone/research_paper_cleaning/".
# File ids noted so far: 1312.2048, 1404.4275.
# The `with` block closes the file even if json.load raises, and the explicit
# encoding avoids depending on the platform default.
with open("/content/gdrive/MyDrive/Capstone/research_paper_cleaning/1312.2048.json", encoding="utf-8") as f:
  # json.load parses the file; the result is used as a dictionary below
  # (e.g. data["abstract"]).
  data = json.load(f)


In [13]:
# Show the raw (uncleaned) abstract of the loaded paper.
data["abstract"]

"Designed to compete with fiat currencies, bitcoin proposes it is a crypto-currency alternative. Bitcoin makes a number of false claims, including: solving the double-spending problem is a good thing; bitcoin can be a reserve currency for banking; hoarding equals saving; and that we should believe bitcoin can expand by deflation to become a global transactional currency supply. Bitcoin's developers combine technical implementation proficiency with ignorance of currency and banking fundamentals."

In [11]:
# Clean the abstract one sentence at a time. Sentences are obtained by a plain
# split on "." and each piece is passed through clean_text; pieces without a
# named entity come back as empty strings and are kept in the list.
new_abstract = [clean_text(piece) for piece in data["abstract"].split(".")]
new_abstract

['',
 '  Bitcoin make a number of false claim , include : solve the double - spending problem be a good thing ; bitcoin can be a reserve currency for banking ; hoarding equal saving ; and that we should believe bitcoin can expand by deflation to become a global transactional currency supply',
 "  Bitcoin 's developer combine technical implementation proficiency with ignorance of currency and banking fundamental",
 '']