In [None]:
# Colab-only: mount Google Drive under /content/drive. A later cell loads the
# trained NER model from "/content/drive/MyDrive/BERT_model", so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import json

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!wget -q "https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz" > /dev/null
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install spark-nlp and pyspark
! pip install spark-nlp==3.0.0 pyspark==3.1.1

# Quick SparkSession start
import sparknlp
spark = sparknlp.start()

print("Spark NLP version")
sparknlp.version()
print("Apache Spark version")
spark.version

spark-3.1.1-bin-hadoop2.7/
spark-3.1.1-bin-hadoop2.7/NOTICE
spark-3.1.1-bin-hadoop2.7/kubernetes/
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/python_executable_check.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/autoscale.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/worker_memory_check.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/py_container_checks.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/decommissioning.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/pyfiles.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/decommissioning_cleanup.py
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/decom.sh
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/entrypoint.sh
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/bindings/
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/bindings/R/
spark-3.1.1-bin-hadoop2.7/kubernetes/docker

'3.1.1'

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
def start(gpu=False):
    """
    Build (or fetch) a local SparkSession configured for Spark NLP.

    args: gpu - bool - when True, request the GPU build of the Spark NLP jar
          instead of the CPU build.
    returns: the SparkSession produced by builder.getOrCreate()
    note: getOrCreate() returns an already-running session if there is one
          (e.g. the one created by sparknlp.start() in the setup cell); in that
          case JVM-level options such as spark.jars.packages and
          spark.driver.memory set here do not change the running session.
    """
    # The jar must match the installed Python packages (spark-nlp==3.0.0 on
    # pyspark==3.1.1, i.e. Scala 2.12). The previous 2.5.1 / Scala 2.11
    # coordinates belong to a different Spark NLP and Spark generation.
    artifact = "spark-nlp-gpu_2.12" if gpu else "spark-nlp_2.12"

    builder = (
        SparkSession.builder
        .appName("Spark NLP")
        .master("local[*]")
        .config("spark.driver.memory", "8G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:" + artifact + ":3.0.0")
    )

    return builder.getOrCreate()

spark = start(gpu=False)

In [None]:
# Pretrained cased English BERT embeddings: reads the "sentence" and "token"
# columns and writes the embeddings to the "bert" column.
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(True)
)
# Options that were tried and left disabled:
# .tokenizer.encode(text, add_special_tokens=True, max_length=600)
# .setPoolingLayer(0) # default 0

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [None]:
# Trainable NerDL tagger configuration. It consumes the "sentence", "token" and
# "bert" columns, learns from the "label" column and writes predictions to "ner".
# The prediction pipeline built in the cells shown here uses the saved
# `loaded_ner_model` rather than this approach.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
)
# Disabled option:
# .setTestDataset("test_withEmbeds.parquet")

In [None]:
# Load the previously trained NER model from the mounted Google Drive and wire
# it to the same column names the rest of the pipeline uses.
loaded_ner_model = (
    NerDLModel.load("/content/drive/MyDrive/BERT_model")
    .setInputCols(["sentence", "token", "bert"])
    .setOutputCol("ner")
)

In [None]:
# --- Prediction pipeline: text -> document -> sentence -> token -> bert -> ner -> ner_span ---

# Wrap the raw "text" column into Spark NLP's document annotation.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences.
sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

# Split each sentence into tokens (token annotations carry begin/end character offsets).
token = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Cased BERT embeddings; column names must match what loaded_ner_model expects.
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(True)
)

# NerConverter merges token-level NER tags into entity chunks, so it must be
# fed the NER output column. The previous input list
# ["sentence", "document", "token"] never passed "ner", which leaves
# "ner_span" without any entities.
converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_span")
)

ner_prediction_pipeline = Pipeline(
    stages=[
        document,
        sentence,
        token,
        bert,
        loaded_ner_model,
        converter,
    ]
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [None]:
def makeTuple(ner_tags,combined,tupleAdd, paragraph):
  """
  Collapse token-level NER tags into entity spans and append them to tupleAdd.

  args: ner_tags - list of str - one predicted tag per token (e.g. "B-Section", "I-Act", "O")
        combined - list of [start, end] - character offsets of each token inside `paragraph`
                   (end is inclusive), aligned index-by-index with ner_tags
        tupleAdd - dict - {'tagged_entities': [...]}; new entities are appended in place,
                   format: [start, end, actual text, type], e.g. [195, 206, 'Section  302', 'Section']
        paragraph - string - the current paragraph including para, subPara and blockQuotes -
                    sliced with the offsets to recover the entity text
  returns: the same tupleAdd dict, updated

  A run of consecutive tokens carrying the same entity type forms one entity:
  it starts at the first token's start offset and ends at the last token's end
  offset. A run is closed by an "O" tag, by a tag of a different entity type,
  or by the end of the tag list. B-/I- prefixes are not distinguished, so two
  adjacent entities of the same type are merged into one span.
  """
  # Checked in this order; a tag belongs to the first type whose name it contains.
  entity_types = ("Article", "Section", "Case", "Act", "Cite")
  entities = tupleAdd['tagged_entities']

  open_type = None   # entity type of the run currently being built, if any
  span_start = None  # start offset of the first token in the open run
  span_end = None    # end offset (inclusive) of the latest token in the open run

  for i, tag in enumerate(ner_tags):
    if tag.find("O") != -1:
      tag_type = None  # outside any entity
    else:
      tag_type = next((name for name in entity_types if tag.find(name) != -1), None)
      if tag_type is None:
        # Tag matches none of the known types: ignore it and leave any open run untouched.
        continue

    # An "O" or a different entity type ends the run that is currently open.
    # All per-run state is reset here, so a later run of the same type can
    # never pick up a stale start offset.
    if open_type is not None and tag_type != open_type:
      entities.append([span_start, span_end, str(paragraph[span_start:span_end+1]), open_type])
      open_type = None

    if tag_type is not None:
      if open_type is None:
        open_type = tag_type
        span_start = int(combined[i][0])
      span_end = int(combined[i][1])

  # Flush a run that extends to the very last token; without this the final
  # entity of a paragraph would be dropped.
  if open_type is not None:
    entities.append([span_start, span_end, str(paragraph[span_start:span_end+1]), open_type])

  return tupleAdd

In [None]:
def modelPredict(sent, light_pipeline=None): # takes one sentence and gives predictions.
  """
  Run the NER prediction pipeline on one piece of text.

  args: sent - string - the text to annotate (one entire paragraph including
               subquotes and blockquotes in our case)
        light_pipeline - optional - an object exposing fullAnnotate(text), e.g. an
               already-built LightPipeline. When omitted, ner_prediction_pipeline is
               fitted on an empty frame and wrapped in a LightPipeline on every call,
               which is slow; callers annotating many paragraphs can build it once
               and pass it in.
  returns: dataframe with one row per token and columns
           'sent_id','token','start','end','token2','ner'
           (note: ignore token2 column - it is the result field of the bert annotation)
  """
  if light_pipeline is None:
    fitted_pipeline = ner_prediction_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
    light_pipeline = LightPipeline(fitted_pipeline)

  # fullAnnotate returns one result per input text; a single string gives a single entry.
  annotations = light_pipeline.fullAnnotate(sent)[0]

  # token / bert / ner annotations are zipped position by position.
  rows = [
    (int(tok.metadata['sentence']), tok.result, tok.begin, tok.end, emb.result, tag.result)
    for tok, emb, tag in zip(annotations["token"], annotations["bert"], annotations["ner"])
  ]

  return pd.DataFrame(rows, columns=['sent_id','token','start','end','token2','ner'])

In [None]:
def appendToJson(df,json_data,para_id, all_together): # appends to the json
  """
  Attach the tagged entities predicted for one paragraph to the json data.

  args: df - dataFrame as returned by modelPredict (uses its 'start', 'end' and 'ner' columns)
        json_data - dictionary - the json data to update; json_data['paragraphs'] is indexed
                    with para_id-1, i.e. para ids are treated as 1-based positions
        para_id - int - current para_id in the iteration
        all_together - string - consists of the current para, subPara, and blockQuotes
  returns: the same json_data dict, updated in place - the paragraph gets a
           'tagged_entities' key (an empty list when nothing was tagged)
  note: this method calls makeTuple method to get the final tagged entities dictionary
  """
  ner_tags = list(df['ner'])
  # [start index of each token, end index of each token]
  combined = [[start, end] for start, end in zip(df['start'], df['end'])]

  final_add = makeTuple(ner_tags, combined, {"tagged_entities": []}, all_together)
  print("to be added",final_add)

  # The paragraph is updated whether or not any entity was found, so every
  # processed paragraph ends up with a 'tagged_entities' key.
  json_data['paragraphs'][para_id-1].update(final_add)
  return json_data

In [None]:
def mainCall(input_csv_path,json_file_path, output_path='updated_json.json'): # main function to call other functions
  """
  Tag every paragraph listed in the csv and write the enriched json to disk.

  args: input_csv_path - path - csv with exactly three columns, interpreted as
                         ['ParaID', 'ContentID', 'Paragraph']
        json_file_path - path to json file - need to update this json
        output_path - path the updated json is written to (default 'updated_json.json')
  returns: the (already closed) file object the updated json was written to
  note: this method calls modelPredict method and appendToJson method

  Consecutive csv rows sharing a ParaID (para, subPara, blockQuotes) are
  concatenated, in order and without a separator, into one string; that string
  is sent to the model once and the resulting entities are stored on the
  paragraph. Entity offsets are relative to the concatenated string, which now
  starts at offset 0 for every paragraph (previously every paragraph after the
  first was prefixed with a stray space, shifting its offsets by one).
  An empty csv writes the json back unchanged.
  """
  input_csv = pd.read_csv(input_csv_path)
  input_csv.columns = ['ParaID','ContentID','Paragraph']

  with open(json_file_path, 'r') as j:
    json_data = json.load(j)

  # Merge runs of consecutive rows with the same ParaID into [para_id, text] pairs.
  grouped = []
  for para_id, text in zip(input_csv['ParaID'].tolist(), input_csv['Paragraph'].tolist()):
    if grouped and grouped[-1][0] == para_id:
      grouped[-1][1] += text
    else:
      grouped.append([para_id, text])

  for para_id, all_together in grouped:
    print("para_id to send", para_id)
    df = modelPredict(all_together) # calling modelPredict
    print("all_together",all_together)
    json_data = appendToJson(df,json_data,para_id, all_together)  # calling appendToJson

  # Open the output only once everything succeeded, and let the context
  # manager close it even if serialisation fails.
  with open(output_path,'w') as updated_json:
    json.dump(json_data, updated_json)
  print("success!")
  return updated_json

In [None]:
# print(mainCall('/content/5_sp.csv','/content/5_sp.json'))
# print(mainCall('/content/3_bq.csv','/content/3_bq.json'))
# print(mainCall('/content/4_multiple_bq.csv','/content/4_multiple_bq.json'))
# print(mainCall('/content/5.csv','/content/5.json'))
# print(mainCall('/content/4.csv','/content/4.json'))
# print(mainCall('/content/3.csv','/content/3.json'))
# print(mainCall('/content/2.csv','/content/2.json'))
# print(mainCall('/content/1.csv','/content/1.json'))

para_id 1
content_id p_1
checking for last False
___________________________
para_id to send 1
all_together Performance of  judicial duty in  the  manner  prescribed  by  law  isfundamental to the concept of rule of law in  a  democratic  State.  It  hasbeen quite often said and, rightly so, that the judiciary is  the  protectorand preserver of rule of law.  Effective functioning of the said  sacrosanctduty has been entrusted to the judiciary and that  entrustment  expects  thecourts to conduct the judicial  proceeding  with  dignity,  objectivity  andrationality and finally determine the same in accordance  with  law.  Errorsare bound to occur but there cannot  be  deliberate  peccability  which  cannever be countenanced.   The  plinth  of  justice   dispensation  system  isfounded on the faith, trust and confidence of the people and nothing can  beallowed to contaminate and corrode the same.  A  litigant  who  comes  to  acourt of law expects that inherent and essential principles of

In [None]:
    #### not needed ####
    # if input_csv['ParaID'].iloc[-1] == para_id: 
    #   last_counter += 1
      # prev_para_id = para_id
      # if last_counter == len(paragraphs)-1:
    #   print("para_id to send", para_id)
    #   df = df.append(modelPredict(all_together))
    #   print("all_together",all_together)
    #   updated_data = appendToJson(df,json_data,para_id, all_together)  # calling appendToJson # changed
    #   all_together = " "
    #   break
      # continue