In [None]:
import os
import pandas as pd
import json

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!wget -q "https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz" > /dev/null
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install spark-nlp and pyspark
! pip install spark-nlp==3.0.0 pyspark==3.1.1

# Quick SparkSession start
import sparknlp
spark = sparknlp.start()

print("Spark NLP version")
sparknlp.version()
print("Apache Spark version")
spark.version

spark-3.1.1-bin-hadoop2.7/
spark-3.1.1-bin-hadoop2.7/NOTICE
spark-3.1.1-bin-hadoop2.7/kubernetes/
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/python_executable_check.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/autoscale.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/worker_memory_check.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/py_container_checks.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/decommissioning.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/pyfiles.py
spark-3.1.1-bin-hadoop2.7/kubernetes/tests/decommissioning_cleanup.py
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/decom.sh
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/entrypoint.sh
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/bindings/
spark-3.1.1-bin-hadoop2.7/kubernetes/dockerfiles/spark/bindings/R/
spark-3.1.1-bin-hadoop2.7/kubernetes/docker

'3.1.1'

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
from sparknlp.training import CoNLL

In [None]:
# Read the CoNLL-formatted training file into a Spark DataFrame via the active
# `spark` session.
training_data = CoNLL().readDataset(spark,"/content/AbhiramSinghCoNLL.txt")

In [None]:
# Preview the first 5 rows of the training DataFrame.
training_data.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|J U D G M E N T K...|[{document, 0, 32...|[{document, 0, 32...|[{token, 0, 0, J,...|[{pos, 0, 0, NN, ...|[{named_entity, 0...|
|After spending co...|[{document, 0, 22...|[{document, 0, 22...|[{token, 0, 4, Af...|[{pos, 0, 4, IN, ...|[{named_entity, 0...|
|We would , howeve...|[{document, 0, 17...|[{document, 0, 17...|[{token, 0, 1, We...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|This appeal under...|[{document, 0, 22...|[{document, 0, 22...|[{token, 0, 3, Th...|[{pos, 0, 3, DT, ...|[{named_entity, 0...|
|The respondent , ...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [None]:
# training_data.describe(['document']).show()
# training_data.describe()

In [None]:
# Pretrained cased BERT embeddings ('bert_base_cased', English). Reads the
# "sentence" and "token" columns and writes the embeddings to "bert".
# The chain is parenthesised instead of using backslash continuations: the
# original ended on a dangling "\" with nothing after it, which a plain Python
# parser rejects as an unexpected end of input.
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(True)
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [None]:
# NerDL trainer configuration. Consumes the "sentence", "token" and "bert"
# columns, learns from the "label" column and writes predictions to "ner".
# Parenthesised so the chain does not end on a dangling "\" continuation,
# which a plain Python parser rejects as an unexpected end of input.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)                  # single epoch; raise for a real training run
    .setRandomSeed(0)                 # fixed seed for repeatability
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
)

In [None]:
# Training pipeline: BERT embeddings followed by the NerDL trainer.
ner_pipeline = Pipeline(stages = [bert, nerTagger])

In [None]:
# Fit the training pipeline on the CoNLL training data.
ner_model = ner_pipeline.fit(training_data) #takes time

In [None]:
# Display the fitted pipeline model's repr.
ner_model

PipelineModel_03aca288171f

In [None]:
# Persist only the second pipeline stage (index 1, the fitted NER model).
# overwrite() makes the cell re-runnable in the same runtime: without it the
# save fails once the 'NER_bert_model' directory already exists.
ner_model.stages[1].write().overwrite().save('NER_bert_model')

In [None]:
#if running more than once in the same runtime use the code below to save the model

In [None]:
# ner_model.stages[1].write().overwrite().save('NER_bert_20200221') 

In [None]:
# Reload the saved NER model from disk and re-declare its column wiring:
# inputs "sentence" / "token" / "bert", output "ner".
loaded_ner_model = (
    NerDLModel.load("/content/NER_bert_model")
    .setInputCols(["sentence", "token", "bert"])
    .setOutputCol("ner")
)
   

In [None]:
# --- Inference pipeline: raw text -> document -> sentence -> token -> bert -> ner -> ner_span

# Wrap the raw "text" column into a document annotation.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into sentences.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

# Split each sentence into tokens.
token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

# Same cased BERT embeddings configuration as used for training.
bert = BertEmbeddings.pretrained('bert_base_cased', 'en') \
    .setInputCols(["sentence", 'token'])\
    .setOutputCol("bert")\
    .setCaseSensitive(True)

# NerConverter turns token-level tags into entity spans, so it must read the
# "ner" column produced by loaded_ner_model. The previous wiring
# (["sentence", "document", "token"]) never gave it the NER predictions.
converter = NerConverter()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_span")

ner_prediction_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        token,
        bert,
        loaded_ner_model,
        converter])

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [None]:
# text = "whether the High Court ought to have entertained the writ petition filed by the respondent? As regards the power of the High Court to issue directions, orders or writs in exercise of its jurisdiction under Article 226 of the Constitution of India, the same is no more res integra. Even though the High Court can entertain a writ petition against any order or direction passed/action taken by the State under Article 226 of the Constitution, it ought not to do so as a matter of course when the  21 aggrieved person could have availed of an effective alternative remedy in the manner prescribed by law (see Baburam Prakash Chandra Maheshwari vs. Antarim Zila Parishad now Zila Parishad, Muzaffarnagar8 and also Nivedita Sharma vs. Cellular Operators Association of India & Ors.9). In Thansingh Nathmal & Ors. vs. Superintendent of Taxes, Dhubri & Ors.10, the Constitution Bench of this Court made it amply clear that although the power of the High Court under Article 226 of the Constitution is very wide, the Court must exercise self­imposed restraint and not entertain the writ petition, if an alternative effective remedy is available to the aggrieved person. In paragraph 7, the Court observed thus: ­ 7. Against the order of the Commissioner an order for reference could have been claimed if the appellants satisfied the Commissioner or the High Court that a question of law arose out of the order. But the procedure provided by the Act to invoke the jurisdiction of the High Court was bypassed, the appellants moved the High Court challenging the competence of the Provincial Legislature to extend the concept of sale, and invoked the extraordinary jurisdiction of the High Court under Article 226 and sought to reopen the decision of the Taxing Authorities on question of fact. 
# The jurisdiction of the High Court under Article 226 of the Constitution is couched in wide terms and the exercise thereof is not subject to any restrictions except the territorial restrictions which are expressly provided in the Articles."
# One-row DataFrame holding an empty string in a "text" column; used only as
# the input for fitting the prediction pipeline.
empty_data = spark.createDataFrame([[""]]).toDF("text")

# Fit the prediction pipeline on the placeholder frame.
ner_pipelineFit = ner_prediction_pipeline.fit(empty_data)

# Wrap the fitted pipeline in a LightPipeline (later used to annotate a plain
# Python string directly).
ner_lp_pipeline = LightPipeline(ner_pipelineFit)

# prediction_data = spark.createDataFrame([[text]]).toDF("text")

In [None]:
# prediction_model = ner_prediction_pipeline.fit(prediction_data)

In [None]:
# ner_lp_pipeline = LightPipeline(prediction_model)

In [None]:
# Fit the prediction pipeline on a one-row placeholder frame (empty "text").
model2 = ner_prediction_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
# detailed_result = LightPipeline(model2).fullAnnotate(open("/content/file_loc_test.txt").read())
# fullAnnotate returns a list with one entry per input text; each entry maps
# an output column name to its list of annotations.
detailed_result = LightPipeline(model2).fullAnnotate("After perusing the whole record scanning the evidence of the prosecution witnesses and hearing lengthy arguments from both sides we are satisfied that the High Court was right in holding that charge of criminal conspiracy against A2 had not been proved beyond doubt She was therefore rightly acquitted of the charge under Section 302 read with Section 120B of the IPC. However, as she was found to have actively participated in causing disappearance of the dead body of the deceased knowing and having reason to believe that his murder has been committed by A1 was convicted and sentenced under Section 201 of the IPC")

# One row per token: sentence id, token text, character offsets, the text
# carried by the matching embedding annotation, and the predicted NER tag.
tuples = [
    (int(x.metadata['sentence']), x.result, x.begin, x.end, y.result, z.result)
    for x, y, z in zip(
        detailed_result[0]["token"],
        detailed_result[0]["bert"],
        detailed_result[0]["ner"],
    )
]

# Column names must be unique. The original listed 'token' twice, so
# df3['token'] returned two columns and the CSV export renamed one to 'token.1'.
df3 = pd.DataFrame(
    tuples,
    columns=['sent_id', 'token', 'start', 'end', 'embedding_token', 'ner'],
)


In [None]:
# Display the token-level annotation table.
df3 

Unnamed: 0,sent_id,token,start,end,token.1,ner
0,0,After,0,4,After,O
1,0,perusing,6,13,perusing,O
2,0,the,15,17,the,O
3,0,whole,19,23,whole,O
4,0,record,25,30,record,O
...,...,...,...,...,...,...
101,1,Section,595,601,Section,O
102,1,201,603,605,201,O
103,1,of,607,608,of,O
104,1,the,610,612,the,O


In [None]:
from google.colab import files

# Write the token-level table to a CSV file in the working directory ...
df3.to_csv('df3.csv')
# ... and hand that file to the Colab `files` helper for download.
files.download('df3.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# txt = """whether the High Court ought to have entertained the writ petition filed by the respondent As regards the power of the High Court to issue directions orders or writs in exercise of its jurisdiction under Article 226 of the Constitution of India the same is no more res integra Even though the High Court can entertain a writ petition against any order or direction passed/action taken by the State under Article 226 of the Constitution it ought not to do so as a matter of course when the 21 aggrieved person could have availed of an effective alternative remedy in the manner prescribed by law see Baburam Prakash Chandra Maheshwari vs. Antarim Zila Parishad now Zila Parishad Muzaffarnagar8 and also Nivedita Sharma vs Cellular Operators Association of India & Ors9 In Thansingh Nathmal & Ors vs Superintendent of Taxes, Dhubri & Ors10 the Constitution Bench of this Court made it amply clear that although the power of the High Court under Article 226 of the Constitution is very wide, the Court must exercise self­imposed restraint and not entertain the writ petition if an alternative effective remedy is available to the aggrieved person In paragraph 7 the Court observed thus 7 Against the order of the Commissioner an order for reference could have been claimed if the appellants satisfied the Commissioner or the High Court that a question of law arose out of the order But the procedure provided by the Act to invoke the jurisdiction of the High Court was bypassed the appellants moved the High Court challenging the competence of the Provincial Legislature to extend the concept of sale and invoked the extraordinary jurisdiction of the High Court under Article 226 and sought to reopen the decision of the Taxing Authorities on question of fact The jurisdiction of the High Court under Article 226 of the Constitution is couched in wide terms and the exercise thereof is not subject to any restrictions except the territorial restrictions which are expressly provided in the 
# Articles"""
# Sample passage that is fed to the LightPipeline below.
txt2 = """After perusing the whole record scanning the evidence of the prosecution witnesses and hearing lengthy arguments from both sides we are satisfied that the High Court was right in holding that charge of criminal conspiracy against A2 had not been proved beyond doubt She was therefore rightly acquitted of the charge under Section 302 read with Section 120B of the IPC. However, as she was found to have actively participated in causing disappearance of the dead body of the deceased knowing and having reason to believe that his murder has been committed by A1 was convicted and sentenced under Section 201 of the IPC"""
# txt3 = "After referring to some judgments of the United States Supreme Court and of this Court in Yash Pal Mittal v State of Punjab 1977 4 SCC 540 Ajay Aggarwal vs Union of India AIR 1993 SCW 1866 the Court in State of Maharashtra v Som Nath Thapa AIR 1996 SC 1744 summarised the position of law and the requirements to establish the charge of conspiracy, as under"
# txt4 = """ve, consistently with the policy underlined in  section 354  (3), a  bearing on the choice of sentence. The  present legislative  policy discernible	from section 235(2)	read with  section 354(3)  is that in fixing the degree  of punishment  or making  the choice of sentence for various offences, including one under section 302, Penal Code,  the   Court  should  not	 confine  its  consideration "principally" or  "merely" to  the  circumstances  connected with the  particular crime,  but also give due consideration to the circumstances of the criminal. [237 C-E]"""
# txt5 = "After referring to some judgments of the United States Supreme Court and of this Court in Yash Pal Mittal v. State of Punjab[1977 (4) SCC 540]; Ajay Aggarwal vs. Union of India [AIR 1993 SCW 1866], the Court in State of Maharashtra v. Som Nath Thapa [AIR 1996 SC 1744] summarised the position of law and the requirements to establish the charge of conspiracy, as under:"
# txt6 = "The learned counsel for the appellant contends that the latter decision of this Court in Venkatesh Thimmaiahs case, must be held to have been not correctly decided, as it does not take notice of the earlier decision in Vishnu Narayans case 1995 Supp.(4) SCC 428, which was a decision interpreting the very same provision of the Karnataka Rent Control Act and which also relied upon the Constitution Bench decision of this Court in Gian Devi Anands case 1985(2) SCC 683, wherein the pari materia provision of Delhi Rent Control Act, 1958 was under consideration. The learned counsel further urged that acquisition of a premises by the partnership firm of which the tenant was merely a partner to the extent of 15%, cannot be held to be an acquisition of alternative premises by the tenant in view of the definition of tenant in Section 3(r) of the Act and the High Court, therefore committed serious error of law."
# txt7 = "Mr. V.A. Bobde, learned senior counsel appearing on behalf of the appellant herein, would submit that the High Court committed a manifest error in arriving at the said findings insofar as it failed to take into consideration that the provisions of the C.P.C. and in particular Order XXXIV Rule 7 read with Rule 8 thereof cannot supersede Article 137 of the Limitation Act, 1963. The learned counsel would contend that having regard to the plain language used in Order XXXIV Rule 8 C.P.C. read with Article 137 of the Limitation Act, there cannot be any doubt whatsoever that the period of limitation as prescribed therein shall apply in an application for preparation of a final decree in a suit of redemption of usufructuary mortgage. It was contended that the provisions of the Limitation Act are applicable in such a suit independent of the provisions of the C.P.C. Strong reliance in support of the said contentions was placed in K. Parameswaran Pillai Dead v. K. Sumathi alias Jesis Jessie Jacquiline and Anr., [1993] 4 SCC 431 and Mohd. Abdul Khader Mohd. Kastim and Anr. v. Pareethij Kunju Sayed A hammed and Ors. [1996] 11 SCC 83."
# Annotate the sample passage with the LightPipeline; the result is a dict
# keyed by pipeline output column (e.g. 'bert'), each holding a list of strings.
parsed = ner_lp_pipeline.annotate(txt2)
# Show the annotation dict as the cell output.
parsed

{'bert': ['After',
  'perusing',
  'the',
  'whole',
  'record',
  'scanning',
  'the',
  'evidence',
  'of',
  'the',
  'prosecution',
  'witnesses',
  'and',
  'hearing',
  'lengthy',
  'arguments',
  'from',
  'both',
  'sides',
  'we',
  'are',
  'satisfied',
  'that',
  'the',
  'High',
  'Court',
  'was',
  'right',
  'in',
  'holding',
  'that',
  'charge',
  'of',
  'criminal',
  'conspiracy',
  'against',
  'A2',
  'had',
  'not',
  'been',
  'proved',
  'beyond',
  'doubt',
  'She',
  'was',
  'therefore',
  'rightly',
  'acquitted',
  'of',
  'the',
  'charge',
  'under',
  'Section',
  '302',
  'read',
  'with',
  'Section',
  '120B',
  'of',
  'the',
  'IPC',
  '.',
  'However',
  ',',
  'as',
  'she',
  'was',
  'found',
  'to',
  'have',
  'actively',
  'participated',
  'in',
  'causing',
  'disappearance',
  'of',
  'the',
  'dead',
  'body',
  'of',
  'the',
  'deceased',
  'knowing',
  'and',
  'having',
  'reason',
  'to',
  'believe',
  'that',
  'his',
  'murder',

In [None]:
# from transformers import BertModel, BertConfig

# # Initializing a BERT bert-base-uncased style configuration
# configuration = BertConfig()

# # Initializing a model from the bert-base-uncased style configuration
# model = loaded_ner_model(configuration)

# # Accessing the model configuration
# configuration = model.config
# loaded_ner_model.getConfigProtoBytes.to_json_file('/content/jsonTry1')

In [None]:
# def bulk_predict(docs, batch_size=256):
#     for i in range(0, len(docs), batch_size):
#         batch_docs = docs[i: i+batch_size]
#         embeddings = bc.encode([doc['abstract'] for doc in batch_docs])
#         for emb in embeddings:
#             yield emb

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from sparknlp.training import CoNLL
# Read the CoNLL-formatted test file into a Spark DataFrame.
test = CoNLL().readDataset(spark,"/content/file_loc_test.txt")
# Add the "bert" embeddings column that the NER model takes as input.
test_data = bert.transform(test)

In [None]:
# Run the reloaded NER model over the embedded test data; adds the "ner" column.
predictions = loaded_ner_model.transform(test_data)

In [None]:
# ! python run_classifier.py --task_name=cola --do_predict=true --data_dir=./data --vocab_file=./cased_L-12_H-768-A-12/bert_config.json --init_checkpoint=./model_output/model.ckpt-<highest checkpoint number> --max_seq_length=128 --output_dir=./model_output

/bin/bash: highest: No such file or directory


In [None]:
# Preview the first 3 rows of the prediction DataFrame.
predictions.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|The detenu who ha...|[[document, 0, 69...|[[document, 0, 69...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|The detaining aut...|[[document, 0, 15...|[[document, 0, 15...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|Mr Anil Kumar Nau...|[[document, 0, 35...|[[document, 0, 35...|[[token, 0, 1, Mr...|[[pos, 0, 1, NN, ...|[[named_entity, 0...|[[word_embeddings...|[[

In [None]:
# Show tokens, gold labels and predicted labels side by side. Each column is
# aliased explicitly; selecting the three nested `.result` fields directly
# yields three columns all named "result", which cannot be told apart.
predictions.selectExpr(
    "token.result AS token",
    "label.result AS ground_truth",
    "ner.result AS prediction",
).show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+
|                                  result|                                  result|                                  result|
+----------------------------------------+----------------------------------------+----------------------------------------+
|[The, detenu, who, has, been, detaine...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, B-S...|
|[The, detaining, authority, on, being...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Mr, Anil, Kumar, Nauriya, the, learn...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Mr, Anil, Kumar, the, learned, couns...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[The, next, contention, raised, by, t...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|


In [None]:
import pyspark.sql.functions as F

# Zip the per-sentence token / gold-label / predicted-label arrays together,
# explode them to one row per token, and show the three aligned fields.
(
    predictions
    .select(
        F.explode(
            F.arrays_zip('token.result', 'label.result', 'ner.result')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction"),
    )
    .show()
)

+----------+------------+----------+
|     token|ground_truth|prediction|
+----------+------------+----------+
|       The|           O|         O|
|    detenu|           O|         O|
|       who|           O|         O|
|       has|           O|         O|
|      been|           O|         O|
|  detained|           O|         O|
|        by|           O|         O|
|       the|           O|         O|
| detaining|           O|         O|
| authority|           O|         O|
|     under|           O|         O|
|   Section|           O| B-Section|
|         3|           O| I-Section|
|         2|           O| I-Section|
|        of|           O|         O|
|       the|           O|         O|
|   Gujarat|           O|     B-Act|
|Prevention|           O|     I-Act|
|        of|           O|     I-Act|
|      Anti|           O|     I-Act|
+----------+------------+----------+
only showing top 20 rows



In [None]:
import pyspark.sql.functions as F

# One row per token with its predicted tag: zip the token and prediction
# arrays, explode, then pull the two struct fields out as named columns.
kk = (
    predictions
    .select(
        F.explode(
            F.arrays_zip('token.result', 'ner.result')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("prediction"),
    )
)
kk.show()


+----------+----------+
|     token|prediction|
+----------+----------+
|       The|         O|
|    detenu|         O|
|       who|         O|
|       has|         O|
|      been|         O|
|  detained|         O|
|        by|         O|
|       the|         O|
| detaining|         O|
| authority|         O|
|     under|         O|
|   Section| B-Section|
|         3| I-Section|
|         2| I-Section|
|        of|         O|
|       the|         O|
|   Gujarat|     B-Act|
|Prevention|     I-Act|
|        of|     I-Act|
|      Anti|     I-Act|
+----------+----------+
only showing top 20 rows



In [None]:
import pandas as pd
# print(predictions)
# Collect the sentence-level token / gold-label / prediction arrays into
# pandas. Columns are aliased so the frame has unique names; previously all
# three were called "result", which makes df['result'] ambiguous.
df = predictions.selectExpr(
    "token.result AS token",
    "label.result AS ground_truth",
    "ner.result AS prediction",
).toPandas()
df

Unnamed: 0,result,result.1,result.2
0,"[The, detenu, who, has, been, detained, by, th...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, B-Section, I..."
1,"[The, detaining, authority, on, being, satisfi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[Mr, Anil, Kumar, Nauriya, the, learned, couns...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Mr, Anil, Kumar, the, learned, counsel, then,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[The, next, contention, raised, by, the, learn...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...
360,"[On, the, first, question, decided, against, t...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
361,"[In, Municipal, Board, Hapur, v, Raghuvendra, ...","[B-Cases, I-Cases, I-Cases, I-Cases, I-Cases, ...","[O, I-Cases, I-Cases, I-Cases, I-Cases, I-Case..."
362,"[Apart, from, the, fact, that, the, Board, was...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
363,"[In, Municipal, Corporation, of, Delhi, v, Bir...","[B-Cases, I-Cases, I-Cases, I-Cases, I-Cases, ...","[O, B-Cases, I-Cases, I-Cases, I-Cases, I-Case..."


In [None]:
import pandas as pd
# Collect the token-level (token, prediction) Spark DataFrame into pandas.
pred_df = kk.toPandas()
# df.columns
# Display the resulting frame.
pred_df

Unnamed: 0,token,prediction
0,The,O
1,detenu,O
2,who,O
3,has,O
4,been,O
...,...,...
66671,is,O
66672,also,O
66673,contained,O
66674,in,O


In [None]:
from google.colab import files

# Write the token-level predictions to a CSV file in the working directory ...
pred_df.to_csv('df_2.csv')
# ... and hand that file to the Colab `files` helper for download.
files.download('df_2.csv')

In [None]:
# from sklearn.metrics import classification_report
# toks = df[df.columns[1]]
# labs = df[df.columns[2]]
# df = df.rename(index={1: "yo"})
# toks
# target_names = ['toks', 'labs']
# df['result']
# df.dtypes
# df.values
# print(classification_report(pred_df[pred_df.columns[5]], pred_df[pred_df.columns[7]]))

In [None]:
# Peek at the first rows of pred_df.
# NOTE(review): the stored output of this cell lists columns text/document/.../ner,
# which does not match a pred_df built from `kk` (token, prediction) -- the
# output looks stale from an earlier definition; re-run to confirm.
pred_df.head()

Unnamed: 0,text,document,sentence,token,pos,label,bert,ner
0,The detenu who has been detained by the detain...,"[(document, 0, 696, The detenu who has been de...","[(document, 0, 696, The detenu who has been de...","[(token, 0, 2, The, {'sentence': '0'}, []), (t...","[(pos, 0, 2, DT, {'word': 'The'}, []), (pos, 4...","[(named_entity, 0, 2, O, {'word': 'The'}, []),...","[(word_embeddings, 0, 2, The, {'sentence': '0'...","[(named_entity, 0, 2, O, {'word': 'The', 'conf..."
1,The detaining authority on being satisfied fro...,"[(document, 0, 1539, The detaining authority o...","[(document, 0, 1539, The detaining authority o...","[(token, 0, 2, The, {'sentence': '0'}, []), (t...","[(pos, 0, 2, DT, {'word': 'The'}, []), (pos, 4...","[(named_entity, 0, 2, O, {'word': 'The'}, []),...","[(word_embeddings, 0, 2, The, {'sentence': '0'...","[(named_entity, 0, 2, O, {'word': 'The', 'conf..."
2,Mr Anil Kumar Nauriya the learned counsel appe...,"[(document, 0, 3592, Mr Anil Kumar Nauriya the...","[(document, 0, 3592, Mr Anil Kumar Nauriya the...","[(token, 0, 1, Mr, {'sentence': '0'}, []), (to...","[(pos, 0, 1, NN, {'word': 'Mr'}, []), (pos, 3,...","[(named_entity, 0, 1, O, {'word': 'Mr'}, []), ...","[(word_embeddings, 0, 1, Mr, {'sentence': '0',...","[(named_entity, 0, 1, O, {'word': 'Mr', 'confi..."
3,Mr Anil Kumar the learned counsel then urged t...,"[(document, 0, 5548, Mr Anil Kumar the learned...","[(document, 0, 5548, Mr Anil Kumar the learned...","[(token, 0, 1, Mr, {'sentence': '0'}, []), (to...","[(pos, 0, 1, NN, {'word': 'Mr'}, []), (pos, 3,...","[(named_entity, 0, 1, O, {'word': 'Mr'}, []), ...","[(word_embeddings, 0, 1, Mr, {'sentence': '0',...","[(named_entity, 0, 1, O, {'word': 'Mr', 'confi..."
4,The next contention raised by the learned coun...,"[(document, 0, 4514, The next contention raise...","[(document, 0, 4514, The next contention raise...","[(token, 0, 2, The, {'sentence': '0'}, []), (t...","[(pos, 0, 2, DT, {'word': 'The'}, []), (pos, 4...","[(named_entity, 0, 2, O, {'word': 'The'}, []),...","[(word_embeddings, 0, 2, The, {'sentence': '0'...","[(named_entity, 0, 2, O, {'word': 'The', 'conf..."


In [None]:
# Inspect row 361 of the token-level predictions. pred_df only has the columns
# "token" and "prediction" (it is built from `kk`), so indexing the row with
# ['label'] raises KeyError; show the whole row instead.
pred_df.iloc[361]

token          and
prediction    None
Name: 361, dtype: object