In [1]:
import os
import re
from datetime import datetime
from urllib.parse import urljoin

import html2text
import numpy as np
import pandas as pd
import requests

In [2]:
#define the IndianKanoon API class
class IndianKanoon:
  """
    Small client for the Indian Kanoon REST API. Every call is sent as a POST
    on a shared requests session carrying the token authorization header.

    Based on https://github.com/aishik-pyne/kanoon

    Endpoints wrapped by this class:
      Search query        https://api.indiankanoon.org/search/?formInput=<query>&pagenum=<pagenum>
      Document            https://api.indiankanoon.org/doc/<docid>/
      Document fragments  https://api.indiankanoon.org/docfragment/<docid>/?formInput=<query>
      Document Metainfo   https://api.indiankanoon.org/docmeta/<docid>/
  """

  # Environment variable consulted for the API token when none is passed in.
  TOKEN_ENV_VAR = "INDIANKANOON_API_TOKEN"
  # Day-month-year format used for the fromdate/todate search filters.
  DATE_FORMAT = '%d-%m-%Y'

  def __init__(self, auth_token=None):
    """
      Build the session used for all API calls.

      auth_token: API token generated manually on the Indian Kanoon API
        website. When omitted, the INDIANKANOON_API_TOKEN environment
        variable is used, and only then the legacy literal below.
    """
    self.base_url = "https://api.indiankanoon.org/"
    self.auth_token = (
        auth_token
        or os.environ.get(self.TOKEN_ENV_VAR)
        # NOTE(review): legacy fallback kept so existing `IndianKanoon()` calls
        # keep working. This token is committed to source; it should be rotated
        # and supplied via the argument or the environment variable instead.
        or "9cd44463828341ca7d7401135bf82b1c14520677"
    )

    self.headers = {
        'authorization': "Token {}".format(self.auth_token),
        'cache-control': "no-cache",
    }
    self.api_session = requests.Session()
    # update() keeps the session's default headers instead of replacing them.
    self.api_session.headers.update(self.headers)

  def _post(self, path, params=None):
    """POST to `path` under base_url, raise on an HTTP error status, return the JSON body."""
    response = self.api_session.post(
        urljoin(self.base_url, path), params=params)
    response.raise_for_status()
    return response.json()

  def search(self, formInput, pagenum=0,
             fromdate=None, todate=None,
             title=None, author=None,
             cite=None, bench=None, maxpages=50):
    """
      Run a search query and return the decoded JSON response.

      formInput: the query string.
      pagenum, maxpages: always sent as given.
      fromdate, todate: optional datetime objects, sent as DD-MM-YYYY.
      title, author, cite, bench: optional filters, sent only when provided.

      Raises AssertionError if a date filter is not a datetime, and whatever
      raise_for_status() raises on an HTTP error status.
    """
    params = {
        'formInput': formInput,
        'pagenum': pagenum,
        'maxpages': maxpages
    }

    for key, value in (('fromdate', fromdate), ('todate', todate)):
      if value:
        assert isinstance(value, datetime), "{} must be a datetime".format(key)
        params[key] = value.strftime(self.DATE_FORMAT)

    # These filters used to be accepted but never forwarded to the API.
    for key, value in (('title', title), ('author', author),
                       ('cite', cite), ('bench', bench)):
      if value:
        params[key] = value

    return self._post('search/', params=params)

  def doc(self, docid):
    """Return the JSON payload of the document with the given id."""
    return self._post('doc/{}/'.format(docid))

  def docfragment(self, docid, formInput):
    """Return the JSON fragments of document `docid` for the query `formInput`."""
    return self._post('docfragment/{}/'.format(docid),
                      params={'formInput': formInput})

  def docmeta(self, docid):
    """Return the JSON meta information of the document with the given id."""
    return self._post('docmeta/{}/'.format(docid))

In [3]:
# Create the API client; the cells below call ik.doc(...) to fetch judgements.
ik = IndianKanoon()

In [35]:
# Read the casemine file produced by the pipeline (contains each judgement and
# its similar judgements, along with their links).
df = pd.read_csv('casemine.csv')

In [37]:
#returns document id from link
def get_id(link):
    """Return the first run of decimal digits found in `link`, or "" if there is none."""
    match = re.search(r'[0-9]+', link)
    return match.group() if match else ""

In [38]:
# doclink maps each judgement name to the document ID parsed from its link
# ("" when no ID could be extracted from the link).
doclink = {}

# Columns are read by position; judging by the original variable names they are
# presumably: 0 = main judgement, 1 = its link, 2 = similar judgement, 3 = its
# link -- TODO confirm against casemine.csv.
rows = zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2], df.iloc[:, 3])

for m_j, m_j_l, s_j, s_j_l in rows:
    # Each name is paired with its OWN link. The previous guards compared the
    # other judgement's link with "", which never matches the NaN that
    # read_csv produces for empty cells.
    for name, link in ((m_j, m_j_l), (s_j, s_j_l)):
        # Fill in names that are new or still have no usable ID, so a first
        # occurrence with a missing link does not hide a later valid one.
        # str(NaN) is 'nan', which has no digits, so get_id returns "".
        if doclink.get(name, "") == "":
            doclink[name] = get_id(str(link))

In [40]:
# Tabulate only the judgements for which an ID was extracted. The frame is
# built in one call rather than being grown one .loc row at a time.
doc_with_id = pd.DataFrame(
    [(name, doc_id) for name, doc_id in doclink.items() if doc_id != ''],
    columns=["name", "id"],
)
        

In [41]:
# Plain dict view of the table: judgement name -> document ID.
case_with_id = dict(zip(doc_with_id["name"], doc_with_id["id"]))

In [42]:
# Inspect the name -> ID mapping.
case_with_id

{'Wander Ltd. And Another v. Antox India P. Ltd. .': '330608',
 'Astrazeneca Uk Ltd. v. Orchid Chemicals & Pharmaceuticals Ltd.': '1509899',
 'S.M Dyechem Ltd. v. Cadbury (India) Ltd. .': '1025815',
 'Veerumal Praveen Kumar v. Needle Industries (India) Ltd. And Anr.': '1945028',
 'N.R Dongre And Others v. Whirlpool Corporation And Another': '850381',
 'Pfizer Products Inc. v. Rajesh Chopra': '1674695',
 'Mahendra & Mahendra Paper Mills Ltd. v. Mahindra & Mahindra Ltd. .': '67204010',
 'Lupin Limited v. Johnson & Johnson': '19267666',
 'Cadila Health Care Ltd. v. Cadila Pharmaceuticals Ltd. .': '1114158',
 'Purshottam Vishandas Raheja And Another v. Shrichand Vishandas Raheja (Dead) Through Lrs. And Others': '81162651',
 'Fedders North American v. Show Line': '1269197',
 'Cadila Healthcare Limited v. Cadila Pharmaceuticals Limited': '1114158',
 'Ramesh Khatanmal Lulla v. Mohammad Yusuf Abdul Gaffar': '70200400',
 'Cadbury India Ltd. v. Neeraj Food Products': '652828',
 'Cadbury India Li

In [43]:
# Remove duplicate IDs from the dictionary: when several names share an ID,
# only the first name encountered is kept.
seen_ids = set()  # set membership is O(1); a list made this loop quadratic
res = dict()
for key, val in case_with_id.items():
    if val not in seen_ids:
        seen_ids.add(val)
        res[key] = val

In [44]:
# Inspect the de-duplicated name -> ID mapping.
res

{'Wander Ltd. And Another v. Antox India P. Ltd. .': '330608',
 'Astrazeneca Uk Ltd. v. Orchid Chemicals & Pharmaceuticals Ltd.': '1509899',
 'S.M Dyechem Ltd. v. Cadbury (India) Ltd. .': '1025815',
 'Veerumal Praveen Kumar v. Needle Industries (India) Ltd. And Anr.': '1945028',
 'N.R Dongre And Others v. Whirlpool Corporation And Another': '850381',
 'Pfizer Products Inc. v. Rajesh Chopra': '1674695',
 'Mahendra & Mahendra Paper Mills Ltd. v. Mahindra & Mahindra Ltd. .': '67204010',
 'Lupin Limited v. Johnson & Johnson': '19267666',
 'Cadila Health Care Ltd. v. Cadila Pharmaceuticals Ltd. .': '1114158',
 'Purshottam Vishandas Raheja And Another v. Shrichand Vishandas Raheja (Dead) Through Lrs. And Others': '81162651',
 'Fedders North American v. Show Line': '1269197',
 'Ramesh Khatanmal Lulla v. Mohammad Yusuf Abdul Gaffar': '70200400',
 'Cadbury India Ltd. v. Neeraj Food Products': '652828',
 'Peshawar Soap & Chemicals Ltd. v. Godrej Soap Ltd.': '645115',
 'Ramdev Food Products (P) L

In [45]:
# Fetch every unique document from the API, reporting progress per file.
docs = []

count = 0  # stays 0 when there is nothing to fetch
for count, doc_id in enumerate(res.values(), start=1):
    docs.append(ik.doc(docid=str(doc_id)))
    print("FILE NUMBER: ", count)

FILE NUMBER:  1
FILE NUMBER:  2
FILE NUMBER:  3
FILE NUMBER:  4
FILE NUMBER:  5
FILE NUMBER:  6
FILE NUMBER:  7
FILE NUMBER:  8
FILE NUMBER:  9
FILE NUMBER:  10
FILE NUMBER:  11
FILE NUMBER:  12
FILE NUMBER:  13
FILE NUMBER:  14
FILE NUMBER:  15
FILE NUMBER:  16
FILE NUMBER:  17
FILE NUMBER:  18
FILE NUMBER:  19
FILE NUMBER:  20
FILE NUMBER:  21
FILE NUMBER:  22
FILE NUMBER:  23
FILE NUMBER:  24
FILE NUMBER:  25
FILE NUMBER:  26
FILE NUMBER:  27
FILE NUMBER:  28
FILE NUMBER:  29
FILE NUMBER:  30
FILE NUMBER:  31
FILE NUMBER:  32
FILE NUMBER:  33
FILE NUMBER:  34
FILE NUMBER:  35
FILE NUMBER:  36
FILE NUMBER:  37
FILE NUMBER:  38
FILE NUMBER:  39
FILE NUMBER:  40
FILE NUMBER:  41
FILE NUMBER:  42
FILE NUMBER:  43
FILE NUMBER:  44
FILE NUMBER:  45
FILE NUMBER:  46
FILE NUMBER:  47
FILE NUMBER:  48
FILE NUMBER:  49
FILE NUMBER:  50
FILE NUMBER:  51
FILE NUMBER:  52
FILE NUMBER:  53
FILE NUMBER:  54
FILE NUMBER:  55
FILE NUMBER:  56
FILE NUMBER:  57
FILE NUMBER:  58
FILE NUMBER:  59
FILE N

In [46]:
# One row per fetched document; columns are the keys of the API responses.
ik_result = pd.DataFrame(docs)

In [47]:
# Inspect the fetched documents.
ik_result

Unnamed: 0,tid,covertids,publishdate,firstname,secondname,lastname,title,filename,doc,numcites,...,url,showurl,docsource,covers,citeList,citedbyList,divtype,relatedqs,courtcopy,errmsg
0,330608.0,[],1990-04-26,o,,venkatachaliah,Wander Ltd. And Anr. vs Antox India P. Ltd. on...,www.manupatra.co.in/NXT/gateway.dll/sc/1980-20...,"<div class=""docsource_main"">Supreme Court of I...",5.0,...,,True,Supreme Court of India,[],"[{'tid': 1891720, 'title': 'the Drugs and Cosm...","[{'tid': 1960724, 'title': 'Cadbury Ltd. And 2...",judgments,"[{'value': 'antox', 'formInput': 'antox'}, {'v...",False,
1,1509899.0,[],2006-05-16,a,,kumar,Astrazeneca Uk Ltd. And Anr. vs Orchid Chemica...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",45.0,...,,True,Delhi High Court,[],"[{'tid': 1331119, 'title': 'Section 124 in The...","[{'tid': 69071842, 'title': 'Data Infosys Ltd....",judgments,"[{'value': 'trademark', 'formInput': 'trademar...",False,
2,1025815.0,[],1999-08-24,a,,kapadia,Cadbury India Limited vs Sm Dyechem Limited on...,www.manupatra.co.in/NXT/gateway.dll/HighCourt3...,"<div class=""docsource_main"">Gujarat High Court...",20.0,...,,True,Gujarat High Court,[],"[{'tid': 1353758, 'title': 'The Companies Act,...","[{'tid': 1965135, 'title': 'Torrent Pharmaceut...",judgments,"[{'value': 'picnic', 'formInput': 'picnic'}, {...",False,
3,1945028.0,[],2001-08-24,s,k,kaul,M/S. Veerumal Praveen Kumar vs M/S. Needle Ind...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",26.0,...,,True,Delhi High Court,[],"[{'tid': 931654, 'title': 'the Designs Act, 20...","[{'tid': 63429287, 'title': 'Radico Khaitan Lt...",judgments,"[{'value': 'trademark', 'formInput': 'trademar...",False,
4,850381.0,[],1995-04-21,a,d,singh,N.R. Dongre And Ors. vs Whirlpool Corporation ...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",24.0,...,,True,Delhi High Court,[],"[{'tid': 1664136, 'title': 'Section 27(2) in T...","[{'tid': 109727818, 'title': 'Toyota Jidosha K...",judgments,"[{'value': 'whirlpool', 'formInput': 'whirlpoo...",False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,1460395.0,[],1997-04-24,m,,sharma,Bimal Govindji Shah vs Panna Lal Chandu Lal on...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",14.0,...,,True,Delhi High Court,[],"[{'tid': 1365016, 'title': 'Section 27(2) in T...","[{'tid': 26238666, 'title': 'Rubaljit Singh vs...",judgments,"[{'value': 'hardware', 'formInput': 'hardware'...",False,
92,1981161.0,[],1996-11-08,k,,ramamoorthy,Rakesh Kumar Aggarwal vs Locks & Locking Devic...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",8.0,...,,True,Delhi High Court,[],"[{'tid': 1005493, 'title': 'The Trade And Merc...","[{'tid': 1235568, 'title': 'Century Continuous...",judgments,"[{'value': 'trade mark', 'formInput': 'trade%2...",False,
93,1281453.0,[],2006-02-01,s,k,kaul,Visakha Chemicals vs Mala Ram And Sons on 1 Fe...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",13.0,...,,True,Delhi High Court,[],"[{'tid': 1098485, 'title': 'Section 45 in the ...","[{'tid': 1209615, 'title': 'Alkem Laboratories...",judgments,"[{'value': 'trademark', 'formInput': 'trademar...",False,
94,145549327.0,[],2012-07-30,m,,singh,Cadila Healthcare Ltd. vs Aureate Healthcare P...,delhi/2012-07-30/IA--800-2011,"<div class=""docsource_main"">Delhi High Court</...",15.0,...,http://lobis.nic.in/dhc/MAN/judgement/30-07-20...,True,Delhi High Court,[],"[{'tid': 84096, 'title': 'Section 29 in The Tr...","[{'tid': 97049398, 'title': 'Charak Pharma Pvt...",judgments,"[{'value': 'trademark', 'formInput': 'trademar...",True,


In [48]:
# Persist the full API results (all columns, no index column) to CSV.
ik_result.to_csv("ikanoon_result.csv",index=False)

In [49]:
# Two-column view of the API results: the document id ("tid") and the
# document body ("doc"), renamed to DOC_ID / CONTENT.
df_with_tid_doc = ik_result[["tid", "doc"]].rename(
    columns={"tid": "DOC_ID", "doc": "CONTENT"}
)

In [50]:
# Inspect the DOC_ID / CONTENT table.
df_with_tid_doc

Unnamed: 0,DOC_ID,CONTENT
0,330608.0,"<div class=""docsource_main"">Supreme Court of I..."
1,1509899.0,"<div class=""docsource_main"">Delhi High Court</..."
2,1025815.0,"<div class=""docsource_main"">Gujarat High Court..."
3,1945028.0,"<div class=""docsource_main"">Delhi High Court</..."
4,850381.0,"<div class=""docsource_main"">Delhi High Court</..."
...,...,...
91,1460395.0,"<div class=""docsource_main"">Delhi High Court</..."
92,1981161.0,"<div class=""docsource_main"">Delhi High Court</..."
93,1281453.0,"<div class=""docsource_main"">Delhi High Court</..."
94,145549327.0,"<div class=""docsource_main"">Delhi High Court</..."


In [51]:
# Persist the DOC_ID / CONTENT table (no index column) to CSV.
df_with_tid_doc.to_csv("doc_id_with_content.csv",index=False)

In [4]:
# html2text converter used below to turn the judgement HTML into plain text;
# it is configured to ignore links.
h = html2text.HTML2Text()
h.ignore_links = True

In [53]:
# Shorthand handles on the CONTENT and DOC_ID columns.
# NOTE(review): neither `t` nor `d` is referenced by any later cell visible in
# this notebook -- candidates for removal.
t = df_with_tid_doc["CONTENT"]
d = df_with_tid_doc["DOC_ID"]

In [65]:
#get text

# Convert each document's HTML to plain text and save it under text_files/,
# named after the document id (or the title when no id came back).
os.makedirs("text_files", exist_ok=True)

# Columns are addressed by name; "tid", "title" and "doc" are the columns that
# used to be read as positions 0, 6 and 8.
for tid, title, doc in zip(ik_result["tid"], ik_result["title"], ik_result["doc"]):
    text = h.handle(str(doc))
    if pd.isna(tid):
        # Titles can contain path separators (e.g. "M/S. ..."), which would
        # otherwise be taken as sub-directories.
        name = str(title).replace("/", "_").replace("\\", "_")
    else:
        name = int(tid)
    # The context manager closes the file even if the write fails.
    with open(os.path.join("text_files", str(name) + ".txt"), 'w', encoding='utf-8') as f:
        f.write(text)

In [73]:
#save html files

# Write each document's raw HTML under text_files_html/, named after the
# document id (or the title when the id is missing).
data = pd.read_csv("ikanoon_result.csv")
os.makedirs("text_files_html", exist_ok=True)

# Ids, titles and content all come from `data`, so the rows stay aligned even
# if `ik_result` has been rebound since the CSV was written.
for tid, title, content in zip(data["tid"], data["title"], data["doc"]):
    if pd.isna(tid):
        # int(NaN) raises; fall back to the title with path separators replaced.
        name = str(title).replace("/", "_").replace("\\", "_")
    else:
        name = int(tid)
    # The context manager flushes and closes every file; the handles used to
    # be left open. str() guards against a missing (NaN) content cell.
    with open(os.path.join("text_files_html", str(name) + ".html"), "w", encoding="utf-8") as f:
        f.write(str(content))

In [15]:
# Fetch one specific judgement by id and save it as plain text.
# NOTE: this rebinds `docs` and `ik_result`, replacing the bulk results that
# earlier cells stored under the same names.
docs = []
name = "139585580"
docs.append(ik.doc(docid=name))
ik_result = pd.DataFrame.from_dict(docs)
# "doc" and "tid" are the columns previously read as positions 8 and 0.
text = h.handle(str(ik_result["doc"].iloc[0]))
# The context manager closes the file even if the write fails.
with open("text_files/"+str(int(ik_result["tid"].iloc[0]))+".txt",'w',encoding='utf-8') as f:
    f.write(text)

In [13]:
# Inspect the single fetched document.
ik_result

Unnamed: 0,tid,covertids,publishdate,firstname,secondname,lastname,title,filename,doc,numcites,numcitedby,covertitles,url,showurl,docsource,covers,citeList,citedbyList,divtype,courtcopy
0,139585580,[],2017-11-28,,,,Triumphant Institute Of ... vs Mahesh Yadav & ...,delhi/2017-11-28/CS COMM--934-2016,"<div class=""docsource_main"">Delhi High Court</...",0,0,[],http://lobis.nic.in/ddir/dhc/VJM/judgement/29-...,True,Delhi High Court,[],[],[],judgments,True


In [14]:
# Show the value at row 0, column position 8 (the "doc" column in the table above).
ik_result.iloc[0,8]

'<div class="docsource_main">Delhi High Court</div>\n<div class="doc_title">Triumphant Institute Of ... vs Mahesh Yadav &amp; Anr. on 28 November, 2017</div><pre id="pre_1">*            IN THE HIGH COURT OF DELHI AT NEW DELHI\n\n+                   CS(COMM) No. 934/2016 &amp; I.A 10996/2016\n\n%                                                28th November, 2017\n\nTRIUMPHANT INSTITUTE OF MANAGEMENT EDUCATION\nPVT. LTD.                                  ..... Plaintiff\n                 Through: Ms. Bitika Sharma and Mr.\n                          Lakshay Kaushik, Advocates.\n\n                           versus\n\nMAHESH YADAV &amp; ANR.                                      ..... Defendants\n\nCORAM:\nHON\'BLE MR. JUSTICE VALMIKI J.MEHTA\n\nTo be referred to the Reporter or not?\n\nVALMIKI J. MEHTA, J (ORAL)\n\n1.           The present suit is filed by the plaintiff seeking the reliefs\n\nof injunction restraining the defendants from violating the rights in the\n\ntrademark \'T.I.M.E\' (

In [17]:
# Save the single fetched document's raw HTML, named after its document id.
# "doc" and "tid" are the columns previously read as positions 8 and 0.
# The context manager closes the file even if the write fails.
with open("text_files_html/"+str(int(ik_result["tid"].iloc[0]))+".html","w",encoding="utf-8") as f:
    f.write(str(ik_result["doc"].iloc[0]))