In [20]:
import requests
from urllib.parse import urljoin
from datetime import datetime
import pandas as pd

class IndianKanoon:
  """
    Thin client for the Indian Kanoon REST API.

    https://github.com/aishik-pyne/kanoon
    Search query	https://api.indiankanoon.org/search/?formInput=<query>&pagenum=<pagenum>
    Document	https://api.indiankanoon.org/doc/<docid>/
    Document fragments	https://api.indiankanoon.org/docfragment/<docid>/?formInput=<query>
    Document Metainfo	https://api.indiankanoon.org/docmeta/<docid>/

    Every public method sends a POST request through a shared
    ``requests.Session``, raises ``requests.HTTPError`` on a non-2xx
    response and returns the decoded JSON body.
  """

  def __init__(self, auth_token=None):
    """
      Create the client and its authenticated session.

      auth_token: optional API token. When omitted, the environment
        variable INDIANKANOON_API_TOKEN is used if set, otherwise the
        token that was historically embedded in this file.
    """
    import os  # local import so the class does not rely on another cell

    self.base_url = "https://api.indiankanoon.org/"
    # SECURITY(review): the literal fallback is a credential committed to
    # source control. It is kept only so existing `IndianKanoon()` calls
    # keep working; rotate it and supply the token via argument/env var.
    self.auth_token = (
        auth_token
        or os.environ.get("INDIANKANOON_API_TOKEN")
        or "6c0262cd025351a53ab784b21634260276288d19"
    )

    self.headers = {
        'authorization': "Token {}".format(self.auth_token),
        'cache-control': "no-cache",
    }
    self.api_session = requests.Session()
    # Update instead of assigning: assignment would replace the session's
    # default, case-insensitive header collection with a plain dict.
    self.api_session.headers.update(self.headers)

  def _post(self, path, params=None):
    """POST to `path` (relative to base_url) and return the JSON body."""
    response = self.api_session.post(
        urljoin(self.base_url, path), params=params)
    response.raise_for_status()
    return response.json()

  def search(self, formInput, pagenum=0,
             fromdate=None, todate=None,
             title=None, author=None,
             cite=None, bench=None, maxpages=50):
    """
      Run a search query.

      formInput: query string.
      pagenum: result page to request.
      fromdate / todate: optional `datetime` bounds, sent as DD-MM-YYYY.
      title / author / cite / bench: optional filters, forwarded as query
        parameters of the same name when given.
      maxpages: forwarded as the `maxpages` query parameter.

      Returns the decoded JSON response.
      Raises AssertionError if a date bound is not a `datetime`, and
      requests.HTTPError on an HTTP error status.
    """
    #  Creating parameters
    params = {
        'formInput': formInput,
        'pagenum': pagenum,
        'maxpages': maxpages
    }
    if fromdate:
      assert isinstance(fromdate, datetime)
      # '%Y' (not '&Y'): the year must be substituted into the date string.
      params['fromdate'] = fromdate.strftime('%d-%m-%Y')

    if todate:
      assert isinstance(todate, datetime)
      params['todate'] = todate.strftime('%d-%m-%Y')

    # These filters used to be accepted but silently dropped.
    optional_filters = {
        'title': title,
        'author': author,
        'cite': cite,
        'bench': bench,
    }
    for key, value in optional_filters.items():
      if value:
        params[key] = value

    # Making the request
    return self._post('search/', params=params)

  def doc(self, docid):
    """Fetch the document with id `docid`; returns the decoded JSON."""
    return self._post('doc/{}/'.format(docid))

  def docfragment(self, docid, formInput):
    """Fetch the fragments of document `docid` for the query `formInput`."""
    params = {
        'formInput': formInput,
    }
    return self._post('docfragment/{}/'.format(docid), params=params)

  def docmeta(self, docid):
    """Fetch the meta information of document `docid`."""
    return self._post('docmeta/{}/'.format(docid))

In [21]:
import pandas as pd
import numpy as np
import re

In [22]:
# Load the judgement / similar-judgement link table from the working directory.
data = pd.read_csv('casemine.csv')

In [23]:
# Preview the first rows to check the column layout (the link columns are read by position later).
data.head()

Unnamed: 0,Judgement Name,IndianKanoon Link,Similar Judgement,IndianKanoon Link.1,Rank
0,Wander Ltd. And Another v. Antox India P. Ltd. .,https://indiankanoon.org/doc/330608/,Astrazeneca Uk Ltd. v. Orchid Chemicals & Phar...,https://indiankanoon.org/doc/1509899/,1
1,Wander Ltd. And Another v. Antox India P. Ltd. .,https://indiankanoon.org/doc/330608/,S.M Dyechem Ltd. v. Cadbury (India) Ltd. .,https://indiankanoon.org/doc/1025815/,2
2,Wander Ltd. And Another v. Antox India P. Ltd. .,https://indiankanoon.org/doc/330608/,Veerumal Praveen Kumar v. Needle Industries (I...,https://indiankanoon.org/doc/1945028/,3
3,Wander Ltd. And Another v. Antox India P. Ltd. .,https://indiankanoon.org/doc/330608/,N.R Dongre And Others v. Whirlpool Corporation...,https://indiankanoon.org/doc/850381/,4
4,Wander Ltd. And Another v. Antox India P. Ltd. .,https://indiankanoon.org/doc/330608/,Pfizer Products Inc. v. Rajesh Chopra,https://indiankanoon.org/doc/1674695/,5


In [24]:
# Collect the document id (first run of digits) from both link columns,
# then keep each id once, sorted.
doc_id_pattern = re.compile(r'[0-9]+')

# Columns 1 and 3 are read by position, exactly as before.
link_cells = list(data.iloc[:, 1]) + list(data.iloc[:, 3])
id_matches = (doc_id_pattern.search(str(cell)) for cell in link_cells)

# Cells without any digits produce no match and are skipped.
tids = np.unique([found.group() for found in id_matches if found])
    
    

In [25]:
# Inspect the sorted, de-duplicated document ids (as strings).
tids

array(['1004544', '102042281', '1025815', '1062207', '106681665',
       '1114158', '1118022', '112995556', '1165153', '117903', '1202388',
       '1238420', '1239673', '125686531', '1269197', '1281050', '1281453',
       '1300598', '1306228', '13352', '136111', '1375229', '141920684',
       '14349', '145549327', '1460395', '147470112', '1502681', '1509899',
       '152715128', '1557873', '157791679', '1611564', '1640143',
       '1657978', '1674695', '170719338', '172383107', '1741869',
       '1771880', '1784548', '1789265', '179972913', '18135822',
       '181424088', '1857898', '1864885', '1900841', '192230665',
       '19267666', '1945028', '19504407', '1965135', '1981161', '1983225',
       '27816811', '304538', '321682', '3304039', '330608', '340501',
       '357841', '374690', '421030', '42490190', '49386818', '508915',
       '52700318', '574884', '60866758', '614155', '614536', '615459',
       '618763', '645115', '646292', '652828', '6544121', '663392',
       '67204010', '

In [26]:
# Create the API client; this builds the authenticated requests session.
ik = IndianKanoon()

In [27]:
# Download every document, one API call per id, printing a 1-based counter.
docs = []
for file_number, tid in enumerate(tids, start=1):
    fetched = ik.doc(docid=str(tid))
    docs.append(fetched)
    print("FILE NUMBER: ", file_number)

FILE NUMBER:  1
FILE NUMBER:  2
FILE NUMBER:  3
FILE NUMBER:  4
FILE NUMBER:  5
FILE NUMBER:  6
FILE NUMBER:  7
FILE NUMBER:  8
FILE NUMBER:  9
FILE NUMBER:  10
FILE NUMBER:  11
FILE NUMBER:  12
FILE NUMBER:  13
FILE NUMBER:  14
FILE NUMBER:  15
FILE NUMBER:  16
FILE NUMBER:  17
FILE NUMBER:  18
FILE NUMBER:  19
FILE NUMBER:  20
FILE NUMBER:  21
FILE NUMBER:  22
FILE NUMBER:  23
FILE NUMBER:  24
FILE NUMBER:  25
FILE NUMBER:  26
FILE NUMBER:  27
FILE NUMBER:  28
FILE NUMBER:  29
FILE NUMBER:  30
FILE NUMBER:  31
FILE NUMBER:  32
FILE NUMBER:  33
FILE NUMBER:  34
FILE NUMBER:  35
FILE NUMBER:  36
FILE NUMBER:  37
FILE NUMBER:  38
FILE NUMBER:  39
FILE NUMBER:  40
FILE NUMBER:  41
FILE NUMBER:  42
FILE NUMBER:  43
FILE NUMBER:  44
FILE NUMBER:  45
FILE NUMBER:  46
FILE NUMBER:  47
FILE NUMBER:  48
FILE NUMBER:  49
FILE NUMBER:  50
FILE NUMBER:  51
FILE NUMBER:  52
FILE NUMBER:  53
FILE NUMBER:  54
FILE NUMBER:  55
FILE NUMBER:  56
FILE NUMBER:  57
FILE NUMBER:  58
FILE NUMBER:  59
FILE N

In [28]:
import html2text

In [29]:
# HTML-to-text converter used on the document bodies further down.
h = html2text.HTML2Text()
# Ask html2text to ignore links (see the html2text docs for the exact effect on output).
h.ignore_links = True

In [30]:
# Inspect the raw API responses.
# NOTE(review): this renders every fetched document in full; consider docs[:1] to keep the notebook small.
docs

[{'tid': 1004544,
  'covertids': [],
  'publishdate': '2001-08-21',
  'firstname': 'a',
  'secondname': None,
  'lastname': 'sikri',
  'title': 'Khandelwal Laboratories Ltd. vs Fdc Limited on 21 August, 2001',
  'filename': 'www.manupatra.co.in/NXT/gateway.dll/highcourt1/Delhi/delhi2001/dl2001/d010838.htm',
  'doc': '<div class="docsource_main">Delhi High Court</div>\n<div class="doc_title">Khandelwal Laboratories Ltd. vs Fdc Limited on 21 August, 2001</div><div class="doc_citations">Equivalent citations: 94 (2001) DLT 141</div>\n\n<div class="doc_author">Author: A Sikri</div>\n\n<div class="doc_bench">Bench: A Sikri</div>\n\n<p id="p_1">ORDER\n \n\n A.K. Sikri, J.\n</p>\n \n\n<p id="p_2">1. The plaintiff is a company incorporated under the <a href="/doc/257409/" id="a_1">Indian Companies Act</a> which is carrying on business, inter alia, of manufacturer and seller of pharmaceuticals and medicinal preparations. It claims to be a well known pharmaceutical company enjoying very high repu

In [31]:
# One row per fetched response dict; the dict keys become the columns.
x = pd.DataFrame.from_dict(docs)

In [32]:
# Inspect the response table.
# NOTE(review): tid renders as float and an errmsg column is present, which suggests
# some responses carried no tid — confirm before relying on every row.
x

Unnamed: 0,tid,covertids,publishdate,firstname,secondname,lastname,title,filename,doc,numcites,...,url,showurl,docsource,covers,citeList,citedbyList,divtype,relatedqs,courtcopy,errmsg
0,1004544.0,[],2001-08-21,a,,sikri,Khandelwal Laboratories Ltd. vs Fdc Limited on...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",2.0,...,,True,Delhi High Court,[],"[{'tid': 257409, 'title': 'The Indian Companie...","[{'tid': 140343706, 'title': 'Fdc Limited vs M...",judgments,"[{'value': 'pharmaceutical', 'formInput': 'pha...",False,
1,102042281.0,[],2014-07-01,s,,sachdeva,"Stiefel Laboratories, Inc & Anr vs Ajanta Phar...",delhi/2014-07-01/CS(OS)--2373-2013,"<div class=""docsource_main"">Delhi High Court</...",9.0,...,http://lobis.nic.in/dhc/SAS/judgement/01-07-20...,True,Delhi High Court,[],"[{'tid': 1181080, 'title': 'Century Traders vs...","[{'tid': 117371033, 'title': 'M/S Deelux Cable...",judgments,"[{'value': 'trademark', 'formInput': 'trademar...",True,
2,1025815.0,[],1999-08-24,a,,kapadia,Cadbury India Limited vs Sm Dyechem Limited on...,www.manupatra.co.in/NXT/gateway.dll/HighCourt3...,"<div class=""docsource_main"">Gujarat High Court...",20.0,...,,True,Gujarat High Court,[],"[{'tid': 1353758, 'title': 'The Companies Act,...","[{'tid': 1965135, 'title': 'Torrent Pharmaceut...",judgments,"[{'value': 'picnic', 'formInput': 'picnic'}, {...",False,
3,1062207.0,[],2001-03-21,d,,deshmukh,Agromore (P) Ltd. And Anr. vs Chembond Chemica...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Bombay High Court<...",20.0,...,,True,Bombay High Court,[],"[{'tid': 181078, 'title': 'Section 33 in The C...","[{'tid': 1373362, 'title': 'Ms. Supriya Prabhu...",judgments,"[{'value': 'trade mark use', 'formInput': 'tra...",False,
4,106681665.0,[],1983-09-30,d,,desai.,Vishwa Mitter Of Vijay Bharat ... vs O.P. Podd...,www.manupatra.co.in/NXT/gateway.dll/sc/1980-20...,"<div class=""docsource_main"">Supreme Court of I...",17.0,...,,True,Supreme Court of India,[],"[{'tid': 1017213, 'title': 'The Trade Marks Ac...","[{'tid': 47858029, 'title': 'A.C. Narayanan vs...",judgments,"[{'value': 'beedi', 'formInput': 'beedi'}, {'v...",False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,8433581.0,[],2012-02-22,m,,singh,Premier Tissues India Ltd. vs Rolia Tissues In...,delhi/2012-02-22/IA--10846-2011,"<div class=""docsource_main"">Delhi High Court</...",8.0,...,http://lobis.nic.in/dhc/MAN/judgement/23-02-20...,True,Delhi High Court,[],"[{'tid': 1136195, 'title': 'the Copyright Act,...","[{'tid': 149551393, 'title': 'Itc Limited vs W...",judgments,"[{'value': 'trade mark', 'formInput': 'trade%2...",True,
92,850381.0,[],1995-04-21,a,d,singh,N.R. Dongre And Ors. vs Whirlpool Corporation ...,www.manupatra.co.in/NXT/gateway.dll/highcourt1...,"<div class=""docsource_main"">Delhi High Court</...",24.0,...,,True,Delhi High Court,[],"[{'tid': 1664136, 'title': 'Section 27(2) in T...","[{'tid': 109727818, 'title': 'Toyota Jidosha K...",judgments,"[{'value': 'whirlpool', 'formInput': 'whirlpoo...",False,
93,858066.0,[],2006-08-29,,,s.b.sinha,Ramdev Food Products Pvt. Ltd vs Arvindbhai Ra...,judis.nic.in/2006-08-29/Ramdev Food Products P...,"<div class=""docsource_main"">Supreme Court of I...",55.0,...,http://judis.nic.in/supremecourt/imgst.aspx?fi...,True,Supreme Court of India,[],"[{'tid': 32969400, 'title': 'The Prevention of...","[{'tid': 58880080, 'title': 'Lifestyle Equitie...",judgments,"[{'value': 'ramdev', 'formInput': 'ramdev'}, {...",True,
94,94612663.0,[],2012-05-18,,,a.k.sikri,United Biotech Pvt. Ltd. vs Orchid Chemicals &...,delhi/2012-05-18/LPA--679-2011,"<div class=""docsource_main"">Delhi High Court</...",30.0,...,http://lobis.nic.in/dhc/AKS/judgement/25-05-20...,True,Delhi High Court,[],"[{'tid': 1331119, 'title': 'Section 124 in The...","[{'tid': 176801297, 'title': 'Anchor Health An...",judgments,"[{'value': 'trademark', 'formInput': 'trademar...",True,


In [33]:
# Empty frame with the two columns that are filled from the response table.
text_df = pd.DataFrame(columns=["DOC_ID","CONTENT"])

In [35]:
# Copy the HTML body and the document id over from the response table.
text_df["CONTENT"] = x["doc"]
text_df["DOC_ID"] = x["tid"]

In [36]:
# Inspect the id/content table before cleaning.
text_df

Unnamed: 0,DOC_ID,CONTENT
0,1004544.0,"<div class=""docsource_main"">Delhi High Court</..."
1,102042281.0,"<div class=""docsource_main"">Delhi High Court</..."
2,1025815.0,"<div class=""docsource_main"">Gujarat High Court..."
3,1062207.0,"<div class=""docsource_main"">Bombay High Court<..."
4,106681665.0,"<div class=""docsource_main"">Supreme Court of I..."
...,...,...
91,8433581.0,"<div class=""docsource_main"">Delhi High Court</..."
92,850381.0,"<div class=""docsource_main"">Delhi High Court</..."
93,858066.0,"<div class=""docsource_main"">Supreme Court of I..."
94,94612663.0,"<div class=""docsource_main"">Delhi High Court</..."


In [40]:
# Drop rows where DOC_ID or CONTENT is missing. This mutates text_df in place
# and does not renumber the index, so labels may have gaps afterwards.
text_df.dropna(how='any',axis=0,inplace=True)

In [41]:
# Re-display to check the result of dropping incomplete rows.
text_df

Unnamed: 0,DOC_ID,CONTENT
0,1004544.0,"<div class=""docsource_main"">Delhi High Court</..."
1,102042281.0,"<div class=""docsource_main"">Delhi High Court</..."
2,1025815.0,"<div class=""docsource_main"">Gujarat High Court..."
3,1062207.0,"<div class=""docsource_main"">Bombay High Court<..."
4,106681665.0,"<div class=""docsource_main"">Supreme Court of I..."
...,...,...
91,8433581.0,"<div class=""docsource_main"">Delhi High Court</..."
92,850381.0,"<div class=""docsource_main"">Delhi High Court</..."
93,858066.0,"<div class=""docsource_main"">Supreme Court of I..."
94,94612663.0,"<div class=""docsource_main"">Delhi High Court</..."


In [42]:
# Cast the ids (shown as floats above) to integers; this relies on the missing ids having been dropped first.
text_df['DOC_ID'] = text_df['DOC_ID'].astype(int)

In [43]:
# Confirm DOC_ID now renders as integers.
text_df

Unnamed: 0,DOC_ID,CONTENT
0,1004544,"<div class=""docsource_main"">Delhi High Court</..."
1,102042281,"<div class=""docsource_main"">Delhi High Court</..."
2,1025815,"<div class=""docsource_main"">Gujarat High Court..."
3,1062207,"<div class=""docsource_main"">Bombay High Court<..."
4,106681665,"<div class=""docsource_main"">Supreme Court of I..."
...,...,...
91,8433581,"<div class=""docsource_main"">Delhi High Court</..."
92,850381,"<div class=""docsource_main"">Delhi High Court</..."
93,858066,"<div class=""docsource_main"">Supreme Court of I..."
94,94612663,"<div class=""docsource_main"">Delhi High Court</..."


In [44]:
# Short aliases: d holds the HTML bodies, t the document ids (both keep text_df's index).
d = text_df['CONTENT']
t = text_df['DOC_ID']

In [None]:
# Convert each document's HTML body to plain text.
# Positional access is required: the in-place dropna() earlier does not renumber
# the index, so label-based d[i] can raise KeyError or miss trailing rows.
# NOTE(review): in the lines visible here the converted text is not stored anywhere.
for i in range(len(t)):
    text = d.iloc[i]
    text = h.handle(text)
    