In [1]:
import requests
from urllib.parse import urljoin
from datetime import datetime
import pandas as pd

class IndianKanoon:
  """
    Thin client for the Indian Kanoon REST API (https://api.indiankanoon.org/).

    Adapted from https://github.com/aishik-pyne/kanoon

    Endpoints wrapped (every call is sent as an authenticated POST):
      search       search/?formInput=<query>&pagenum=<pagenum>
      doc          doc/<docid>/
      docfragment  docfragment/<docid>/?formInput=<query>
      docmeta      docmeta/<docid>/

    Each method returns the decoded JSON body and raises
    ``requests.HTTPError`` (via ``raise_for_status``) on a 4xx/5xx reply.
  """

  def __init__(self, auth_token=None):
    """
      Create an authenticated session.

      auth_token: API token. When omitted, the INDIANKANOON_API_TOKEN
        environment variable is used, and only if that is unset too does the
        client fall back to the legacy token embedded below.
    """
    import os  # function-scope import keeps this class self-contained

    self.base_url = "https://api.indiankanoon.org/"
    # NOTE(security): a credential committed to a notebook is effectively
    # public. The literal is kept only so existing `IndianKanoon()` calls keep
    # working; rotate it and supply the token via argument/env var instead.
    self.auth_token = (
        auth_token
        or os.environ.get("INDIANKANOON_API_TOKEN")
        or "6c0262cd025351a53ab784b21634260276288d19"
    )

    self.headers = {
        'authorization': "Token {}".format(self.auth_token),
        'cache-control': "no-cache",
    }
    self.api_session = requests.Session()
    # update() instead of assignment: keeps the session's default headers
    # rather than replacing them with a plain dict.
    self.api_session.headers.update(self.headers)

  def _post(self, path, params=None):
    """POST to `path` (relative to base_url) and return the decoded JSON."""
    response = self.api_session.post(
        urljoin(self.base_url, path), params=params)
    response.raise_for_status()
    return response.json()

  def search(self, formInput, pagenum=0,
             fromdate=None, todate=None,
             title=None, author=None,
             cite=None, bench=None, maxpages=50):
    """
      Run a search query.

      formInput: query string.
      pagenum:   first result page to return.
      fromdate, todate: optional `datetime` bounds, sent as DD-MM-YYYY.
      title, author, cite, bench: optional filters, forwarded under the same
        parameter names when given.
      maxpages:  number of result pages to return in this call.

      Returns the decoded JSON response.
    """
    params = {
        'formInput': formInput,
        'pagenum': pagenum,
        'maxpages': maxpages
    }
    if fromdate:
      assert isinstance(fromdate, datetime)
      # Was '%d-%m-&Y', which sent a literal "&Y" instead of the year.
      params['fromdate'] = fromdate.strftime('%d-%m-%Y')

    if todate:
      assert isinstance(todate, datetime)
      params['todate'] = todate.strftime('%d-%m-%Y')

    # These filters were accepted but never sent; forward the ones provided.
    optional_filters = (
        ('title', title), ('author', author), ('cite', cite), ('bench', bench))
    for name, value in optional_filters:
      if value not in (None, ""):
        params[name] = value

    return self._post('search/', params=params)

  def doc(self, docid):
    """Return the JSON record served by doc/<docid>/."""
    return self._post('doc/{}/'.format(docid))

  def docfragment(self, docid, formInput):
    """Return the JSON served by docfragment/<docid>/ for query `formInput`."""
    return self._post('docfragment/{}/'.format(docid),
                      params={'formInput': formInput})

  def docmeta(self, docid):
    """Return the JSON served by docmeta/<docid>/."""
    return self._post('docmeta/{}/'.format(docid))

In [2]:
# Run the seed search and load its hits into a DataFrame (`z`), which later
# cells use for the document ids and for the CSV export.
ik = IndianKanoon()
x = ik.search(formInput="ipr+judgement",maxpages=1000)
# Number of entries under the response's "docs" key.
print(len(x["docs"]))
y = x["docs"]
z = pd.DataFrame.from_dict(y)
# Show which fields each hit carries.
z.columns

224


Index(['title', 'covertitles', 'numcites', 'numcitedby', 'url', 'headline',
       'docsize', 'tid', 'covertids', 'doctype', 'publishdate', 'firstname',
       'secondname', 'lastname', 'fragment', 'docsource', 'author',
       'authorEncoded', 'covers'],
      dtype='object')

In [3]:
# The `tid` column of the search results is used as the document id when
# fetching each document in the next cell.
doc_ids = z['tid']
print(doc_ids)
# Accumulator filled by the fetch loop in the next cell.
docs=[]

0      148082701
1       98328847
2       25809464
3      179710454
4      198289988
         ...    
219     21299781
220    135705013
221     50727941
222    192036518
223    175973391
Name: tid, Length: 224, dtype: int64


In [4]:
# Fetch the full record for every search hit (one API call per document).
# `docs` is rebuilt here so that re-running this cell cannot append a second
# copy of every record to the list created in the previous cell.
docs = []
# Iterate over the values rather than positions so the loop does not depend
# on `doc_ids` having a default 0..n-1 index.
for file_number, doc_id in enumerate(doc_ids, start=1):
    docs.append(ik.doc(docid=str(doc_id)))
    print("FILE NUMBER: ", file_number)

FILE NUMBER:  1
FILE NUMBER:  2
FILE NUMBER:  3
FILE NUMBER:  4
FILE NUMBER:  5
FILE NUMBER:  6
FILE NUMBER:  7
FILE NUMBER:  8
FILE NUMBER:  9
FILE NUMBER:  10
FILE NUMBER:  11
FILE NUMBER:  12
FILE NUMBER:  13
FILE NUMBER:  14
FILE NUMBER:  15
FILE NUMBER:  16
FILE NUMBER:  17
FILE NUMBER:  18
FILE NUMBER:  19
FILE NUMBER:  20
FILE NUMBER:  21
FILE NUMBER:  22
FILE NUMBER:  23
FILE NUMBER:  24
FILE NUMBER:  25
FILE NUMBER:  26
FILE NUMBER:  27
FILE NUMBER:  28
FILE NUMBER:  29
FILE NUMBER:  30
FILE NUMBER:  31
FILE NUMBER:  32
FILE NUMBER:  33
FILE NUMBER:  34
FILE NUMBER:  35
FILE NUMBER:  36
FILE NUMBER:  37
FILE NUMBER:  38
FILE NUMBER:  39
FILE NUMBER:  40
FILE NUMBER:  41
FILE NUMBER:  42
FILE NUMBER:  43
FILE NUMBER:  44
FILE NUMBER:  45
FILE NUMBER:  46
FILE NUMBER:  47
FILE NUMBER:  48
FILE NUMBER:  49
FILE NUMBER:  50
FILE NUMBER:  51
FILE NUMBER:  52
FILE NUMBER:  53
FILE NUMBER:  54
FILE NUMBER:  55
FILE NUMBER:  56
FILE NUMBER:  57
FILE NUMBER:  58
FILE NUMBER:  59
FILE N

In [5]:
# One row per fetched document record.
z2 = pd.DataFrame.from_dict(docs)

In [6]:
# Plain-text dump of the fetched documents for a quick visual check.
print(z2)

           tid covertids publishdate firstname secondname lastname  \
0    148082701        []  2008-09-26         p          c    ghose   
1     98328847        []  2021-09-06      None       None     None   
2     25809464        []  2015-07-03      None       None     None   
3    179710454        []  2009-04-06         s          d    singh   
4    198289988        []  2009-04-06         s          d    singh   
..         ...       ...         ...       ...        ...      ...   
219   21299781        []  2013-05-23         m          n   sharma   
220  135705013        []  2009-05-05         s          d    singh   
221   50727941        []  2010-05-28         s          h   dudani   
222  192036518        []  2021-03-03      None       None     None   
223  175973391        []  2019-01-08      None       None     None   

                                                 title  \
0    Radio Today Broadcasting Ltd vs Indian Perform...   
1    M/S Nysb Ventures Pvt. Ltd., A ... vs 

In [7]:
import pandas as pd
# Lift pandas' row/column display limits so printed frames are not truncated.
# This is global for the rest of the session, so later prints show every row.
pd.set_option("display.max_rows", None, "display.max_columns",None)

In [8]:
# Empty two-column frame; it is filled from `z2` in the next cell.
z3 = pd.DataFrame(columns=["DOC_ID","CONTENT"])

In [9]:
# Keep only the document id and the `doc` field (HTML, per the printed
# output) from the fetched records.
z3['DOC_ID'] = z2['tid']
z3['CONTENT'] = z2['doc']

In [10]:
# Dump of the id/content pairs; with the display limits lifted above this
# prints every row.
print(z3)

        DOC_ID                                            CONTENT
0    148082701  <div class="docsource_main">Calcutta High Cour...
1     98328847  <div class="docsource_main">Delhi High Court -...
2     25809464  <div class="docsource_main">Custom, Excise & S...
3    179710454  <div class="docsource_main">Delhi District Cou...
4    198289988  <div class="docsource_main">Delhi District Cou...
5     21986035  <div class="docsource_main">Delhi District Cou...
6     78969837  <div class="docsource_main">Delhi High Court -...
7    156395582  <div class="docsource_main">Custom, Excise & S...
8    195686272  <div class="docsource_main">Delhi District Cou...
9     12318794  <div class="docsource_main">Delhi District Cou...
10    39790934  <div class="docsource_main">Delhi District Cou...
11    75424765  <div class="docsource_main">Delhi District Cou...
12     6615285  <div class="docsource_main">Delhi District Cou...
13    49794879  <div class="docsource_main">Delhi District Cou...
14     351

In [11]:
# Persist the three stages to the working directory, without the index column:
#   z  - search hits, z2 - full fetched records, z3 - id + content only.
# The second and third files are read back by later cells.
z.to_csv("search_ipr_judgements_results.csv",index=False)
z2.to_csv("doc_id_search_results_ipr_judgements.csv",index=False)
z3.to_csv("doc_contents_only_ipr_judgements.csv",index=False)

In [13]:
import html2text

In [14]:
# Shared HTML-to-text converter; the export loops below call `h.handle(...)`.
h = html2text.HTML2Text()
# html2text option: NOTE(review) presumably omits link targets from the
# converted text — confirm against the html2text documentation.
h.ignore_links = True

In [15]:
import pandas as pd
# Reload the id/content pairs written by the export cell, so this stage can
# start from the CSV instead of the in-memory frames.
data=pd.read_csv("doc_contents_only_ipr_judgements.csv")

In [16]:
# t: document content column, d: matching document ids (same row order).
# Both short names are reused for other data further down the notebook.
t = data["CONTENT"]
d = data["DOC_ID"]

In [17]:
#import os
#os.mkdir("text files")  # one-time setup; name must match the "text files/" folder the export loop writes to

In [22]:
import os

# Convert each document's HTML to plain text and write it to
# "text files/<DOC_ID>.txt".
# Create the target folder if needed so the loop does not fail on a fresh
# checkout.
os.makedirs("text files", exist_ok=True)
for i in range(len(t)):
    text = h.handle(t[i])
    # `with` guarantees the handle is closed even if the write raises.
    with open("text files/"+str(d[i])+".txt",'w+',encoding='utf-8') as f:
        f.write(text)

In [None]:
#================================================================================================
#================================================================================================

#================================================================================================

#                          24TH FEBRUARY 2021, RETRIEVING FURTHER DOCS

#================================================================================================
#================================================================================================

#================================================================================================

In [23]:
import pandas as pd
# Reload the full fetched records written earlier. NOTE: this rebinds `data`,
# which previously held the id/content CSV.
data = pd.read_csv("doc_id_search_results_ipr_judgements.csv")
#print(data)
tid = data['tid'].tolist()
print(type(tid))
# citeList comes back from the CSV as text (the parsing cell below calls
# .split on each entry), not as Python lists.
citeList = data['citeList'].tolist()
# Normalise every id to a built-in int, in place.
for i in range(len(tid)):
    tid[i] = int(tid[i])

<class 'list'>


In [27]:
import re

# Matches one `'tid': <digits>` entry inside a stringified citeList cell.
# Anchoring on the key means titles that contain "tid", digits or commas can
# no longer corrupt an id or crash int() on an empty string, which the
# previous comma-split / strip-non-digits approach allowed.
TID_PATTERN = re.compile(r"""['"]tid['"]:\s*(\d+)""")

# t[i] holds the tids cited by document i; the next cell merges them into `tid`.
t=[]
for i in range(len(citeList)):
    print("======================= ITERATION: ",i," =======================")
    cite_repr = citeList[i]
    if isinstance(cite_repr, str):
        citelist = [int(found) for found in TID_PATTERN.findall(cite_repr)]
    else:
        # Empty CSV cells load as NaN (a float); treat them as "cites nothing".
        citelist = []
    for cited_tid in citelist:
        print(cited_tid)
    t.append(citelist)

1940444
1136195
797096
43316752
243280
104566
1139927
180317
257409
1132672
517539
268919
1823824
750738
1132672
517539
268919
1823824
750738
1136195
445276
999236
969470
121966659
84091218
1017213
104566
1136195
1038145
363642
588136
1129646
1569253
1136195
1038145
588136
1704109
745003
445276
1132672
745003
445276
1132672
445276
1132672
1136195
1233094
363642
1817184
262262
1017213
967946
1368430
1599302
1136195
999236
1666329
359894
298017
865758
217797
564101
1674695
425768
1981786
185112174
814605
789969
844026
2959282
117237824
814605
789969
2959282
117237824
1965344
814605
789969
12972852
938899
163813330
814605
789969
844026
117237824
2959282
817828
300602
946858
368047
237570
1353758
789969
1668825
633848
248374
104566
1656199
1937976
356807
110162683
104566
237570
20329360
1406924
17425835
104566
1700055
110162683
104566
73645399
1641262
1937835
1653206
104566
104566
1444915
1266611
104566
1444915
1266611
82982698
1469183
85233116
134179606
23088775
104566
994832
1569253
1136

In [28]:
print(t)
# Flatten the per-document citation lists into `tid`, so it holds the original
# documents plus everything they cite (duplicates are removed in a later cell).
# Appends in place: running this cell twice adds every cited id twice.
for i in range(len(t)):
    for j in range(len(t[i])):
        tid.append((t[i])[j])

[[1940444, 1136195, 797096, 43316752, 243280], [], [104566, 1139927, 180317, 257409], [1132672, 517539, 268919, 1823824, 750738], [1132672, 517539, 268919, 1823824, 750738], [1136195, 445276], [999236, 969470, 121966659, 84091218, 1017213], [104566], [1136195, 1038145, 363642, 588136, 1129646], [1569253, 1136195, 1038145, 588136, 1704109], [745003, 445276, 1132672], [745003, 445276, 1132672], [445276, 1132672], [1136195, 1233094, 363642, 1817184], [262262, 1017213, 967946, 1368430, 1599302], [1136195, 999236, 1666329, 359894, 298017], [865758, 217797, 564101, 1674695, 425768], [1981786], [185112174], [814605, 789969, 844026, 2959282, 117237824], [814605, 789969, 2959282, 117237824, 1965344], [814605, 789969, 12972852, 938899, 163813330], [814605, 789969, 844026, 117237824, 2959282], [], [817828, 300602, 946858, 368047, 237570], [1353758, 789969, 1668825, 633848, 248374], [104566, 1656199, 1937976, 356807, 110162683], [104566, 237570, 20329360, 1406924, 17425835], [104566, 1700055, 1101

In [33]:
import numpy as np

In [44]:
# De-duplicate the ids. np.unique returns a sorted NumPy array, so `tid` is no
# longer a list after this cell.
tid = np.unique(tid)

In [61]:
# Print the position of document 47600234 in `tid`; the printed index is the
# start offset hard-coded in the fetch loop of the next cell.
for i in range(len(tid)):
    if tid[i]==47600234:
        print(i)

470


In [60]:
# Fetch documents starting at index 470 of `tid` (the position of tid 47600234
# printed by the lookup cell).
# NOTE(review): `content` is reset to an empty list here, so records fetched
# before index 470 are not in it unless they were saved elsewhere — confirm.
# NOTE(review): any HTTP error aborts the loop; the recorded output shows a 403
# on the first request.
ik = IndianKanoon()
content = []
for i in range(470,len(tid)):
    print("Iteration: ",i+1)
    x = ik.doc(docid=tid[i])
    content.append(x)

Iteration:  471


HTTPError: 403 Client Error: Forbidden for url: https://api.indiankanoon.org/doc/47600234/

In [52]:
# Tabulate the fetched cited documents and keep only id + content for export.
z_cites_doc = pd.DataFrame.from_dict(content)
similar_doc_content = pd.DataFrame(columns=['DOC_ID','CONTENT'])
similar_doc_content['DOC_ID'] = z_cites_doc['tid']
similar_doc_content['CONTENT'] = z_cites_doc['doc']

# The "cites_text_files/" folder is not created here, so it must already exist.
similar_doc_content.to_csv("cites_text_files/cites_doc_content.csv",index=False)

In [53]:
# Display the cited-document records (rich notebook output).
z_cites_doc

Unnamed: 0,tid,covertids,publishdate,firstname,secondname,lastname,title,filename,doc,numcites,numcitedby,covertitles,url,showurl,docsource,covers,citeList,citedbyList,divtype,relatedqs,courtcopy
0,6749,[],1974-01-01,,,,"The Interest- Tax Act, 1974",indiacode/197445,"<div class=""docsource_main"">Central Government...",0,2146,[],,True,Central Government Act,[],[],"[{'tid': 1782963, 'title': 'Unit Trust Of Indi...",acts,"[{'value': 'interest tax', 'formInput': 'inter...",False
1,7496,[],2004-11-29,,,y.k.sabharwal,State Of Orissa vs Debendra Nath Padhi on 29 N...,judis.nic.in/2004-11-29/State of Orissa Vs. De...,"<div class=""docsource_main"">Supreme Court of I...",32,345,[],http://judis.nic.in/supremecourt/imgst.aspx?fi...,True,Supreme Court of India,[],"[{'tid': 1056165, 'title': 'Section 227 in The...","[{'tid': 293005, 'title': 'M.V.P.Maharaja vs S...",judgments,"[{'value': 'section 227', 'formInput': 'sectio...",True
2,7832,[],1800-01-01,,,,"The Land Acquisition Act, 1894",UniEncyclopaedia/1/1380/final.htm,"<div class=""docsource_main"">Central Government...",0,21009,[],,True,Central Government Act,[],[],"[{'tid': 113268978, 'title': 'Ravindra Ramchan...",acts,[{'value': 'land acquisition act filter: ...,False
3,25127,[],1800-01-01,,,,The Mines and Minerals (Development and Regula...,UniEncyclopaedia/1/2283/final.htm,"<div class=""docsource_main"">Central Government...",0,1333,[],,True,Central Government Act,[],[],"[{'tid': 181202406, 'title': 'Union Of India v...",acts,"[{'value': 'mining lease ', 'formInput': 'mini...",False
4,30524,[789969],1961-01-01,,,,"Section 80HHC in The Income- Tax Act, 1995",indiacode/196143/80HHC,"<div class=""docsource_main"">Central Government...",0,2542,[Complete Act],,True,Central Government Act,"[{'tid': 789969, 'title': 'Complete Act'}]",[],"[{'tid': 843329, 'title': 'Deputy Commissioner...",acts,"[{'value': 'total turnover', 'formInput': 'tot...",False
5,32503,[789969],1961-01-01,,,,"Section 43A in The Income- Tax Act, 1995",indiacode/196143/43A,"<div class=""docsource_main"">Central Government...",0,568,[Complete Act],,True,Central Government Act,"[{'tid': 789969, 'title': 'Complete Act'}]",[],"[{'tid': 1027487, 'title': 'Commissioner Of In...",acts,,False
6,35876,[1353758],1956-01-01,,,,"Section 139 in The Companies Act, 1956",indiacode/195601/139,"<div class=""docsource_main"">Central Government...",0,159,[Complete Act],,True,Central Government Act,"[{'tid': 1353758, 'title': 'Complete Act'}]",[],"[{'tid': 651202, 'title': 'Kerala State Bamboo...",acts,,False
7,42009,[1136195],1800-01-01,,,,"Section 4 in the Copyright Act, 1957",UniEncyclopaedia/1/1269/final.htm/4,"<div class=""docsource_main"">Central Government...",0,40,[Complete Act],,True,Central Government Act,"[{'tid': 1136195, 'title': 'Complete Act'}]",[],"[{'tid': 100402341, 'title': 'Shri Shamsher Ka...",acts,"[{'value': 'copyright', 'formInput': 'copyrigh...",False
8,43654,[7832],1800-01-01,,,,"Section 4 in The Land Acquisition Act, 1894",UniEncyclopaedia/1/1380/final.htm/4,"<div class=""docsource_main"">Central Government...",0,14930,[Complete Act],,True,Central Government Act,"[{'tid': 7832, 'title': 'Complete Act'}]",[],"[{'tid': 195081303, 'title': 'Gajraj And Other...",acts,"[{'value': 'section 2(e)', 'formInput': 'secti...",False
9,60334,[],2007-01-02,,,,"Cce vs Sundaram Finance Ltd. on 2 January, 2007",www.manupatra.co.in/NXT/gateway.dll/tribunal/t...,"<div class=""docsource_main"">Customs, Excise an...",2,2,[],,True,"Customs, Excise and Gold Tribunal - Tamil Nadu",[],"[{'tid': 104566, 'title': 'The Finance Act, ...","[{'tid': 1163459, 'title': 'Gujarat Chem. Port...",judgments,"[{'value': 'consultancy\""', 'formInput': 'cons...",False


In [54]:
# Read the export back and show its columns.
z6 = pd.read_csv("cites_text_files/cites_doc_content.csv")
z6.columns


Index(['DOC_ID', 'CONTENT'], dtype='object')

In [56]:
import os

# Write each cited document as plain text to "cites_text_files/<DOC_ID>.txt".
# Relies on the html2text converter `h` configured in an earlier cell.
d = z6['DOC_ID']
t = z6['CONTENT']

# Create the target folder if needed so the loop does not fail on a fresh
# checkout.
os.makedirs("cites_text_files", exist_ok=True)
for i in range(len(t)):
    text = h.handle(t[i])
    # `with` guarantees the handle is closed even if the write raises.
    with open("cites_text_files/"+str(d[i])+".txt",'w+',encoding='utf-8') as f:
        f.write(text)

In [4]:
# The triple-quoted string below is disabled code kept as a bare string
# expression; it is never executed. It shows how `data` (KEYWORD/TITLE/LINK
# plus a derived TID column) and `file_ids` were originally built from a CSV
# of document links.
'''file_ids = pd.read_csv("/Users/jaspreetsingh/Desktop/temp.csv")
data = file_ids[['KEYWORD','TITLE','LINK']]
#print(file_ids.head())
file_ids = data['LINK'].tolist()
#print(file_ids)
for i in range(len(file_ids)):
    x = file_ids[i]
    x = (x.split('/'))[-2]
    #print(x)
    file_ids[i] = x
print(len(file_ids))
data['TID'] = file_ids
print(data.head())'''
ik = IndianKanoon()
content = []
# NOTE(review): `N` is not defined anywhere in the visible notebook, so this
# raises NameError on a fresh kernel. Presumably it stood for the list of
# document-id strings shown in the recorded output — confirm and restore.
file_ids = N
print(file_ids)
# Fetch every listed document; the records are tabulated in the next cell.
for i in range(len(file_ids)):
    print("Iteration: ",i+1)
    x = ik.doc(docid=file_ids[i])
    content.append(x)
    

['100123', '1006468', '1006709', '1007946', '101008236', '1011356', '1012649', '1013766', '1014138', '1019490', '1022090', '1022584', '1023674', '10239019', '1024124', '102653512', '102852', '102886851', '1030976', '1033637', '1033812', '103434922', '1038563', '104312764', '104313664', '104377', '104810373', '1048632', '1052178', '1053542', '1056935', '105936170', '1073828', '107853771', '108020655', '108509106', '1086058', '1092394', '109700315', '1097490', '1097981', '1098305', '109874394', '1099140', '1102062', '1103984', '1104087', '1104701', '1105134', '1110013', '1115541', '111777378', '1120137', '1120277', '1120373', '1121229', '112850760', '112896520', '112937069', '1129833', '1130742', '1131957', '1132299', '1132362', '1134429', '1134697', '1136663', '1136885', '1147125', '1147963', '1149874', '1151918', '1152518', '1154981', '1156538', '115718142', '1159790', '1163546', '1165503', '1166220', '1168060', '1180101', '118046405', '118501220', '1186368', '1186410', '1187885', '118

Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Iteration:  67
Iteration:  68
Ite

Iteration:  521
Iteration:  522
Iteration:  523
Iteration:  524
Iteration:  525
Iteration:  526
Iteration:  527
Iteration:  528
Iteration:  529
Iteration:  530
Iteration:  531
Iteration:  532
Iteration:  533
Iteration:  534
Iteration:  535
Iteration:  536
Iteration:  537
Iteration:  538
Iteration:  539
Iteration:  540
Iteration:  541
Iteration:  542
Iteration:  543
Iteration:  544
Iteration:  545
Iteration:  546
Iteration:  547
Iteration:  548
Iteration:  549
Iteration:  550
Iteration:  551
Iteration:  552
Iteration:  553
Iteration:  554
Iteration:  555
Iteration:  556
Iteration:  557
Iteration:  558
Iteration:  559
Iteration:  560
Iteration:  561
Iteration:  562
Iteration:  563
Iteration:  564
Iteration:  565
Iteration:  566
Iteration:  567
Iteration:  568
Iteration:  569
Iteration:  570
Iteration:  571
Iteration:  572
Iteration:  573
Iteration:  574
Iteration:  575
Iteration:  576
Iteration:  577
Iteration:  578
Iteration:  579
Iteration:  580
Iteration:  581
Iteration:  582
Iteratio

In [4]:
# Attach metadata from the fetched records to the link table `data`
# (KEYWORD / TITLE / LINK / TID) and export it.
content_df = pd.DataFrame(content)
#print(content_df)
# Column assignment aligns on the index, so `data` and `content_df` must share
# the same row order and index.
data['SOURCE'] = content_df['docsource']
data['RELATED_QS'] = content_df['relatedqs']
data['CITES'] = content_df['citeList']
data['CITED_BY'] = content_df['citedbyList']
data['TITLE'] = content_df['title']
# Default any blank keyword. The old loop did `data[i,'KEYWORD'] = ...`, which
# adds a new column named (i, 'KEYWORD') instead of setting the cell. It also
# only matched "", while blanks loaded from CSV are NaN.
blank_keyword = data['KEYWORD'].isna() | (data['KEYWORD'] == "")
data.loc[blank_keyword, 'KEYWORD'] = "IPR/L1"
print(data.head())
# NOTE(review): absolute, user-specific output path; a later cell writes the
# same table to ../CASEMINE/links_updated.csv.
data.to_csv("/Users/jaspreetsingh/Desktop/links_updated.csv",index=False)


      KEYWORD                                              TITLE  \
0  PATENT ACT  State Of Gujarat vs Vora Salebhai Gulamali And...   
1  PATENT ACT  Aphali Pharmaceuticals Ltd vs State Of Maharas...   
2  PATENT ACT  Naranbhai Dayabhai Patel & Anr vs Suleman Isub...   
3  PATENT ACT  Isha Distribution House Pvt Ltd vs Aditya Birl...   
4  PATENT ACT  M/S Shinhan Apex Corporation vs M/S Euro Apex ...   

                                      LINK        TID                  SOURCE  \
0    https://indiankanoon.org/doc/1338837/    1338837  Supreme Court of India   
1    https://indiankanoon.org/doc/1328814/    1328814  Supreme Court of India   
2     https://indiankanoon.org/doc/271216/     271216  Supreme Court of India   
3  https://indiankanoon.org/doc/103057287/  103057287  Supreme Court of India   
4  https://indiankanoon.org/doc/161614515/  161614515  Supreme Court of India   

                                          RELATED_QS  \
0  [{'value': 'jagirdar', 'formInput': 'jagirdar

In [6]:
# Rebinds `content` from the list of fetched records to just the `doc` column
# (HTML, per the printed output), which the text export below consumes.
content = content_df['doc']
print(content)

0      <div class="docsource_main">Delhi High Court</...
1      <div class="docsource_main">Bombay High Court<...
2      <div class="docsource_main">Supreme Court of I...
3      <div class="docsource_main">Supreme Court of I...
4      <div class="docsource_main">Delhi High Court</...
                             ...                        
793    <div class="docsource_main">Supreme Court of I...
794    <div class="docsource_main">Supreme Court of I...
795    <div class="docsource_main">Supreme Court of I...
796    <div class="docsource_main">Delhi High Court</...
797    <div class="docsource_main">Supreme Court of I...
Name: doc, Length: 798, dtype: object


In [8]:
import html2text
# Convert each fetched document's HTML to plain text and write it to
# ../CASEMINE/CASE_SIMILARITY/TEXT_FILES/<tid>.txt (the folder must exist).
h = html2text.HTML2Text()
h.ignore_links = True
name = content_df['tid'].tolist()
for i in range(len(content)):
    print("FILE: ",i+1)
    S = h.handle(content[i])
    # `with` closes (and flushes) every file; the earlier version never closed
    # them. utf-8 is set explicitly to match the other export cells and to
    # avoid platform-dependent encoding errors.
    with open("../CASEMINE/CASE_SIMILARITY/TEXT_FILES/"+str(name[i])+".txt", "w", encoding="utf-8") as f:
        f.write(S)

FILE:  1
FILE:  2
FILE:  3
FILE:  4
FILE:  5
FILE:  6
FILE:  7
FILE:  8
FILE:  9
FILE:  10
FILE:  11
FILE:  12
FILE:  13
FILE:  14
FILE:  15
FILE:  16
FILE:  17
FILE:  18
FILE:  19
FILE:  20
FILE:  21
FILE:  22
FILE:  23
FILE:  24
FILE:  25
FILE:  26
FILE:  27
FILE:  28
FILE:  29
FILE:  30
FILE:  31
FILE:  32
FILE:  33
FILE:  34
FILE:  35
FILE:  36
FILE:  37
FILE:  38
FILE:  39
FILE:  40
FILE:  41
FILE:  42
FILE:  43
FILE:  44
FILE:  45
FILE:  46
FILE:  47
FILE:  48
FILE:  49
FILE:  50
FILE:  51
FILE:  52
FILE:  53
FILE:  54
FILE:  55
FILE:  56
FILE:  57
FILE:  58
FILE:  59
FILE:  60
FILE:  61
FILE:  62
FILE:  63
FILE:  64
FILE:  65
FILE:  66
FILE:  67
FILE:  68
FILE:  69
FILE:  70
FILE:  71
FILE:  72
FILE:  73
FILE:  74
FILE:  75
FILE:  76
FILE:  77
FILE:  78
FILE:  79
FILE:  80
FILE:  81
FILE:  82
FILE:  83
FILE:  84
FILE:  85
FILE:  86
FILE:  87
FILE:  88
FILE:  89
FILE:  90
FILE:  91
FILE:  92
FILE:  93
FILE:  94
FILE:  95
FILE:  96
FILE:  97
FILE:  98
FILE:  99
FILE:  100
FILE:  1

FILE:  767
FILE:  768
FILE:  769
FILE:  770
FILE:  771
FILE:  772
FILE:  773
FILE:  774
FILE:  775
FILE:  776
FILE:  777
FILE:  778
FILE:  779
FILE:  780
FILE:  781
FILE:  782
FILE:  783
FILE:  784
FILE:  785
FILE:  786
FILE:  787
FILE:  788
FILE:  789
FILE:  790
FILE:  791
FILE:  792
FILE:  793
FILE:  794
FILE:  795
FILE:  796
FILE:  797
FILE:  798


In [7]:
import os
# List the exported text files.
# NOTE(review): this reads ../CASEMINE/TEXT_FILES, while the export cell writes
# to ../CASEMINE/CASE_SIMILARITY/TEXT_FILES — confirm which folder is intended.
files = os.listdir("../CASEMINE/TEXT_FILES")

In [8]:
# Number of entries in the folder, for comparison with the de-duplicated link
# table below.
print(len(files))

874


In [9]:
# Keep the first row for each TID. This overwrites `data`, so the
# un-deduplicated table is gone after this cell.
data=data.drop_duplicates(['TID'])
print(len(data))

874


In [10]:
# Save the de-duplicated link table next to the CASEMINE text files.
data.to_csv("../CASEMINE/links_updated.csv",index=False)