In [2]:
import glob
import json
import os
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

In [3]:
"""extracting cites (including edge cases for "view all" and None)"""

def find_cites(soup, max_retries=5, timeout=30):
    """Collect the documents cited by a judgment page.

    Two page layouts are handled:

    * The first anchor inside ``div.doc_cite`` carries a "view all" style
      href containing ``:``. The part after the first ``:`` is used to query
      the site's search endpoint page by page until a page returns a number
      of results other than 10.
    * Otherwise the cites are read from the page itself: the first
      ``.doc_cite_head`` must contain the word "Cites" and a number, and that
      many ``.cite_title`` entries are taken.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed judgment page.
    max_retries : int, optional
        Maximum number of attempts per search page before pagination is
        abandoned (the entries gathered so far are still returned).
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    list of dict
        One ``{"title", "link", "caseids"}`` dict per cited document; empty
        when the page has no citation block or nothing could be extracted.
    """
    baseurl = 'https://indiankanoon.org'
    cites_list = []  # final cites list

    def _append_entry(node):
        # Entries without an anchor or href carry no usable link; skip them
        # instead of failing on ``None``.
        anchor = node.find('a')
        if anchor is None:
            return
        href = anchor.get('href')
        if not href:
            return
        cites_list.append({"title": anchor.text, "link": baseurl + href, "caseids": ["0"]})

    # Block holding all document citations (cites and citedby).
    main_cites = soup.find('div', class_='doc_cite')
    if main_cites is None:
        return cites_list

    # First anchor of the block: the "view all" link when one is present.
    first_anchor = main_cites.a
    view_all_link = (first_anchor.get('href') if first_anchor is not None else None) or ''

    if ":" in view_all_link:
        doc_ref = view_all_link.split(":")[1]
        page_num = 0
        while True:
            tempurl = ("https://indiankanoon.org/search/?formInput=cites%3A%20"
                       + doc_ref + "&pagenum=" + str(page_num))

            # Bounded retry: errors such as InvalidURL never succeed on retry,
            # so an unbounded loop would spin forever.
            r = None
            for attempt in range(1, max_retries + 1):
                try:
                    r = requests.get(tempurl, timeout=timeout)
                    break
                except requests.exceptions.RequestException:
                    print("Request Error")
                    if attempt < max_retries:
                        time.sleep(attempt)  # simple linear back-off
            if r is None:
                print("Giving up on {} after {} failed attempts".format(tempurl, max_retries))
                break

            soup_again = BeautifulSoup(r.content, 'html')
            all_cites_link = soup_again.select(".result_title")
            for link in all_cites_link:
                _append_entry(link)

            # A page with anything other than 10 results is treated as the last page.
            if len(all_cites_link) != 10:
                break
            page_num += 1
    else:
        # Headers of the cites / citedby sections, in page order.
        only_cite_heads = soup.select("div > .doc_cite_head")
        if only_cite_heads and "Cites" in only_cite_heads[0].text:
            # The header text carries the number of cites to extract.
            counts = [int(tok) for tok in only_cite_heads[0].text.split() if tok.isdigit()]
            if counts:
                for link in soup.select("div > .cite_title")[:counts[0]]:
                    _append_entry(link)

    return cites_list

In [4]:
"""extracting Cited By (including edge cases for "view all" and None)"""

def find_cited_by(soup, max_retries=5, timeout=30):
    """Collect the documents that cite a judgment page.

    Two page layouts are handled:

    * The page contains an anchor whose href includes ``citedby``. The first
      anchor of that anchor's parent is taken as the "view all" link; if its
      href contains ``:``, the part after the first ``:`` is used to query
      the site's search endpoint page by page until a page returns a number
      of results other than 10. If the href has no ``:``, nothing is collected.
    * No such anchor exists. The entries are then read from the sibling
      ``<div>`` elements following a ``.doc_cite_head`` header: the second
      header when there are at least two, otherwise the only header if its
      text contains "Citedby".

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed judgment page.
    max_retries : int, optional
        Maximum number of attempts per search page before pagination is
        abandoned (the entries gathered so far are still returned).
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    list of dict
        One ``{"title", "link", "caseids"}`` dict per citing document.
    """
    baseurl = 'https://indiankanoon.org'
    cited_by_list = []  # final Cited By list

    def _append_entry(node):
        # Sibling divs without an anchor or href carry no usable link; skip
        # them instead of failing on ``None``.
        anchor = node.find('a')
        if anchor is None:
            return
        href = anchor.get('href')
        if not href:
            return
        cited_by_list.append({"title": anchor.text, "link": baseurl + href, "caseids": ["0"]})

    # Anchors whose href mentions "citedby" (the "view all" link of that section).
    cited_by_header = soup.select('a[href*="citedby"]')
    if cited_by_header:
        parent = cited_by_header[0].parent
        first_anchor = parent.a if parent is not None else None
        view_all_link = (first_anchor.get('href') if first_anchor is not None else None) or ''

        if ":" in view_all_link:
            doc_ref = view_all_link.split(":")[1]
            page_num = 0
            while True:
                tempurl = ("https://indiankanoon.org/search/?formInput=citedby%3A%20"
                           + doc_ref + "&pagenum=" + str(page_num))

                # Bounded retry: errors such as InvalidURL never succeed on
                # retry, so an unbounded loop would spin forever.
                r = None
                for attempt in range(1, max_retries + 1):
                    try:
                        r = requests.get(tempurl, timeout=timeout)
                        break
                    except requests.exceptions.RequestException:
                        print("Request Error")
                        if attempt < max_retries:
                            time.sleep(attempt)  # simple linear back-off
                if r is None:
                    print("Giving up on {} after {} failed attempts".format(tempurl, max_retries))
                    break

                soup_again = BeautifulSoup(r.content, 'html')
                all_cited_by_links = soup_again.select(".result_title")
                for link in all_cited_by_links:
                    _append_entry(link)

                # A page with anything other than 10 results is treated as the last page.
                if len(all_cited_by_links) != 10:
                    break
                page_num += 1
        return cited_by_list

    # No "view all" anchor: read the entries listed on the page itself.
    only_cite_heads = soup.select("div > .doc_cite_head")  # cites / citedby headers
    if not only_cite_heads:
        return cited_by_list

    if len(only_cite_heads) > 1:
        # With two headers the second one is taken to be the cited-by section.
        section_head = only_cite_heads[1]
    elif "Citedby" in only_cite_heads[0].text:
        # Only cited-by links are present (no cites section).
        section_head = only_cite_heads[0]
    else:
        return cited_by_list

    for link in section_head.find_next_siblings("div"):
        _append_entry(link)

    return cited_by_list

In [5]:
"""Creating basic caseinfo dictionary of cites and citedby"""

def create_caseinfo(soup):
    """Build the initial case dictionary for a parsed judgment page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed judgment page.

    Returns
    -------
    dict
        ``{'court', 'cites', 'citedBy'}`` where ``court`` is the stripped text
        of ``div.docsource_main`` (``'0'`` when that element is missing, the
        same placeholder the rest of the notebook uses for absent values),
        and ``cites`` / ``citedBy`` are the lists returned by
        ``find_cites`` and ``find_cited_by``.
    """
    cites = find_cites(soup)
    cited_by = find_cited_by(soup)

    # find() returns None when the element is absent; do not dereference it blindly.
    source_node = soup.find('div', class_='docsource_main')
    court = source_node.text.strip() if source_node is not None else '0'

    return {'court': court, 'cites': cites, 'citedBy': cited_by}

In [6]:
"""Code adapted from data acquisition pipeline code to extract data from HTML"""

def sortdata(soup, caseinfo, alltext):
    """Fill ``caseinfo`` with the metadata and paragraphs of a judgment page.

    Parameters
    ----------
    soup : BeautifulSoup
        The whole parsed page; used for the ``<form>`` action (case link) and
        the ``<title>`` (case title).
    caseinfo : dict
        Dictionary to extend. It is mutated in place and also returned.
    alltext : bs4 element
        Element holding the judgment text; its ``pre#pre_1``, ``p#p_1``,
        ``div.doc_author``, ``div.doc_bench`` and all ``p`` / ``blockquote`` /
        ``pre`` descendants are read.

    Returns
    -------
    dict
        ``caseinfo`` with these keys added: ``applicationName``,
        ``applicationNumber``, ``author``, ``bench``, ``judge``, ``title``,
        ``petitioners``, ``respondants``, ``link``, ``prejudgement``,
        ``paragraphs``, ``equivalentCitations`` and ``id``.

    Notes
    -----
    The string ``'0'`` is used throughout as the "not found" placeholder.
    """
    # Header block of the judgment (court, parties, case number ...), if any.
    initial = alltext.find('pre', id = 'pre_1')
    vslist = ['versus', 'vs', 'vs.', 'Versus']
    petitioners = []
    respondents = ''
    flag = 0  # 0: before "versus", 1: line right after "versus", 2: later lines
    baseurl = 'https://indiankanoon.org'

    # Application name and number
    test_list = []
    if initial:
        test_list = [i.strip() for i in initial.text.splitlines() if i]
    applicationname = '0'
    applicationnumber = '0'
    # Small state machine: the sentinel '1' means "the next line is the value".
    # A court-name line arms applicationname; the line after it becomes the
    # name and arms applicationnumber; every later non-court line then
    # overwrites the number.
    for test in test_list:
        if test.lower() == 'in the supreme court of india' or test.lower() == 'in the high court of judicature of bombay' or test.lower() == 'case no.:' or test.lower().find('in the high court') != -1:
            applicationname = '1'
        elif applicationname == '1':
            applicationname = test
            applicationnumber = '1'
        elif applicationnumber == '1':
            applicationnumber = test
    # Fallback: read "<name>: <number>" from the first paragraph.
    if alltext.find('p', id='p_1') and applicationname == "0" and applicationnumber == "0":
        temp = alltext.find('p', id='p_1').text.strip().split(' ')
        temp2 = alltext.find('p', id='p_1').text.split(':')
        try:
            # Note the precedence: A or (B and C).
            if temp[1].strip().lower() == 'appellate' or temp2[0].strip().lower() == 'writ' and temp2[1].strip().lower() == 'petition':
                ans = alltext.find('p', id='p_1').text.split(':')
                applicationname = ans[0].strip()
                if len(ans) != 1:
                    applicationnumber = ans[1].strip()
        except IndexError:
            # Paragraph too short to carry the expected pattern; keep the '0' placeholders.
            pass

    caseinfo['applicationName'] = applicationname
    caseinfo['applicationNumber'] = applicationnumber

    # Author: the text after the first ':' in div.doc_author.
    author = '0'
    if alltext.find('div', class_ = 'doc_author'):
        author = alltext.find('div', class_ = 'doc_author').text.split(':')[1].strip()
    caseinfo['author'] = author

    # Bench
    # NOTE(review): stays the string '0' when absent but becomes a list of
    # dicts when present - confirm consumers expect both shapes.
    bench = '0'
    if alltext.find('div', class_='doc_bench'):
        bench = alltext.find('div', class_='doc_bench').text.split(':')[1].strip()
        bench = [{"name":a_bench, "id":"0"} for a_bench in bench.split(",")]

    caseinfo['bench'] = bench

    # Case link: whatever follows "nextpage=" in the form action (the whole
    # action when that marker is absent).
    link = baseurl + str(soup.find('form').get("action").split("nextpage=")[-1])  # changed link

    # Page title with the characters listed below removed.
    casetitle = soup.find('title').text.strip().replace('/', '').replace('*', '').replace('\\', '').replace('<', '').replace('>', '').replace(':', '').replace('?', '').replace('"', '').replace('|', '').replace('\t', '')

    # All text blocks except the first one found.
    paras = alltext.find_all(['p', 'blockquote','pre'])[1:]  # 'blockquote'

    # Judge name
    if not initial:

        # NOTE(review): split(' ')[0] is a single token, so 'J U D G M E N T'
        # can never match here - confirm whether that spelling should count.
        if alltext.find('p', id = 'p_1') and alltext.find('p', id = 'p_1').text.split(' ')[0].strip() in ['JUDGMENT', 'J U D G M E N T', 'ORDER']:
            judge = (alltext.find('p', id='p_1').text.replace('JUDGMENT', '').replace('J U D G M E N T', '').replace('ORDER', '').strip())

        else:
            judge = '0'

    else:
        # Last line of the header block with all spaces removed.
        judgement = (initial.text.splitlines()[-1].strip().replace(' ',''))
        if alltext.find('p', id='p_1') is not None:
            if len(alltext.find('p', id='p_1').text) < 25 and alltext.find('p', id='p_1') and (judgement.lower().replace(':', '') == 'judgment' or alltext.find('p', id = 'p_1').text.lower().find('judgment') != -1 or alltext.find('p', id = 'p_1').text.lower().find('j u d g m e n t ') != -1):
                judge = (alltext.find('p', id = 'p_1').text.replace('JUDGMENT', '').replace('J U D G M E N T', '').strip())
                # The first paragraph was the judge line; drop it from the body.
                paras = paras[1:]
            else:
                judge = (judgement)
        else:
            judge = ""

    if judge == "":
        judge = '0'
    # Values mentioning "order" or "respondent" are rejected as not being a name.
    if judge.lower().find('order') != -1 or judge.lower().find('respondent') != -1:
        judge = '0'

    judge = [{"name":judge, "id":"0"}]

    caseinfo['judge'] = judge

    # Petitioners and Respondents
    if initial:
        text = ''    # lines accumulated since the last blank line / "versus"
        prev = ''    # value of ``text`` after the previous non-blank line;
                     # initialised so a header that starts with "versus"
                     # no longer raises UnboundLocalError
        vsflag = 0   # becomes 1 once a "versus" separator was seen
        for word in initial.text.splitlines():
            word = (word.strip())
            if not word:
                text = ''
                continue
            else:
                text = text + ' ' + word

            if flag == 2:
                respondents += ' ' + word

            if flag == 1:
                respondents = text
                flag = 2

            if word.replace('-', '').replace('/', '').replace('.', '').lower().strip() in vslist:
                vsflag = 1
                petitioners.append(prev)
                text = ''
                flag = 1
            prev = text
        # Keep only the part before a triple space and drop the label.
        petitioners = [item.split('   ')[0].replace('PETITIONER:', '').strip() for item in petitioners]

        # No "versus" line: look for PETITIONER / RESPONDENT labels, each
        # followed by the value on the next line.
        if vsflag == 0:
            pet = 0
            res = 0
            for word in initial.text.splitlines():
                if word.strip().replace(':', '').replace('-', '').lower() == 'petitioner':
                    pet = 1
                elif word.strip().replace(':', '').replace('-', '').lower() == 'respondent':
                    res = 1
                elif pet == 1:
                    petitioners.append(word.strip())
                    pet = 0
                elif res == 1:
                    respondents = word.strip()
                    res = 0

    respondents = respondents.replace('RESPONDENT:', '').split('   ')[0].strip()

    # Header parsing failed or produced implausibly short names:
    # fall back to splitting the page title on "vs".
    if len(petitioners) == 0 or respondents == "" or len(petitioners[0]) < 5 or len(respondents) < 5:
        temp = casetitle.split('vs')
        petitioners = []
        petitioners.append(temp[0].strip())
        if len(temp)>1:
            respondents = temp[1].split(' on ')[0].strip()

    # De-duplicate while keeping order.
    petitioners = list(dict.fromkeys(petitioners))

    # Paragraphs
    allparas = []
    mainparas = {}
    subparas = {}
    blockq = []
    # NOTE(review): the range starts at 1, so paras[0] is never emitted even
    # though the first block was already dropped above - confirm intended.
    for  i in range(1,len(paras)) :
        if paras[i].name == "p":
            mainparas=[paras[i].text.strip().replace('\n', ' ').replace('\t', ' ').replace('\x0c', '')]

            subparas={}
            blockq = []
            if i+1 != len(paras):
                # A <pre> right after the paragraph starts a run of
                # sub-paragraphs, keyed by element id, up to the next <p>.
                if paras[i+1].name == "pre":
                    sub = {}
                    q =i+1
                    while (paras[q].name !="p"):
                        sub[paras[q].get('id')] = paras[q].text.strip().replace('\n', ' ').replace('\t', ' ').replace('\x0c', '')
                        q=q+1

                        if q ==len(paras):
                            break
                    subparas =sub
                # A <blockquote> right after the paragraph starts a run of
                # quotes up to the next <p>, joined into a single string.
                if paras[i+1].name =="blockquote":
                    block_quote = []
                    q= i+1
                    while (paras[q].name !="p"):
                        block_quote.append(paras[q].text.strip().replace('\n', ' ').replace('\t', ' ').replace('\x0c', ''))
                        q = q+1

                        if q == len(paras):
                            break

                    blockq = ["\n ".join(block_quote)]
            else:
                pass
            allparas.append({'para': mainparas,  'blockQuotes': blockq,'subPara': subparas,})

    # Combining all together
    caseinfo['title'] = casetitle
    caseinfo['petitioners'] = petitioners
    caseinfo['respondants'] = respondents
    caseinfo['link'] = link

    # Raw header text, kept verbatim.
    prejudgement = ""
    if initial is not None:
        prejudgement = initial.text

    equivalent_citations = []
    caseinfo["prejudgement"] = [prejudgement]
    caseinfo['paragraphs'] = allparas
    caseinfo['equivalentCitations'] = equivalent_citations
    caseinfo['id'] = "0"

    return caseinfo

In [7]:
"""Main method to change HTML path according to the user"""

def main(data_folder_path=r'/Users/arya/anydesk_transfer/J&K', court_name="Jammu & Kashmir", min_year=1800):
    """Convert every judgment HTML file of one court into a JSON file.

    The expected layout is ``<data_folder_path>/<court_name>/<year>/<month>/``
    with ``*.html`` files anywhere below each month folder. Each HTML file is
    parsed, passed through ``create_caseinfo`` and ``sortdata``, and the
    resulting dictionary is written next to the source file with a ``.json``
    suffix.

    Parameters
    ----------
    data_folder_path : str or Path, optional
        Root folder containing the court folders.
    court_name : str, optional
        Name of the court folder to process.
    min_year : int, optional
        Year folders with a smaller number are skipped.

    Raises
    ------
    FileNotFoundError
        If ``data_folder_path`` is not an existing directory.
    """
    root = Path(data_folder_path)
    if not root.is_dir():
        raise FileNotFoundError("Data folder not found: {}".format(root))

    court_dir = root / court_name
    if court_dir.is_dir():
        # Only directories are descended into, which also skips stray files
        # such as .DS_Store or archives.
        for year_dir in sorted(p for p in court_dir.iterdir() if p.is_dir()):
            # Folders that are not plain year numbers would crash int().
            if not year_dir.name.isdigit() or int(year_dir.name) < min_year:
                continue
            for month_dir in sorted(p for p in year_dir.iterdir() if p.is_dir()):
                for a_html in sorted(month_dir.rglob('*.html')):
                    # Close the input handle deterministically.
                    with open(a_html, 'r', encoding='utf-8') as html_file:
                        soup = BeautifulSoup(html_file, 'html.parser')

                    alltext = soup.find('div', class_ = 'judgments')
                    caseinfo_raw = create_caseinfo(soup)
                    caseinfo = sortdata(soup, caseinfo_raw, alltext)

                    # Derive the output path from the Path object instead of
                    # splitting on a backslash, which only works on Windows.
                    save = a_html.with_suffix('.json')
                    with open(save, "w", encoding='utf-8') as outfile:
                        json.dump(caseinfo, outfile, indent = 4)

    print("Successfully converted all HTML(s) to JSON(s)")
    
# Entry-point guard: run the conversion when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

Successfully converted all HTML(s) to JSON(s)
