In [56]:
import yaml
from bs4 import BeautifulSoup
from pathlib import Path
import re

In [57]:
def get_gpath_dict():
    paths = yaml.safe_load(open('/home2/sisodiya.bhoomendra/github/contrastive_learning/data/paths.yaml','r'))
    posix_path = {}
    for k,v in paths.items():
        posix_path[k] = Path(v)
    return posix_path

In [58]:
posix_path = get_gpath_dict()

In [59]:
posix_path['path_html_judg']

PosixPath('/home2/sisodiya.bhoomendra/judgments')

In [60]:
jids = set()
for paths in posix_path['path_html_judg'].iterdir():
    # BeautifulSoup(paths)
    jids.add(paths.name.split('.')[0])

In [61]:
len(jids)

53897

In [62]:
read = BeautifulSoup(open("/home2/sisodiya.bhoomendra/judgments/1579360.html"))

In [63]:
read

<div class="judgments">
<div class="docsource_main">Supreme Court of India</div>
<div class="doc_title">Dr. H. Mukherjee vs Union Of India on 28 September, 1993</div><div class="doc_citations">Equivalent citations: 1994 AIR  495, 1994 SCC  Supl.  (1) 250</div>
<div class="doc_author">Author: Ahmadi</div>
<div class="doc_bench">Bench: Ahmadi, A.M. (J)</div>
<pre id="pre_1">           PETITIONER:
DR. H. MUKHERJEE

	Vs.

RESPONDENT:
UNION OF INDIA

DATE OF JUDGMENT28/09/1993

BENCH:
AHMADI, A.M. (J)
BENCH:
AHMADI, A.M. (J)
PUNCHHI, M.M.
RAMASWAMY, K.

CITATION:
 1994 AIR  495		  1994 SCC  Supl.  (1) 250
 JT 1993 (5)   439	  1993 SCALE  (3)887


ACT:



HEADNOTE:



JUDGMENT:
</pre><p id="p_1">The Judgment of the Court was delivered by
AHMADI,	 J.-  These appeals by special	leave  are  directed
against the decision rendered by the Principal Bench of	 the
Central Administrative Tribunal on February 9, 1993t whereby
it  directed the Appointments Committee of the Cabinet	(for
short 'ACC') to 

In [64]:
def clean_text(para):
    """
    Given a string input return a clearn string 
    1. Replaces tabs next lines and as spaces
    2. Combines Multiple spaces to 1
    3. Removing Punctuation
    3. Makes everthing in lower case
    """
    para = para.replace("\t",' ')
    para = para.replace('\n',' ')# Replacing tabs and next line with spaces
    # para = re.sub('[^\w\s]','',para)# Replacing puncation with space
    para = re.sub(' +',' ',para)
    para = para.lower()
    return para

In [67]:
def to_text(doc):
    if doc is not None:
        return doc.text
    return ''

def get_doc_id_from_a_tag(lk):
    return lk.get('href').split('/')[-2]

def get_doc_name_from_a_tag(lk):
    print(lk.text)
    return lk.text



def html2dict(soup_jud):
    title = to_text(soup_jud.find('div',{"class": "doc_title"}))
    eq_citation = to_text(soup_jud.find('div',{'class':'doc_citations'}))# may or maynot exit
    author =  to_text(soup_jud.find('div',{'class':'doc_author'}))# may or maynot exit
    bench = to_text(soup_jud.find('div',{'class':'doc_bench'}))# may or maynot exit
    meta_data = []
    # Head note should also come in paragraph so if it is pre the remove it and try to add into paragraph
    head_note = []
    paragraphs = []
    for pre in soup_jud.find_all('pre'):
        data = to_text(pre).lower()
        a = data.split('headnote')
        meta_data.append(a[0])
        if len(a)==2:
            if len(a[1]) > 100:
                head_note.append(clean_text(a[1]))
    
    for blockquote in soup_jud.find_all('blockquote'):
        if blockquote is not None:
            paragraphs.append(clean_text(blockquote.text))
    
    judgment_citations = set() # Preform intersection with Links
    other_citations = set()
    for para in soup_jud.find_all('p'):
        if para is not None:
            paragraphs.append(clean_text(para.text))
            
    for link in soup_jud.find_all('a'):
        if link is not None:
            doc_id = get_doc_id_from_a_tag(link)
            if doc_id in jids:
                judgment_citations.add(doc_id)
            else:
                other_citations.add((doc_id,clean_text(get_doc_name_from_a_tag(link))))
    other_citations = list(other_citations)
    judgment_citations = list(judgment_citations)
    return {"title":title ,
            "author":author,
            "eq_citation":eq_citation,
            "bench":bench,
            'meta_data':meta_data,
            "headnote":head_note,
            "paragraphs":paragraphs,
            "citation_sc":judgment_citations,
            "citation_others":other_citations}

In [68]:
html2dict(read)

Article 320
Article	323
Article 323
Article 323
Article 323
Article 323
Article 323


{'title': 'Dr. H. Mukherjee vs Union Of India on 28 September, 1993',
 'author': 'Author: Ahmadi',
 'eq_citation': 'Equivalent citations: 1994 AIR  495, 1994 SCC  Supl.  (1) 250',
 'bench': 'Bench: Ahmadi, A.M. (J)',
 'meta_data': ['           petitioner:\ndr. h. mukherjee\n\n\tvs.\n\nrespondent:\nunion of india\n\ndate of judgment28/09/1993\n\nbench:\nahmadi, a.m. (j)\nbench:\nahmadi, a.m. (j)\npunchhi, m.m.\nramaswamy, k.\n\ncitation:\n 1994 air  495\t\t  1994 scc  supl.  (1) 250\n jt 1993 (5)   439\t  1993 scale  (3)887\n\n\nact:\n\n\n\n'],
 'headnote': [],
 'paragraphs': [' "[h]e has not the ability to give leadership in a department which has all-india jurisdiction. he has also been orally advised not to bring outside influence in his service matters. he needs to develop a proper perspective about the role and functioning of the department. he has not done any meritorious work." ',
  ' "in the instant case no adverse remarks had been communicated to the appellant at the time of se

In [None]:
'../data/processed/processed_judgements/'