##Installing ElasticSearch

In [1]:
!wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
!wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512
!tar -xzf elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
!sudo chown -R daemon:daemon elasticsearch-7.9.2/
!shasum -a 512 -c elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512 


elasticsearch-oss-7.9.2-linux-x86_64.tar.gz: OK


In [2]:
!pip install elasticsearch==7.9.1 -q
# !pip install elasticsearch==8.1.2 -q

[?25l[K     |█▌                              | 10 kB 22.3 MB/s eta 0:00:01[K     |███                             | 20 kB 14.0 MB/s eta 0:00:01[K     |████▌                           | 30 kB 10.0 MB/s eta 0:00:01[K     |██████                          | 40 kB 8.8 MB/s eta 0:00:01[K     |███████▌                        | 51 kB 4.7 MB/s eta 0:00:01[K     |█████████                       | 61 kB 5.5 MB/s eta 0:00:01[K     |██████████▌                     | 71 kB 5.5 MB/s eta 0:00:01[K     |████████████                    | 81 kB 5.4 MB/s eta 0:00:01[K     |█████████████▌                  | 92 kB 6.0 MB/s eta 0:00:01[K     |███████████████                 | 102 kB 5.2 MB/s eta 0:00:01[K     |████████████████▌               | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████████████              | 122 kB 5.2 MB/s eta 0:00:01[K     |███████████████████▍            | 133 kB 5.2 MB/s eta 0:00:01[K     |█████████████████████           | 143 kB 5.2 MB/s eta 0:00:01[K  

In [3]:
!pip freeze | grep elasticsearch

elasticsearch==7.9.1


Importing elasticsearch and other libraries

In [4]:
from google.colab import drive
from elasticsearch import Elasticsearch
import os, glob
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import CountVectorizer


In [5]:
# Mounting google drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading Data
##Extracting prior_cases documents from google drive

In [6]:
path= '/content/drive/MyDrive/Data/Prior_Cases/*.txt'
files = glob.glob(path)
len(files)

2006

In [7]:
# Creating a table like structure with first value as file name and second value as text
data=[]
for file in files:
  f = open(file, 'r')
  textfile = f.read()
  case_name = file.split('/')[-1]
  data.append([case_name, textfile])
  f.close()


Creating a dataframe from the above table

In [8]:
df = pd.DataFrame(data,columns=['case_id', 'Case_Text'])
df

Unnamed: 0,case_id,Case_Text
0,prior_case_1188.txt,[1988] INSC 160; AIR 1988 SC 1417; 1988 (1) S...
1,prior_case_0916.txt,INSC 248; AIR 1975 SC 230; 1975 (2) SCR 811; ...
2,prior_case_1987.txt,1973 SC 2622; 1974 (1) SCR 489; 1973 (2) SCC ...
3,prior_case_0086.txt,CLOTH AND GENERAL MILLS LTD. [1969] INSC 301;...
4,prior_case_1997.txt,"November 1968)\n08/11/1968 SHAH, J.C.\nSHAH, ..."
...,...,...
2001,prior_case_1333.txt,INSC 144; AIR 1964 SC 1813; 1964 (8) SCR 18 (...
2002,prior_case_0132.txt,INSC 41; AIR 1955 SC 765; 1955 (2) SCR 483 (2...
2003,prior_case_1521.txt,"October 1956)\n04/10/1956 AIYYAR, T.L. VENKAT..."
2004,prior_case_1537.txt,ORS [1987] INSC 15; AIR 1987 SC 490; 1987 (1)...


# Data Cleaning and Pre-processing
## Removing digits punctuation and less than 4 character words

In [9]:
df['Case_Text'] = df['Case_Text'].str.replace('\d+', '') # for digits
df['Case_Text'] = df['Case_Text'].str.replace(r'(\b\w{1,3}\b)', '') # for words
df['Case_Text'] = df['Case_Text'].str.replace('[^\w\s]', '') # for punctuation
df['Case_Text'] = df['Case_Text'].str.replace('_', '') # for underscores

df

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,case_id,Case_Text
0,prior_case_1188.txt,INSC Suppl SCALE \nVENK...
1,prior_case_0916.txt,INSC November \nMATHEW KUTTYIL ...
2,prior_case_1987.txt,August \nKRISHNAIYER \nKRISHNAIYE...
3,prior_case_0086.txt,CLOTH GENERAL MILLS INSC Oct...
4,prior_case_1997.txt,November \n SHAH \nSHAH \nRAMASWAMI \nGROVER ...
...,...,...
2001,prior_case_1333.txt,INSC April \n SUBBARAO \nSUBBARAO \...
2002,prior_case_0132.txt,INSC September \n BHAGWATI NATWARLA...
2003,prior_case_1521.txt,October \n AIYYAR VENKATARAMA AIYYAR VENKAT...
2004,prior_case_1537.txt,INSC SCALE January \nV...


Creating a list corpus having all the documents

In [13]:
corpus=[]
corpus = df.loc[:,'Case_Text'].tolist()
corpus[0]

'  INSC        Suppl            SCALE    \nVENKATACHALLIAH   VENKATACHALLIAH   PATHAK  \nCITATION      Supl             SCALE \nWealth   Sections   LoanTermed  assessee  Quarazahasana held includible  wealth  assessee  liable  \nWords  Phrases QuarazaHasanaMeaning \n assesseerespondent  advanced       partner Faizullabhai Mandlawala which   employed   part  Mandlawala capital  their partnership firm  assessee sought  have  value  that loan excluded from  wealth   claim that this loan  what  known  Muslim   QuarazaHasana  debt  good faith  goodwill carrying with   legal obligation   part   debtor  repay  correspondingly  right   part  assessee  expect much less enforce  repayment This claim  supported  debtor declaration that    received   without  obligation  without  rate  interest  without  consideration  Wealth  Officer   Appellate Assistant Commissioner found  difficult  accept this claim  Appellate Tribunal however accepting  assessee claim held that  loan partook   character  Qua

This portion of code was used to stem and tokenize, removed later on to improve the resuts, hence commented

In [None]:
# import nltk
# from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.stem import PorterStemmer
# nltk.download('punkt')

# porter=PorterStemmer()

# def stemSentence(sentence):
#     token_words=word_tokenize(sentence)
#     token_words
#     stem_sentence=[]
#     for word in token_words:
#         stem_sentence.append(porter.stem(word))
#         stem_sentence.append(" ")
#     return "".join(stem_sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A stemmed corpus was created here, but as above removed later, hence commented

In [None]:
# stemmed_corpus = []
# for file in corpus:
#   stem_file = stemSentence(file)
#   stemmed_corpus.append(stem_file)
# print(stemmed_corpus.shape())
# df['stemmed_text'] = stemmed_corpus
# df
# stemmed_corpus[0]

Creating a list having each document_id and text pair to make it easy to refer to the name of the document in the later stages

In [11]:
Id_Text_pair = []
for file,corpus in zip(files,corpus):
  case_id = file.split('/')[-1]
  text = corpus
  Id_Text_pair.append((case_id,text))

In [12]:
# df_stemmed1 = df[['case_id','stemmed_text']]   # commented as stemming is not implemented now
df_stemmed1 = df[['case_id','Case_Text']]
df_stemmed1

Unnamed: 0,case_id,Case_Text
0,prior_case_1188.txt,INSC Suppl SCALE \nVENK...
1,prior_case_0916.txt,INSC November \nMATHEW KUTTYIL ...
2,prior_case_1987.txt,August \nKRISHNAIYER \nKRISHNAIYE...
3,prior_case_0086.txt,CLOTH GENERAL MILLS INSC Oct...
4,prior_case_1997.txt,November \n SHAH \nSHAH \nRAMASWAMI \nGROVER ...
...,...,...
2001,prior_case_1333.txt,INSC April \n SUBBARAO \nSUBBARAO \...
2002,prior_case_0132.txt,INSC September \n BHAGWATI NATWARLA...
2003,prior_case_1521.txt,October \n AIYYAR VENKATARAMA AIYYAR VENKAT...
2004,prior_case_1537.txt,INSC SCALE January \nV...


Getting the vocabulary and its size, just for visualization purposes

In [14]:
vectorizer = CountVectorizer(stop_words='english')
documents_vectorized = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names_out()
vocabulary.shape

(83786,)

In [15]:
dataframe = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
dataframe

Unnamed: 0,aaaa,aaccorded,aadras,aaginst,aalborg,aand,aane,aangam,aapeal,aapke,...,zoraster,zorawar,zoroastrian,zoroastrianism,zumber,zure,zurich,zuripeshgi,zutshi,zwinglee
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,7,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#Connecting to ElasticSearch

In [16]:
%%bash --bg
sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch

Starting job # 0 in a separate thread.


In [17]:
%%bash
ps -ef | grep elasticsearch

root        2395    2393  0 09:42 ?        00:00:00 sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch
daemon      2396    2395  0 09:42 ?        00:00:00 /bin/bash elasticsearch-7.9.2/bin/elasticsearch
daemon      2419    2396  0 09:42 ?        00:00:00 /bin/bash elasticsearch-7.9.2/bin/elasticsearch
daemon      2420    2419  0 09:42 ?        00:00:00 /content/elasticsearch-7.9.2/jdk/bin/java -version
root        2429    2407  0 09:42 ?        00:00:00 grep elasticsearch


In [18]:
# start es server
time.sleep(20) # give the server 20 seconds to start ..
!curl -X GET "http://localhost:9200" 

{
  "name" : "c089867feb90",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "lV7sTj0USSynrZI8nm7lkA",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [19]:
def test_ES(client):
  return es.ping()  # got True

In [20]:
# start and TEST es 
es = Elasticsearch("http://localhost:9200")
if test_ES(es):
  print('ES instance working')
else:
  print('ES instance not working')

ES instance working


In [21]:
# Server information
es.info()

{'cluster_name': 'elasticsearch',
 'cluster_uuid': 'lV7sTj0USSynrZI8nm7lkA',
 'name': 'c089867feb90',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2020-09-23T00:45:33.626720Z',
  'build_flavor': 'oss',
  'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e',
  'build_snapshot': False,
  'build_type': 'tar',
  'lucene_version': '8.6.2',
  'minimum_index_compatibility_version': '6.0.0-beta1',
  'minimum_wire_compatibility_version': '6.8.0',
  'number': '7.9.2'}}

In [22]:
es = Elasticsearch()

# Retrieval Models
### Explicitly mapping the data for indexing rather than using dynamic mapping, for the default implementation of ElasticSearch i.e. Okapi BM25 model

In [23]:
request_body = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 0,
        
    },
    'mappings': {
          'properties': {
              'title': {'type': 'text'},
              'body': {'type': 'text'},
          }
    }
}

index_name = 'prior_cases'
try:
  es.indices.get(index_name)
  print('index {} already exists'.format(index_name))
except:
  print('creating index {}'.format(index_name))
  es.indices.create(index_name, body=request_body)

creating index prior_cases


#Expicit mapping as above for the DFR model implementation

In [24]:
request_body_dfr = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        'index': {
            'similarity': {
                'dfr_similarity': {
                    'type': 'DFR',
                    'basic_model': 'g',
                    'after_effect': 'l',
                    'normalization': 'h2',
                    'normalization.h2.c':'3.0'

                }
            }
        }
        
    },
    'mappings': {
          'properties': {
              'title': {'type': 'text', 'similarity': 'dfr_similarity'},
              'body': {'type': 'text', 'similarity': 'dfr_similarity'}
          }
    }
}

index_name_dfr = 'prior_cases_dfr'
try:
  es.indices.get(index_name_dfr)
  print('index {} already exists'.format(index_name_dfr))
except:
  print('creating index {}'.format(index_name_dfr))
  es.indices.create(index_name_dfr, body=request_body_dfr)

creating index prior_cases_dfr


##Here, we are entering data into the index for BM25 model

In [25]:
for caseid,text in Id_Text_pair:
  document_body= {
      'title':caseid,
      'body':text
  }
  es.index(index_name,document_body)


##Now, entering data into the index for DFR Model

In [26]:
for caseid,text in Id_Text_pair:
  document_body_dfr= {
      'title':caseid,
      'body':text
  }
  es.index(index_name_dfr,document_body_dfr)

In [27]:
print('we have made an index called {} with {} documents'.format(index_name, es.cat.count(index=index_name,h=['count'])))

we have made an index called prior_cases with 2006
 documents


In [28]:
print('we have made an index called {} with {} documents'.format(index_name_dfr, es.cat.count(index=index_name_dfr,h=['count'])))

we have made an index called prior_cases_dfr with 2006
 documents


In [29]:
# Defining search function for BM25 implementation which returns the results as a list
def search(index_name, query_body):
  # return only doc_id and rank
  results = es.search(index=index_name, body=query_body, explain=False)
  plain_results = [(x['_source']['title'], x['_score']) for x in results['hits']['hits']]
  return results, plain_results

In [30]:
# Defining the search function for DFR implementation which returns the results as a list
def search_dfr(index_name_dfr, query_body):
  # return only doc_id and rank
  results_dfr = es.search(index=index_name_dfr, body=query_body, explain=False)
  plain_results_dfr = [(x['_source']['title'], x['_score']) for x in results['hits']['hits']]
  return results_dfr, plain_results_dfr

###Defining query body for BM25 and calling the search function with this query_body

###Getting the input from user, which eventually will be the processed and filtered query

In [39]:
query_body = {
  "query": {
    "match": {
        "body": input("Enter the query")
      }
    }
  }
results, plain_results_bm25 = search(index_name, query_body)
plain_results_bm25

Enter the queryabrogate irrecoverable roadblocks reinvested thirtieth presumes posited partys obtrusive undoes lunchroom longterm newlyfloated irreconcilability creche inferentially decongestion wellrecognized stimulating dictionaries restroom relocation rationalization incongruity transitional concentration constructs investing supersede livestock omit incurs congestion favouring relieving sway introduces enure undivided library poultry canteen poses owning repeals attributing constructing manifested window utilize insert obiter dicta shifting postulate incuriam recreational exports expires hazards fulfilment conceivable utilization repealing textual supersession detract redundant presents debts shifts inadvertence decidendi selfsame enacts declaratory deducting superseded appropriated utilised


[('prior_case_0972.txt', 70.714516),
 ('prior_case_0131.txt', 43.986588),
 ('prior_case_1612.txt', 40.44661),
 ('prior_case_1659.txt', 37.371014),
 ('prior_case_0099.txt', 37.363853),
 ('prior_case_1003.txt', 37.001842),
 ('prior_case_1037.txt', 33.191666),
 ('prior_case_1240.txt', 32.109276),
 ('prior_case_1604.txt', 31.270605),
 ('prior_case_0808.txt', 30.416868)]

###Here, we have defined query body and called the search function for DFR model implementation

###Here query is passed as string, which is processed and filtered query copied from the Query processing notebook

In [40]:
query_body_dfr = {
    "query":{
        "match": {
            "body":  "abrogate irrecoverable roadblocks reinvested thirtieth presumes posited partys obtrusive undoes lunchroom longterm newlyfloated irreconcilability creche inferentially decongestion wellrecognized stimulating dictionaries restroom relocation rationalization incongruity transitional concentration constructs investing supersede livestock omit incurs congestion favouring relieving sway introduces enure undivided library poultry canteen poses owning repeals attributing constructing manifested window utilize insert obiter dicta shifting postulate incuriam recreational exports expires hazards fulfilment conceivable utilization repealing textual supersession detract redundant presents debts shifts inadvertence decidendi selfsame enacts declaratory deducting superseded appropriated utilised construct pollution incentive glaring assessees hereto delete earning earned facilitating advances missed municipality embark column omitting oversight whilst purchases nugatory"
        }
    }
}
results, plain_results_dfr = search(index_name_dfr, query_body_dfr)
plain_results_dfr

[('prior_case_0972.txt', 57.524017),
 ('prior_case_0131.txt', 44.6589),
 ('prior_case_1593.txt', 42.77276),
 ('prior_case_1604.txt', 41.839493),
 ('prior_case_1667.txt', 37.597076),
 ('prior_case_0890.txt', 37.380245),
 ('prior_case_1239.txt', 36.917564),
 ('prior_case_1151.txt', 36.789234),
 ('prior_case_0111.txt', 36.589836),
 ('prior_case_1567.txt', 36.33386)]

#Evaluation

###Reading the qrel file for the Evaluation

In [34]:
f = open("qrel.txt", "r")
q_rel = f. read()
qrel = q_rel.split("\n")
f. close()

###Creating a dictionary with query document as key and all relevant documents as values for that particular query document

In [35]:
from collections import defaultdict
qrel_dict = defaultdict(list)
for doc in qrel:
  try:
    query_id=doc.split(" ")[0]
    doc_id=doc.split(" ")[2]
    qrel_dict[query_id].append(doc_id)
  except:
    pass
print(qrel_dict)

defaultdict(<class 'list'>, {'current_case_0051': ['prior_case_1593', 'prior_case_0895', 'prior_case_1463', 'prior_case_1612', 'prior_case_1644'], 'current_case_0184': ['prior_case_1282', 'prior_case_0317', 'prior_case_0639', 'prior_case_0735', 'prior_case_0343'], 'current_case_0160': ['prior_case_0599', 'prior_case_0277', 'prior_case_1042', 'prior_case_0634', 'prior_case_1133'], 'current_case_0142': ['prior_case_0688', 'prior_case_1296', 'prior_case_0413', 'prior_case_1227', 'prior_case_0327'], 'current_case_0166': ['prior_case_1131', 'prior_case_1566', 'prior_case_0649', 'prior_case_1331', 'prior_case_1515'], 'current_case_0035': ['prior_case_0646', 'prior_case_0133', 'prior_case_1225', 'prior_case_0428', 'prior_case_1284'], 'current_case_0061': ['prior_case_0114', 'prior_case_0881', 'prior_case_1205', 'prior_case_0627', 'prior_case_0445'], 'current_case_0121': ['prior_case_0408', 'prior_case_0603', 'prior_case_0409', 'prior_case_1482', 'prior_case_1804'], 'current_case_0078': ['prio

###Getting the relevant documents for a query in a list

In [41]:
query_id=input("Enter query_id")
case_list = qrel_dict[query_id]
case_list

Enter query_idcurrent_case_0183


['prior_case_1659',
 'prior_case_0131',
 'prior_case_0008',
 'prior_case_0822',
 'prior_case_0972']

####Defining a function calculate_evaluations which calculates precision, recall and f1 score for both the models. We are taking only the first five documents retrieved as our dataset has five relevant documents for every query.

In [42]:
def calculate_evaluations(case_list):

  retrieved_bm25= [doc[0].split('.')[0] for doc in plain_results_bm25] # take only the document id, rather than score
  retrieved_bm25=retrieved_bm25[0:5]
  retrieved_dfr= [doc[0].split('.')[0] for doc in plain_results_dfr]
  retrieved_dfr=retrieved_dfr[0:5]
  TP_bm25=(set(case_list).intersection(set(retrieved_bm25)))
  TP_bm25_len = len(TP_bm25)
  FP_bm25=set(retrieved_bm25)-TP_bm25
  FP_bm25_len=len(FP_bm25)
  TP_dfr=(set(case_list).intersection(set(retrieved_dfr)))
  TP_dfr_len = len(TP_dfr)
  FP_dfr=set(retrieved_dfr)-TP_dfr
  FP_dfr_len=len(FP_dfr)
  FN_bm25= len(set(case_list)-TP_bm25)
  FN_dfr= len(set(case_list)-set(retrieved_dfr))
 

  precision_bm25 = TP_bm25_len / (TP_bm25_len+FP_bm25_len)
  precision_dfr = TP_dfr_len / (TP_dfr_len+FP_dfr_len)
  recall_bm25=TP_bm25_len/(TP_bm25_len+FN_bm25)
  recall_dfr=TP_dfr_len/(TP_dfr_len+FN_dfr)
  f1_bm25=2*precision_bm25*recall_bm25 / (precision_bm25+recall_bm25)
  f1_dfr=2*precision_dfr*recall_dfr / (precision_dfr+recall_dfr)

  return TP_bm25_len,TP_dfr_len,precision_bm25, precision_dfr,recall_bm25,recall_dfr,f1_bm25,f1_dfr

##Calling the above function

In [43]:
tp_bm25,tp_dfr,p_bm25,p_dfr,r_bm25,r_dfr,f1_bm25,f1_dfr=calculate_evaluations(case_list)
print("True Positives for BM25:",tp_bm25,"\nPrecion_BM25:",p_bm25,"\nRecall_BM25:",r_bm25,"\nf1_score_BM25:",f1_bm25)
print("True positives for DFR",tp_dfr,"\nPrecion_DFR:",p_dfr,"\nRecall_DFR:",r_dfr,"\nf1_score_DFR:",f1_dfr)


True Positives for BM25: 3 
Precion_BM25: 0.6 
Recall_BM25: 0.6 
f1_score_BM25: 0.6
True positives for DFR 2 
Precion_DFR: 0.4 
Recall_DFR: 0.4 
f1_score_DFR: 0.4000000000000001
