In [1]:
# Mounting drive and importing necessary libraries
from google.colab import drive
import os, glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Loading the 200 query files, current_cases data
path= '/content/drive/MyDrive/Data/*.txt'
files = glob.glob(path)
len(files)

200

Creating table like structure for the list

In [3]:
data=[]
for file in files:
  f = open(file, 'r', encoding='cp1252')
  textfile = f.read()
  case_name = file.split('/')[-1]
  data.append([case_name, textfile])
  f.close()


In [4]:
df = pd.DataFrame(data,columns=['case_id', 'Case_Text'])
df

Unnamed: 0,case_id,Case_Text
0,current_case_0183.txt,"1\. The assessee, a private limited company, h..."
1,current_case_0153.txt,This criminal appeal arises from the final ord...
2,current_case_0011.txt,"In all these appeals, identical question of la..."
3,current_case_0140.txt,The State of Uttar Pradesh through CBI aggriev...
4,current_case_0002.txt,"The petitioners in all these petitions, served..."
...,...,...
195,current_case_0179.txt,"S. SAGHIR AHMAD, J. Habibur Rehman, who is sin..."
196,current_case_0161.txt,An eviction petition filed by the landlord-res...
197,current_case_0188.txt,**Judgement**\nIN THE SUPREME COURT OF INDIA C...
198,current_case_0041.txt,Bar Council of India by means of this writ pet...


# Query Cleaning

In [5]:
# Removing digits punctuation and less than 4 character words
df['Case_Text'] = df['Case_Text'].str.replace('\d+', '') # for digits
df['Case_Text'] = df['Case_Text'].str.replace(r'(\b\w{1,3}\b)', '') # for words
df['Case_Text'] = df['Case_Text'].str.replace('[^\w\s]', '') # for punctuation
df['Case_Text'] = df['Case_Text'].str.replace('_', '') # for underscores


df

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Unnamed: 0,case_id,Case_Text
0,current_case_0183.txt,assessee private limited company industri...
1,current_case_0153.txt,This criminal appeal arises from final order ...
2,current_case_0011.txt,these appeals identical question involved...
3,current_case_0140.txt,State Uttar Pradesh through aggrieved ord...
4,current_case_0002.txt,petitioners these petitions served Medical...
...,...,...
195,current_case_0179.txt,SAGHIR AHMAD Habibur Rehman since dead r...
196,current_case_0161.txt,eviction petition filed landlordrespondent ...
197,current_case_0188.txt,Judgement\n SUPREME COURT INDIA CRIMINAL APP...
198,current_case_0041.txt,Council India means this writ petition und...


###Creating a list corpus for all the query documents as a list

In [8]:
corpus=[]
corpus = df.loc[:,'Case_Text'].tolist()
corpus[0]

'  assessee  private limited company   industrial unit  Majiwada Thane which   notified urban area With  view  shift  industrial undertaking from  urban area   urban area  Kurukumbh Village Pune District Maharashtra  sold  land building  plant  machinery situated  Majiwada Thane  Shree Vardhman Trust   consideration    after deducting  amount    earned  capital gain   Since  intended  shift  industrial undertaking from  urban area    urban area    capital gain  earned  appellant paid    advances various amounts  different persons  purchase  land plant  machinery construction  factory building  Such advances amounted     year   appellant claimed exemption under Section    Income     entire capital gain earned from  sale proceeds   erstwhile industrial undertaking situate  Thane  view   advances  made being more than  capital gain made  \n   order dated   Assessing Officer imposed    capital gains refusing  grant exemption   appellant under Section   reasons given were\n  have carefully 

PorterStemmer used and then discardered

In [None]:
# import nltk
# from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.stem import PorterStemmer
# nltk.download('punkt')

# porter=PorterStemmer()

# def stemSentence(sentence):
#     token_words=word_tokenize(sentence)
#     token_words
#     stem_sentence=[]
#     for word in token_words:
#         stem_sentence.append(porter.stem(word))
#         stem_sentence.append(" ")
#     return "".join(stem_sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# stemmed_corpus = []
# for file in corpus:
#   stem_file = stemSentence(file)
#   stemmed_corpus.append(stem_file)
# # print(stemmed_corpus.shape())

# df['stemmed_text'] = stemmed_corpus
# df

Unnamed: 0,case_id,Case_Text,stemmed_text
0,current_case_0183.txt,assessee private limited company industri...,assesse privat limit compani industri unit maj...
1,current_case_0153.txt,This criminal appeal arises from final order ...,thi crimin appeal aris from final order judgme...
2,current_case_0011.txt,these appeals identical question involved...,these appeal ident question involv sake brevit...
3,current_case_0140.txt,State Uttar Pradesh through aggrieved ord...,state uttar pradesh through aggriev order date...
4,current_case_0002.txt,petitioners these petitions served Medical...,petition these petit serv medic dental veterin...
...,...,...,...
195,current_case_0179.txt,SAGHIR AHMAD Habibur Rehman since dead r...,saghir ahmad habibur rehman sinc dead repres p...
196,current_case_0161.txt,eviction petition filed landlordrespondent ...,evict petit file landlordrespond urg ground ev...
197,current_case_0188.txt,Judgement\n SUPREME COURT INDIA CRIMINAL APP...,judgement suprem court india crimin appel juri...
198,current_case_0041.txt,Council India means this writ petition und...,council india mean thi writ petit under articl...


Creating an query_id and query text pair for every query

In [7]:
Id_Text_pair = []
# for file,corpus in zip(files,stemmed_corpus):
for file,corpus in zip(files,corpus):
  case_id = file.split('/')[-1]
  text = corpus
  Id_Text_pair.append((case_id,text))
Id_Text_pair[0][0]

'current_case_0183.txt'

###Creating and index for all the query documents



In [9]:
vectorizer = CountVectorizer(stop_words='english')
# documents_vectorized = vectorizer.fit_transform(stemmed_corpus)
documents_vectorized = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names_out()
vocabulary.shape

(22860,)

In [10]:
dataframe = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
dataframe

Unnamed: 0,aadil,aakash,aarif,aarzoo,aati,abadi,abadis,abandon,abandoned,abandoning,...,zile,zilla,zindabad,zindabads,zing,zonal,zone,zones,zoning,zoology
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Getting document frequency for all unique words in the query corpus
dfs = (dataframe > 0).sum(axis=0) ## if value is greater than 0 than the dataframe>0 returns 1 and sum function adds all such 1 values
dfs

aadil       1
aakash      1
aarif       1
aarzoo      1
aati        1
           ..
zonal       3
zone       12
zones       3
zoning      1
zoology     1
Length: 22860, dtype: int64

In [12]:
# Here we are getting number of unique words in N 
N = dataframe.shape[0] 
np.array(dataframe.columns)

array(['aadil', 'aakash', 'aarif', ..., 'zones', 'zoning', 'zoology'],
      dtype=object)

In [13]:
# Calculatinf Inverse Document Frequency score for all the words
idfs = np.log10(N/dfs)
idfs

aadil      2.301030
aakash     2.301030
aarif      2.301030
aarzoo     2.301030
aati       2.301030
             ...   
zonal      1.823909
zone       1.221849
zones      1.823909
zoning     2.301030
zoology    2.301030
Length: 22860, dtype: float64

In [14]:
# Creating a dataframe from above data
idf_table = np.array((np.array(dataframe.columns),np.array(idfs))).T
df_idf = pd.DataFrame(idf_table, columns=['word', 'score'])
df_idf

Unnamed: 0,word,score
0,aadil,2.30103
1,aakash,2.30103
2,aarif,2.30103
3,aarzoo,2.30103
4,aati,2.30103
...,...,...
22855,zonal,1.823909
22856,zone,1.221849
22857,zones,1.823909
22858,zoning,2.30103


In [15]:
# Creating a dictionary for idf values
idf_dict = {}
for word,score in zip(df_idf.loc[:,'word'],df_idf.loc[:,'score']):
  idf_dict[word]= score
idf_dict

{'aadil': 2.3010299956639813,
 'aakash': 2.3010299956639813,
 'aarif': 2.3010299956639813,
 'aarzoo': 2.3010299956639813,
 'aati': 2.3010299956639813,
 'abadi': 2.3010299956639813,
 'abadis': 2.3010299956639813,
 'abandon': 1.5228787452803376,
 'abandoned': 1.1870866433571443,
 'abandoning': 2.0,
 'abandonment': 1.3979400086720377,
 'abandons': 2.3010299956639813,
 'abate': 1.8239087409443189,
 'abated': 1.4559319556497243,
 'abatement': 2.3010299956639813,
 'abates': 2.3010299956639813,
 'abating': 2.3010299956639813,
 'abbas': 2.0,
 'abbreviate': 2.3010299956639813,
 'abbreviated': 2.3010299956639813,
 'abbreviation': 2.3010299956639813,
 'abbreviations': 2.3010299956639813,
 'abdicate': 2.0,
 'abdicated': 2.3010299956639813,
 'abdicates': 2.3010299956639813,
 'abdication': 2.3010299956639813,
 'abdomen': 1.4559319556497243,
 'abdominal': 1.6020599913279625,
 'abducting': 2.3010299956639813,
 'abduction': 2.3010299956639813,
 'abducts': 2.3010299956639813,
 'abdul': 1.301029995663981

###Defining a function which takes a query document as input and sorts the document in decreasing order of IDF scores and consequently, filtering the top 80 words with higher IDF scores

###Also took the noun chunks from the above words using Natural Language Processing(Spacy Library), but again this implementation is commented out to increase the precision

In [16]:
# Defining function for filtering a particular query
import spacy
def filter_query(query_number):
  query_id=Id_Text_pair[query_number][0]
  set1=Id_Text_pair[query_number][1].split() # getting first document
  set1=set(set1)
  df_doc = pd.DataFrame(set1,columns=['word'])
  df_merge = pd.merge(df_idf,df_doc, how = 'inner', on ="word")
  df_merge =df_merge.sort_values('score', ascending=False)

  words= list(df_merge['word'])
  top_words= words[0:80]
  top_words_str=' '.join(top_words)
  # all_words_str=' '.join(words)

  # nlp = spacy.load("en_core_web_sm")
  # doc = nlp(all_words_str)
  # chunked = []
  # for chunk in doc.noun_chunks:
  #   chunked.append(chunk.root.text)
  # chunked_str=' '.join(chunked)
  return query_id,top_words_str

In [18]:
filtered = filter_query(int(input("Enter query number")))
filtered

Enter query number0


('current_case_0183.txt',
 'abrogate irrecoverable roadblocks reinvested thirtieth presumes posited partys obtrusive undoes lunchroom longterm newlyfloated irreconcilability creche inferentially decongestion wellrecognized stimulating dictionaries restroom relocation rationalization incongruity transitional concentration constructs investing supersede livestock omit incurs congestion favouring relieving sway introduces enure undivided library poultry canteen poses owning repeals attributing constructing manifested window utilize insert obiter dicta shifting postulate incuriam recreational exports expires hazards fulfilment conceivable utilization repealing textual supersession detract redundant presents debts shifts inadvertence decidendi selfsame enacts declaratory deducting superseded appropriated utilised')