**Step-1: Importing Libraries**

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os
import functools
import re

In [2]:
# Fetch the NLTK resources used by this notebook. Packages that are already
# installed are only reported as up-to-date (see the cell output).
nltk.download('punkt')      # tokenizer models -- presumably what word_tokenize relies on
nltk.download('stopwords')  # stop-word lists read through stopwords.words('english')
nltk.download('wordnet')    # WordNet data -- presumably what WordNetLemmatizer relies on
# NOTE(review): no part-of-speech tagging appears in the visible cells; confirm
# that a later cell really needs this package.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\Ayush
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Ayush
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ayush
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ayush Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Folder whose files are loaded by get_file_contents. The path is relative, so
# it is resolved against the notebook's current working directory.
directory = "./judgement/"

**Step-2: Text Preprocessing**

In [4]:
def preprocess_text(text):
    """Turn a raw document into a list of cleaned, lemmatized tokens.

    Pipeline, in order:
      1. Delete every character that is neither a word character nor whitespace.
      2. Tokenize with NLTK's ``word_tokenize``.
      3. Keep only purely alphabetic tokens that are not English stop words
         (a handful of negation / question words are deliberately kept).
      4. Lemmatize each remaining token with ``WordNetLemmatizer``, except for
         a short list of tokens that are passed through untouched.

    Args:
        text: The raw document as a single string.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Removing characters which are not alphabets or numbers
    # (this also strips apostrophes, so "don't" reaches the tokenizer as "dont").
    text = re.sub(r'[^\w\s]','',text)

    # Tokenization
    words = word_tokenize(text)

    # Stop-word removal. Words listed here are removed from the stop-word set,
    # i.e. they are KEPT in the output.
    # NOTE(review): the entries containing an apostrophe can never match a
    # token because apostrophes were already stripped above -- confirm whether
    # negated contractions are meant to survive preprocessing.
    exclude_words = {"not","don't", 'should', "should've", "mightn't", 'mustn', "mustn't",'shouldn',
                     "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't",
                     'when', 'where', 'why', 'how',"couldn't","didn't","doesn't","hadn't","haven't"}

    # Keep this a set: the per-token membership test below is then a hash
    # lookup instead of a linear scan over the whole stop-word list.
    stopwords_set = set(stopwords.words('english')) - exclude_words

    # NOTE(review): this comparison is case-sensitive, whereas the exceptions
    # check below lower-cases the token first -- confirm inputs are lower-case.
    words = [word for word in words if word.isalpha() and word not in stopwords_set]

    # Lemmatization. Tokens in `exceptions` are emitted unchanged, so the
    # lemmatizer cannot strip their trailing "s".
    exceptions = {'rs','was','as','has','ms','vs'}
    lemmatizer = WordNetLemmatizer()
    return [
        word if word.lower() in exceptions else lemmatizer.lemmatize(word)
        for word in words
    ]

**Step-3: Feature Engineering**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Single module-level TF-IDF vectorizer shared by every vectorize_text call.
vectorizer = TfidfVectorizer()


def vectorize_text(corpus):
    """Fit the shared ``vectorizer`` on ``corpus`` and return the TF-IDF matrix.

    Each call re-fits the module-level vectorizer, so after the call its
    learned vocabulary reflects only this ``corpus``.

    Args:
        corpus: Iterable of strings; each element is treated as one document.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    return vectorizer.fit_transform(corpus)

**Step-4: File Loading**

In [6]:
@functools.lru_cache(maxsize=None)
def get_file_contents(file_path):
    """Load, preprocess and vectorize every regular file in a directory.

    Results are memoized per ``file_path`` by ``lru_cache``: a repeated call
    with the same path returns the very same dict objects without touching the
    disk again. Callers should therefore treat the returned dicts as read-only,
    and call ``get_file_contents.cache_clear()`` if the directory changes.

    Args:
        file_path: Path of the directory to scan (not recursive).

    Returns:
        tuple[dict, dict]: ``(files_data, vector_files_data)`` where
        ``files_data`` maps each file name to its list of processed tokens and
        ``vector_files_data`` maps each file name to the TF-IDF matrix returned
        by ``vectorize_text`` for those tokens.
    """
    files_data = {}  # Original Text Dictionary
    vector_files_data = {}  # Vectorized Text Dictionary

    # os.listdir returns entries in arbitrary order; sorting makes the
    # insertion order of both dictionaries reproducible across runs.
    for filename in sorted(os.listdir(file_path)):
        f = os.path.join(file_path, filename)

        # Skip anything that is not a regular file (for example a
        # ".ipynb_checkpoints" folder); open() would raise on a directory.
        if not os.path.isfile(f):
            continue

        # Read first, then process, so the handle is closed before the
        # comparatively slow preprocessing and vectorizing steps run.
        with open(f, 'r') as file:
            text = file.read()

        processed_text = preprocess_text(text)

        # NOTE(review): processed_text is a list of tokens, so vectorize_text
        # receives one token per "document" and is fitted separately for each
        # file -- confirm this is the intended TF-IDF setup.
        vector_files_data[filename] = vectorize_text(processed_text)
        files_data[filename] = processed_text

    return files_data, vector_files_data

In [7]:
# Load the corpus: files_data maps file name -> processed tokens,
# vector_files_data maps file name -> the vectors built from those tokens.
files_data,vector_files_data = get_file_contents(directory)

**Step-5: Printing the processed files**

In [8]:
# Print one "name:tokens" line for every processed file.
for key in files_data:
    value = files_data[key]
    print(f"{key}:{value}")

1953_L_1.txt:['one', 'lakshminarayana', 'iyer', 'hindu', 'brahmin', 'owned', 'considerable', 'property', 'tirunelveli', 'district', 'died', 'december', 'leaving', 'surviving', 'widow', 'ranganayaki', 'married', 'daughter', 'ramalakshmi', 'ramalakshmi', 'married', 'plaintiff', 'number', 'child', 'alive', 'december', 'when', 'lakshminarayana', 'died', 'death', 'executed', 'november', 'construction', 'controversy', 'appeal', 'gave', 'following', 'direction', 'lifetime', 'aforesaid', 'ranganayaki', 'amminal', 'wife', 'shall', 'till', 'lifetime', 'enjoy', 'aforesaid', 'entire', 'property', 'outstandings', 'due', 'debt', 'payable', 'chit', 'amount', 'payable', 'lifetime', 'ramalakshmi', 'ammal', 'daughter', 'wife', 'rama', 'ayyar', 'avergal', 'melagaram', 'village', 'heir', 'shall', 'enjoy', 'absolute', 'right', 'power', 'alienation', 'gift', 'exchange', 'sale', 'son', 'grandson', 'generation', 'regard', 'payment', 'maintenance', 'made', 'chinnanmal', 'alias', 'lakshmi', 'ammal', 'wife', 'la