## Importing Packages

In [22]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import nltk
import random

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sathvikjammula/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sathvikjammula/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sathvikjammula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_excel('classification new1.xlsx')
df.head()


Unnamed: 0,Id,Judgment,Domain,Judgment Status,Equivalent Citation,Judgment Name,Court,Label
0,1,""" Being aggrieved by the judgment delivered on...",Civil,Appeal dismissed,CIVIL APPEAL NO. 3880 OF 2003,RAJASTHAN R.S.S. & GINNING MILLS FED.LTD Vs. D...,Supreme Court,0
1,2,"""This was an appeal from a judgment and decree...",Civil,Appeal dismissed,"1950 AIR, 7 1950 SCR 64",Messrs. Khimji Poonja And Company vs Shri Bald...,Supreme Court,0
2,3,"""This was an application under article 32 (1) ...",Civil,Petition allowed,"1950 AIR 163, 1950 SCR 566","Rashid Ahmed vs The Municipal Board, Kairana.T...",Supreme Court,0
3,4,"""APPEAL from a judgment of the High Court of H...",Civil,Appeal allowed,"1953 AIR 413, 1950 SCR 747",Supreme Court Of India ... vs Kadar Unnisa Beg...,Supreme Court,0
4,5,"""APPEAL under article 374(4) of the Constituti...",Civil,Appeal allowed,"1950 AIR 274, 1950 SCR 741",Supreme Court Of India ... vs Nandlal on 12 O...,Supreme Court,0


## Balancing

In [4]:
import pandas as pd

# count the number of occurrences of each label
label_counts = df['Label'].value_counts()

# determine the label with the fewest occurrences
min_count = label_counts.min()

# create an empty dataframe to hold the balanced dataset
balanced_df = pd.DataFrame()

# loop through each label
for label in label_counts.index:

    # get all entries with this label
    label_df = df[df['Label'] == label]

    # duplicate entries to match the number of the minority class
    duplicates_df = label_df.sample(n=min_count, replace=True)

    # concatenate the original and duplicated entries
    balanced_df = pd.concat([balanced_df, label_df, duplicates_df])

# save the balanced dataset
balanced_df.to_excel('balanced_dataset.xlsx', index=False)

In [5]:
data = pd.read_excel('balanced_dataset.xlsx')

In [6]:
data.head()

Unnamed: 0,Id,Judgment,Domain,Judgment Status,Equivalent Citation,Judgment Name,Court,Label
0,1,""" Being aggrieved by the judgment delivered on...",Civil,Appeal dismissed,CIVIL APPEAL NO. 3880 OF 2003,RAJASTHAN R.S.S. & GINNING MILLS FED.LTD Vs. D...,Supreme Court,0
1,2,"""This was an appeal from a judgment and decree...",Civil,Appeal dismissed,"1950 AIR, 7 1950 SCR 64",Messrs. Khimji Poonja And Company vs Shri Bald...,Supreme Court,0
2,3,"""This was an application under article 32 (1) ...",Civil,Petition allowed,"1950 AIR 163, 1950 SCR 566","Rashid Ahmed vs The Municipal Board, Kairana.T...",Supreme Court,0
3,4,"""APPEAL from a judgment of the High Court of H...",Civil,Appeal allowed,"1953 AIR 413, 1950 SCR 747",Supreme Court Of India ... vs Kadar Unnisa Beg...,Supreme Court,0
4,5,"""APPEAL under article 374(4) of the Constituti...",Civil,Appeal allowed,"1950 AIR 274, 1950 SCR 741",Supreme Court Of India ... vs Nandlal on 12 O...,Supreme Court,0


## Data Augmentation

In [7]:
import random
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize

def synonym_replacement(words, n):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(words)
    new_words = list(words)
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    sentence = ' '.join(new_words)
    return sentence


def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonyms.append(synonym)
    return set(synonyms)

def random_swap(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    sentence = ' '.join(new_words)
    return sentence

def swap_word(new_words):
    random_idx_1 = random.randint(0, len(new_words) - 1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words) - 1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
    return new_words

def random_delete(sentence, p):
    words = sentence.split()
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)
    sentence = ' '.join(new_words)
    return sentence

def augment_data(df, n_augmentations=3, p_delete=0.2, n_swap=2, n_replace=1):
    df_augmented = pd.DataFrame(columns=df.columns)
    for index, row in df.iterrows():
        for i in range(n_augmentations):
            new_row = row.copy()
            new_row['Judgment'] = random_swap(row['Judgment'], n_swap)
            new_row['Judgment'] = synonym_replacement(new_row['Judgment'], n_replace)
            new_row['Judgment'] = random_delete(new_row['Judgment'], p_delete)
            df_augmented = df_augmented.append(new_row, ignore_index=True)
    return df_augmented


In [8]:
df = pd.DataFrame(data)

# Augment the dataset
df_augmented = augment_data(df, n_augmentations=3, p_delete=0.2, n_swap=2, n_replace=1)

# Print the original and augmented datasets
print("Original Dataset:")
print(df)

print("Augmented Dataset:")
print(df_augmented)

  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augme

Original Dataset:
      Id                                           Judgment  \
0      1  " Being aggrieved by the judgment delivered on...   
1      2  "This was an appeal from a judgment and decree...   
2      3  "This was an application under article 32 (1) ...   
3      4  "APPEAL from a judgment of the High Court of H...   
4      5  "APPEAL under article 374(4) of the Constituti...   
..   ...                                                ...   
422  206  "The present revision petition has been filed ...   
423  278  "One Selvi (the petitioner, hereafter) has com...   
424  146  "(Passed on this 7th Day of April, 2022) The i...   
425  146  "(Passed on this 7th Day of April, 2022) The i...   
426  207  "(1) Counter affidavit filed on behalf of comp...   

                                                Domain     Judgment Status  \
0                                                Civil    Appeal dismissed   
1                                                Civil    Appeal dism

  df_augmented = df_augmented.append(new_row, ignore_index=True)
  df_augmented = df_augmented.append(new_row, ignore_index=True)


In [9]:
# assume that df is your DataFrame
df_augmented.to_excel('test1.xlsx', index=False)
df_augmented.head()

Unnamed: 0,Id,Judgment,Domain,Judgment Status,Equivalent Citation,Judgment Name,Court,Label
0,1,`` Being aggrieved by the judgment delivered o...,Civil,Appeal dismissed,CIVIL APPEAL NO. 3880 OF 2003,RAJASTHAN R.S.S. & GINNING MILLS FED.LTD Vs. D...,Supreme Court,0
1,1,`` Being aggrieved the judgment on 19th Septem...,Civil,Appeal dismissed,CIVIL APPEAL NO. 3880 OF 2003,RAJASTHAN R.S.S. & GINNING MILLS FED.LTD Vs. D...,Supreme Court,0
2,1,`` Being aggrieved by judgment on 19th Septemb...,Civil,Appeal dismissed,CIVIL APPEAL NO. 3880 OF 2003,RAJASTHAN R.S.S. & GINNING MILLS FED.LTD Vs. D...,Supreme Court,0
3,2,This an appeal a judgment and of Bombay High C...,Civil,Appeal dismissed,"1950 AIR, 7 1950 SCR 64",Messrs. Khimji Poonja And Company vs Shri Bald...,Supreme Court,0
4,2,`` This was an appeal from a and decree of Bom...,Civil,Appeal dismissed,"1950 AIR, 7 1950 SCR 64",Messrs. Khimji Poonja And Company vs Shri Bald...,Supreme Court,0


## Balancing the augmented data

In [10]:
# df4 = pd.read_excel("test1.xlsx")
df4 = pd.read_excel("balanced_dataset.xlsx")


In [11]:
from sklearn.utils import resample

counts = df4['Label'].value_counts()

# Determine the class with the fewest instances
min_count = min(counts)

# Create a new dataframe for each class with the same number of instances as the minority class
class_dfs = []
for i in range(11):
    class_df = df4[df4['Label'] == i]
    class_df = resample(class_df, n_samples=min_count, replace=True, random_state=42)
    class_dfs.append(class_df)

# Combine the dataframes into a balanced dataset
balanced_df = pd.concat(class_dfs)
balanced_counts = balanced_df['Label'].value_counts()
# print(balanced_counts)
# Save the balanced dataset
balanced_df.to_excel('aug_Balance.xlsx', index=False)

## Bagging

In [12]:

stop_words = set(stopwords.words('english')+['the', 'and', 'in', 'of', 'to', 'a','this','was','is','so','it','that','or','by','th'])
def remove_stop_words(sentence):
    pattern = r'\b(?:{})\b'.format('|'.join(stop_words))
    pattern1 = re.compile(r'\W+')

    sentence = re.sub(r'\d+', '', sentence)
    sentence = re.sub(pattern, lambda match: match.group().lower(), sentence, flags=re.IGNORECASE)

    words = sentence.split()
    filtered_words = [word for word in words if word.casefold() not in stop_words and not word.isdigit()]
    filtered_sentence = ' '.join(filtered_words)
    filtered_sentence = pattern1.sub(' ', filtered_sentence)

    return filtered_sentence


# Apply the remove_words function to the 'sentences' column and store the results in a new column
df_augmented['clean_sentences'] = df_augmented['Judgment'].apply(remove_stop_words)

# Save the updated dataframe to a new Excel file
df_augmented.to_excel('bagging.xlsx', index=False)

## Balancing

In [13]:
from sklearn.utils import resample
df5 = pd.read_excel("bagging.xlsx")
counts = df5['Label'].value_counts()

# Determine the class with the fewest instances
min_count = min(counts)

# Create a new dataframe for each class with the same number of instances as the minority class
class_dfs = []
for i in range(11):
    class_df = df5[df5['Label'] == i]
    class_df = resample(class_df, n_samples=min_count, replace=True, random_state=42)
    class_dfs.append(class_df)

# Combine the dataframes into a balanced dataset
balanced_df = pd.concat(class_dfs)
balanced_counts = balanced_df['Label'].value_counts()
# print(balanced_counts)
# Save the balanced dataset
balanced_df.to_excel('aug_Balance1.xlsx', index=False)

## CNN

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Embedding, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.datasets import make_classification

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score,log_loss
# Load the data from the CSV file
df6 = pd.read_excel('new_balanced.xlsx')

# Split the data into features (X) and labels (y)
X_text = df6.iloc[:, :2].apply(lambda x: ' '.join(x.astype(str)), axis=1)
y = df6.iloc[:, 2].values.astype(int) # Convert to integer type

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_text)

# Convert the vectorized features to a 3D array (batch size, sequence length, input dimension)
X = X.toarray()
X = np.expand_dims(X, axis=2)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## DropOut

In [16]:
from keras.layers import Dense, Activation, Flatten, Conv1D, MaxPooling1D, Dropout, BatchNormalization

# Define the CNN architecture
model = Sequential()
model.add(Conv1D(32, 3, activation='relu', input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(MaxPooling1D(2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(Dropout(0.2))
model.add(MaxPooling1D(2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(Dropout(0.2))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(11, activation='softmax')) # 11 classes: 0-10
model.add(Activation('softmax'))


Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



In [17]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 22408, 32)         128       
                                                                 
 dropout (Dropout)           (None, 22408, 32)         0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 11204, 32)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 11202, 64)         6208      
                                                                 
 dropout_1 (Dropout)         (None, 11202, 64)         0         
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 5601, 64)         0         
 1D)                                                    

In [19]:
# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/100


2023-06-05 10:25:12.921352: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
 7/44 [===>..........................] - ETA: 11s - loss: 2.2527 - accuracy: 0.2902

KeyboardInterrupt: 

In [None]:
model.save('trained_model.h5')

In [23]:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label="Training Loss")
plt.plot(history.history['val_loss'], label="Validation Loss") 
plt.title("Training and Validation Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.show() 

NameError: name 'history' is not defined

In [20]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)
# Evaluate the model on the test set
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Compute precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print('Precision: {:.4f}'.format(precision))
print('Recall: {:.4f}'.format(recall))
print('F1-score: {:.4f}'.format(f1))


Test loss: 2.3101112842559814
Test accuracy: 0.2328571379184723
Precision: 0.1456
Recall: 0.2712
F1-score: 0.1694


  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
plt.plot(history.history['accuracy'], label="Training Accuracy") 
plt.plot(history.history['val_accuracy'], label="Validation Accuracy") 
plt.title("Training and Validation Accuracy")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.legend()
plt.show() 

NameError: name 'plt' is not defined

In [None]:
m,n = model.evaluate(X_train, y_train)
print(n)

NameError: name 'model' is not defined