In [None]:
!pip install tensorflow_text
!pip install nlpaug
#!pip install tensorflow-addons
#!pip install transformers


In [None]:
import pickle
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import nlpaug.augmenter.word as naw
from imblearn.under_sampling import RandomUnderSampler
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Set the global Keras mixed-precision policy to 'mixed_float16'.
tf.keras.mixed_precision.set_global_policy('mixed_float16')



In [None]:
# Read the CSV and print the number of rows per value of the `Label` column.
csv_path = '/content/train_output - train_output(1).csv'
ds = pd.read_csv(csv_path)
class_distribution = ds['Label'].value_counts()
print(class_distribution)

In [None]:
def preprocess_text(text):
    """Replace every newline character in ``text`` with a single space.

    Args:
        text: The value of one cell. Normally a ``str``.

    Returns:
        The string with each ``'\\n'`` replaced by ``' '``. Non-string values
        (for example ``None`` or the float NaN pandas uses for an empty cell)
        are returned unchanged instead of raising ``TypeError``, so rows with
        missing text can still be dropped after this function has been applied.
    """
    if not isinstance(text, str):
        return text
    # A literal replacement; no regular expression is needed for a fixed character.
    return text.replace('\n', ' ')

# Run preprocess_text over every value of the `Text` column.
ds['Text'] = ds['Text'].map(preprocess_text)

In [None]:
# dropna returns a new frame; it must be assigned back or no row is actually removed.
ds = ds.dropna(axis=0, how='any')
ds.shape

In [None]:
# Keep only the rows whose label is one of these four classes.
# .copy() makes new_ds an independent frame, so later column assignments on it
# neither raise SettingWithCopyWarning nor depend on whether pandas returned a view.
kept_labels = ['FAC', 'ARG_RESPONDENT', 'ARG_PETITIONER', 'ISSUE']
new_ds = ds.loc[ds['Label'].isin(kept_labels)].copy()
#new_ds = ds

In [None]:
# Distinct values of the `Label` column, in order of first appearance.
unique_classes = pd.unique(new_ds['Label'])
print(f"Unique classes: {unique_classes}")

In [None]:
# The `Label` column is deliberately left as the original class-name strings here.
# Encoding it in place with a throwaway LabelEncoder meant the `label_encoder`
# fitted later on `y` only ever saw integers, so its inverse_transform could
# not give back class names. The later fit_transform on `y` now performs the one
# and only string -> integer encoding (same integer codes, since LabelEncoder
# sorts the classes).
new_ds = new_ds.copy()

In [None]:
# Optional export of the filtered frame (disabled). Kept as comments: a trailing
# string literal would become the cell's last expression and be displayed instead
# of the frame.
# name = "new_ds.csv"
# new_ds.to_csv(name, index=False)
new_ds.head()

In [None]:
# Features are the `Text` column, targets the `Label` column.
X, y = new_ds['Text'], new_ds['Label']

In [None]:
# Fit the encoder on the targets, then replace them by their integer codes.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [None]:
# Hold out 33% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=47,
)


In [None]:
def augment_text(text, num_aug=3):
    """Produce synonym-substituted variants of ``text`` with nlpaug's ``SynonymAug``.

    Args:
        text: The sentence to augment.
        num_aug: How many times the augmenter is run on ``text``.

    Returns:
        A flat list of augmented strings. ``augment`` is called ``num_aug``
        times; whether each call returns a bare string or a list of strings,
        the results are flattened so callers always receive a list of ``str``.
    """
    aug = naw.SynonymAug()
    augmented_texts = []
    for _ in range(num_aug):
        result = aug.augment(text)
        if isinstance(result, str):
            augmented_texts.append(result)
        else:
            # Some nlpaug versions wrap the output in a list; unwrap it.
            augmented_texts.extend(result)
    return augmented_texts

def _augment_samples(texts, labels, target_label=1):
    """Return (texts, labels) of augmented copies for samples labelled ``target_label``."""
    extra_texts, extra_labels = [], []
    # `sentence` (not `text`) so the notebook-level `text` module alias is not overwritten.
    for sentence, label in zip(texts, labels):
        if label == target_label:
            variants = augment_text(sentence)
            extra_texts.extend(variants)
            extra_labels.extend([label] * len(variants))
    return extra_texts, extra_labels


def _with_augmented(texts, labels, extra_texts, extra_labels):
    """Append the augmented samples to a split and return (texts, labels) as pandas Series."""
    stacked_texts = np.concatenate(
        (texts.values.reshape(-1, 1), np.array(extra_texts, dtype=object).reshape(-1, 1))
    )
    # Reuse the dtype of the original labels so an empty augmentation list
    # does not silently turn the integer labels into floats.
    stacked_labels = np.concatenate(
        (labels, np.array(extra_labels, dtype=np.asarray(labels).dtype))
    )
    return pd.Series(stacked_texts.squeeze()), pd.Series(stacked_labels.squeeze())


# Only samples whose encoded label equals 1 are augmented.
# NOTE(review): the held-out split is augmented as well, so the test metrics are
# computed partly on synthetic sentences - confirm this is intended.
X_train_augmented, y_train_augmented = _augment_samples(X_train, y_train)
X_test_augmented, y_test_augmented = _augment_samples(X_test, y_test)

X_train_resampled, y_train_resampled = _with_augmented(
    X_train, y_train, X_train_augmented, y_train_augmented
)
X_test_resampled, y_test_resampled = _with_augmented(
    X_test, y_test, X_test_augmented, y_test_augmented
)

In [None]:

def _one_hot_tensor(labels):
    """One-hot encode integer labels over 13 classes and return a float32 tensor."""
    one_hot = tf.keras.utils.to_categorical(labels, num_classes=13)
    return tf.convert_to_tensor(one_hot, dtype=tf.float32)


# String tensors for the texts, one-hot float tensors for the labels.
X_train_tensor = tf.convert_to_tensor(X_train_resampled, dtype=tf.string)
X_test_tensor = tf.convert_to_tensor(X_test_resampled, dtype=tf.string)

y_train_tensor = _one_hot_tensor(y_train_resampled)
y_test_tensor = _one_hot_tensor(y_test_resampled)

In [None]:
# TF Hub handles for the BERT text preprocessor and the matching encoder.
preprocess_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(preprocess_handle)
bert_encoder = hub.KerasLayer(encoder_handle)

In [None]:
# Symbolic graph: raw string input -> BERT preprocessing -> BERT encoder.
text_input = tf.keras.layers.Input(name='text', shape=(), dtype=tf.string)
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
# Display the pooled output tensor of the encoder.
outputs['pooled_output']

In [None]:
def _bert_pooled_embeddings(texts, batch_size=32):
    """Return BERT's ``pooled_output`` for every string in ``texts`` as one tensor.

    The texts are pushed through the preprocessor and encoder in chunks of
    ``batch_size`` rather than in a single call, so the activations of the whole
    split never have to be held in memory at the same time.
    """
    values = list(texts)
    pooled_chunks = []
    # max(..., 1) guarantees one (possibly empty) batch so tf.concat always has an input.
    for start in range(0, max(len(values), 1), batch_size):
        batch = tf.constant(values[start:start + batch_size], dtype=tf.string)
        pooled_chunks.append(bert_encoder(bert_preprocess(batch))["pooled_output"])
    return tf.concat(pooled_chunks, axis=0)


X_train_bert_embed = _bert_pooled_embeddings(X_train_resampled)

X_test_bert_embed = _bert_pooled_embeddings(X_test_resampled)

In [None]:
# Classification head on top of the encoder's pooled output.
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])

# Deeper head that is currently disabled (kept for reference):
# l = tf.keras.layers.Dense(128, activation='gelu', name="hidden")(l)
# l = tf.keras.layers.Dropout(0.5, name="dropout_2")(l)
#
# l = tf.keras.layers.Dense(64, activation='gelu', name="hidden_2")(l)
# l = tf.keras.layers.Dropout(0.1, name="dropout_3")(l)
#
# l = tf.keras.layers.Dense(32, activation='gelu', name="hidden_3")(l)
# l = tf.keras.layers.Dropout(0.1, name="dropout_4")(l)
#
# l = tf.keras.layers.Dense(16, activation='gelu', name="hidden_4")(l)
# l = tf.keras.layers.Dropout(0.1, name="dropout_5")(l)

# The notebook sets the global policy to mixed_float16; the softmax output layer is
# pinned to float32 so the class probabilities are not computed in half precision.
output_layer = tf.keras.layers.Dense(
    13, activation='softmax', dtype='float32', name="output"
)(l)

In [None]:
# Functional model: raw text in, 13-way softmax out.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[output_layer],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Metrics passed to every compile() call below.
METRICS = [
    tf.keras.metrics.F1Score(average=None, threshold=None, name='f1_score', dtype=None),
    tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
]
# Keyword arguments that are identical for all eight optimizers.
_shared_optimizer_kwargs = dict(
    clipnorm=None,
    clipvalue=None,
    global_clipnorm=None,
    use_ema=False,
    ema_momentum=0.99,
    jit_compile=True,
)

optimizer1 = tf.keras.optimizers.Adafactor(
    learning_rate=0.001,
    beta_2_decay=-0.8,
    epsilon_1=1e-30,
    epsilon_2=0.001,
    clip_threshold=1.0,
    relative_step=True,
    weight_decay=None,
    ema_overwrite_frequency=None,
    name='Adafactor',
    **_shared_optimizer_kwargs,
)

optimizer2 = tf.keras.optimizers.AdamW(
    learning_rate=0.001,
    weight_decay=0.004,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    ema_overwrite_frequency=None,
    name='AdamW',
    **_shared_optimizer_kwargs,
)

optimizer3 = tf.keras.optimizers.Lion(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.99,
    weight_decay=None,
    ema_overwrite_frequency=None,
    name='Lion',
    **_shared_optimizer_kwargs,
)

optimizer4 = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    weight_decay=None,
    ema_overwrite_frequency=None,
    name='Adam',
    **_shared_optimizer_kwargs,
)

optimizer5 = tf.keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.0,
    nesterov=False,
    weight_decay=None,
    ema_overwrite_frequency=None,
    name="SGD",
    **_shared_optimizer_kwargs,
)

optimizer6 = tf.keras.optimizers.Adadelta(
    learning_rate=0.001,
    rho=0.95,
    epsilon=1e-07,
    weight_decay=None,
    ema_overwrite_frequency=None,
    name="Adadelta",
    **_shared_optimizer_kwargs,
)

optimizer7 = tf.keras.optimizers.Adagrad(
    learning_rate=0.001,
    initial_accumulator_value=0.1,
    epsilon=1e-07,
    weight_decay=None,
    ema_overwrite_frequency=None,
    name="Adagrad",
    **_shared_optimizer_kwargs,
)

# RMSprop is the only optimizer here with a non-None ema_overwrite_frequency.
optimizer8 = tf.keras.optimizers.RMSprop(
    learning_rate=0.001,
    rho=0.9,
    momentum=0.0,
    epsilon=1e-07,
    centered=False,
    weight_decay=None,
    ema_overwrite_frequency=100,
    name="RMSprop",
    **_shared_optimizer_kwargs,
)



In [None]:
# One clone of the text classifier per optimizer, all compiled with the same loss and metrics.
# NOTE(review): clone_model builds new layers instead of sharing weights, and no
# fit() call on these clones is visible in this notebook, so as written the
# predictions below come from copies that were never trained - confirm whether
# a training step is missing.
_optimizers = [
    optimizer1, optimizer2, optimizer3, optimizer4,
    optimizer5, optimizer6, optimizer7, optimizer8,
]

base_models = []
for _optimizer in _optimizers:
    # `base_model` (not `model`) so the template model is not rebound and this
    # cell can be re-run without cloning the wrong object.
    base_model = tf.keras.models.clone_model(model)
    base_model.compile(optimizer=_optimizer, loss='categorical_crossentropy', metrics=METRICS)
    base_models.append(base_model)

# Keep the individual names available for any cell that refers to them.
model_1, model_2, model_3, model_4, model_5, model_6, model_7, model_8 = base_models

# Class-probability predictions of every base model on the training texts.
train_predictions = [base_model.predict(X_train_tensor) for base_model in base_models]



In [None]:
# Meta-features: the BERT pooled embedding followed by every base model's predictions,
# joined column-wise.
stacked_train_predictions = np.concatenate(train_predictions, axis=1)
X_train_combined = np.concatenate(
    (X_train_bert_embed, stacked_train_predictions), axis=1
)



In [None]:
# Predictions of every base model on the test texts. The comprehension variable is
# local, so the notebook-level `model` is not rebound to the last base model.
test_predictions = [base.predict(X_test_tensor) for base in base_models]

# Same column layout as the training meta-features: embedding first, then predictions.
stacked_test_predictions = np.hstack(test_predictions)
X_test_combined = np.hstack((X_test_bert_embed, stacked_test_predictions))

In [None]:
# Hyper-parameter values explored by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
)

In [None]:
# Base estimator handed to the grid search; the seed is fixed for repeatable results.
gb_seed = 42
gb_classifier = GradientBoostingClassifier(random_state=gb_seed)

In [None]:
# 5-fold grid search over param_grid, scored by macro F1, using all CPU cores.
grid_search = GridSearchCV(
    gb_classifier,
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
grid_search.fit(X_train_combined, np.array(y_train_resampled))

best_params = grid_search.best_params_
best_n_estimators = best_params['n_estimators']
best_learning_rate = best_params['learning_rate']

In [None]:
# Report the winning hyper-parameters.
print(f"Best n_estimators: {best_n_estimators}")
print(f"Best learning_rate: {best_learning_rate}")

In [None]:
# GridSearchCV (refit=True by default) has already refitted an estimator with the
# best parameters on the full training data, using the same random_state=42.
# Reuse it instead of training an identical classifier a second time.
gb_classifier = grid_search.best_estimator_
model = gb_classifier




In [None]:
# Predict encoded labels for the test meta-features, then map them back through label_encoder.
predicted_labels = model.predict(X_test_combined)
predicted_class_labels = label_encoder.inverse_transform(predicted_labels)




In [None]:
# Show the first 20 decoded predictions.
print(predicted_class_labels[:20])

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Decode both the true and the predicted test labels through label_encoder.
true_class_labels = label_encoder.inverse_transform(y_test_resampled)
test_predictions = gb_classifier.predict(X_test_combined)
predicted_class_labels = label_encoder.inverse_transform(test_predictions)

conf_matrix = confusion_matrix(true_class_labels, predicted_class_labels)
print("Confusion Matrix:", conf_matrix, sep="\n")

class_report = classification_report(true_class_labels, predicted_class_labels)
print("Classification Report:", class_report, sep="\n")



In [None]:
# accuracy_score, precision_score, recall_score and f1_score are imported from
# sklearn.metrics in the notebook's import cell; without that import this cell
# fails with NameError.
accuracy = accuracy_score(true_class_labels, predicted_class_labels)
# Precision, recall and F1 are macro-averaged: every class counts equally.
precision = precision_score(true_class_labels, predicted_class_labels, average='macro')
recall = recall_score(true_class_labels, predicted_class_labels, average='macro')
f1 = f1_score(true_class_labels, predicted_class_labels, average='macro')

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value}")

In [None]:
##################################################################



In [None]:
# At this point the trained model is the scikit-learn GradientBoostingClassifier,
# which has no Keras-style .save() method, so it is persisted with pickle instead.
# The file gets a .pkl extension because its content is a pickle, not HDF5.
with open("NLP_TEXT_CLASSIFICATION_saved_model.pkl", "wb") as model_file:
    pickle.dump(gb_classifier, model_file)


In [None]:
# Load a saved Keras model from disk.
# NOTE(review): this path differs from the file name written by the save cell - confirm
# which artifact is meant to be loaded here.
saved_model_path = "LLP_saved_model"
model = tf.keras.models.load_model(saved_model_path)
def check(line, classifier=None):
    """Predict and print the class name of each statement in ``line``.

    Args:
        line: A single string or a sequence of strings to classify.
        classifier: Optional object with a ``predict`` method that returns one row
            of class scores per input. Defaults to the notebook-level ``model``.

    Returns:
        The list of predicted class names, in input order (each one is also printed).
    """
    # Index -> class name table for the 13 outputs.
    # NOTE(review): assumed to match the output order of the loaded model - confirm.
    dic = {0: 'ANALYSIS', 1: 'ARG_PETITIONER', 2: 'ARG_RESPONDENT', 3: 'FAC', 4: 'ISSUE', 5: 'NONE', 6: 'PREAMBLE', 7: 'PRE_NOT_RELIED', 8: 'PRE_RELIED', 9: 'RATIO', 10: 'RLC', 11: 'RPC', 12: 'STA'}

    clf = model if classifier is None else classifier
    # Pass an array of strings to predict() rather than a plain Python list of str.
    batch = np.asarray([line] if isinstance(line, str) else list(line))
    prediction = clf.predict(batch)
    predicted_labels = np.argmax(prediction, axis=1)

    # The argmax index is looked up directly. Routing it through the session's
    # label_encoder first either changed nothing or raised for indices that
    # encoder had never seen.
    names = []
    for index in predicted_labels:
        name = dic.get(int(index))
        if name is not None:
            print(name)
            names.append(name)
    return names
# Put your statement here:
sample_statement = '3. The court below answered the above points in the affirmative and accordingly had convicted the accused to undergo simple imprisonment for three years and pay a fine of Rs.5,000/- for the offence punishable under section 498-A IPC and 10 years simple imprisonment and to pay fine of Rs.10,000/- for the offence punishable under Section 306 of the IPC.'
line = [sample_statement]
check(line)
