In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize

In [None]:
import csv

# Parallel lists: texts[i] is the first column of data row i and labels[i]
# is its second column (None when the row has no second column).
texts = []
labels = []

# newline="" is what the csv module expects when it is handed a file object;
# without it, line breaks embedded inside quoted fields can be mis-parsed.
with open("/content/complaints.csv", "r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    # Skip the header line; the default keeps an empty file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        # csv.reader yields [] for blank lines; indexing row[0] would raise IndexError.
        if not row:
            continue
        text = row[0]
        label = row[1] if len(row) > 1 else None
        texts.append(text)
        labels.append(label)

In [2]:
# Download the 'punkt' NLTK resource (used by word_tokenize further down).
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Lower-case every document and split it into word tokens.
# `texts` is overwritten in place, so entries that are already token lists
# are passed through unchanged; this keeps the cell safe to re-run
# (otherwise the second run fails on `list.lower()`).
texts = [
    word_tokenize(doc.lower()) if isinstance(doc, str) else doc
    for doc in texts
]


In [3]:
import pickle

# Serialize the tokenized `texts` list to tokenized_texts.pkl.
with open('tokenized_texts.pkl', 'wb') as tokens_file:
    pickle.dump(texts, tokens_file)

In [None]:
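# NOTE: labels.pkl (read by the next cell) is produced by the block below.
# Run it once, right after loading the CSV and before `labels` is replaced
# by the encoded class ids.
# with open('labels.pkl', 'wb') as f:
#     pickle.dump(labels, f)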

In [4]:
# Read the label list back from labels.pkl into `labels_`.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you trust.
with open('labels.pkl', 'rb') as labels_file:
    labels_ = pickle.load(labels_file)

In [None]:
# Map every entry of `labels_` to an integer class id; the fitted encoder is
# kept so predictions can be mapped back with inverse_transform.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_)

# Fit the Keras tokenizer on the tokenized documents. This only builds the
# vocabulary; converting to sequences and padding happen in a later cell.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [None]:
# Serialize the fitted tokenizer to tokenizer.pkl.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Encode every document as a list of vocabulary indices.
sequences = tokenizer.texts_to_sequences(texts)

# Pad all sequences to the length of the longest one.
max_sequence_length = max(map(len, sequences))
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Hold out 20% of the samples for testing; the seed is fixed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

NameError: name 'texts' is not defined

In [None]:
# Save tokenized data and tokenizer
with open('tokenized_data.pkl', 'wb') as f:
    pickle.dump((X_train, y_train, X_test, y_test, tokenizer), f)


NameError: name 'X_train' is not defined

In [None]:
# Load tokenized data and tokenizer
with open('tokenized_data.pkl', 'rb') as f:
    X_train, y_train, X_test, y_test, tokenizer = pickle.load(f)

In [None]:
# X_train comes from pad_sequences(..., maxlen=max_sequence_length), so its
# second dimension IS the padded length. Reading it from the array avoids
# re-tokenizing the whole corpus and removes the dependency on `texts`,
# which is not available when the split is restored from tokenized_data.pkl.
max_sequence_length = X_train.shape[1]

In [None]:

# Hyper-parameters of the baseline model.
embedding_dim = 100
# One slot more than the number of known words (presumably index 0 is
# reserved for padding; confirm against the Keras Tokenizer docs).
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> LSTM -> softmax over the label classes.
keras_layers = tf.keras.layers
model = tf.keras.models.Sequential([
    keras_layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=max_sequence_length,
    ),
    keras_layers.LSTM(128),
    keras_layers.Dense(
        len(label_encoder.classes_),
        activation='softmax',
    ),
])

# Integer class ids as targets, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs, holding out 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
)

In [None]:
# Score the baseline model on the held-out test split.
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

In [None]:
# Per-class probabilities for every test sample.
y_pred_prob = model.predict(X_test)

# Pick the most probable class id in each row.
y_pred = y_pred_prob.argmax(axis=1)

# Map the integer ids (true and predicted) back to the original label values.
y_test_original, y_pred_original = (
    label_encoder.inverse_transform(encoded_ids)
    for encoded_ids in (y_test, y_pred)
)

# Per-class precision / recall / F1 summary.
report = classification_report(y_test_original, y_pred_original)
print(report)

In [None]:
# model_path = 'LSTM.h5'
# tf.keras.models.save_model(model, model_path)

In [None]:
# model_path = 'LSTM.keras'
# tf.keras.models.save_model(model, model_path)

In [None]:
# model.layers

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric: (key in history.history, display name).
for metric_key, metric_name in (('loss', 'Loss'), ('accuracy', 'Accuracy')):
    # Training curve and its validation counterpart share one set of axes.
    plt.plot(history.history[metric_key], label=f'Training {metric_name}')
    plt.plot(history.history['val_' + metric_key], label=f'Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.title(f'Training and Validation {metric_name}')
    plt.legend()
    plt.show()

In [None]:
# TODO: Is regularization needed? Compare the training and validation curves above for signs of overfitting.

In [None]:
# Hyper-parameters of the L2-regularized variant.
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1
l2_regularizer = 0.02  # L2 penalty factor passed to the LSTM's kernel_regularizer

# Same Embedding -> LSTM -> softmax stack, with an L2 kernel regularizer on the LSTM.
model_l2 = tf.keras.models.Sequential()
model_l2.add(
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_sequence_length)
)
model_l2.add(
    tf.keras.layers.LSTM(
        128,
        kernel_regularizer=tf.keras.regularizers.l2(l2_regularizer),
    )
)
model_l2.add(
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
)

# Same loss / optimizer / metric as the first model.
model_l2.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 8 epochs, holding out 20% of the training data for validation.
history_l2 = model_l2.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=8,
    batch_size=64,
)


In [None]:
# Draw one figure per metric recorded while fitting model_l2.
l2_curves = history_l2.history
for curve_key, curve_name in [('loss', 'Loss'), ('accuracy', 'Accuracy')]:
    training_values = l2_curves[curve_key]
    validation_values = l2_curves['val_' + curve_key]

    plt.plot(training_values, label='Training ' + curve_name)
    plt.plot(validation_values, label='Validation ' + curve_name)
    plt.xlabel('Epoch')
    plt.ylabel(curve_name)
    plt.title('Training and Validation ' + curve_name)
    plt.legend()
    plt.show()

In [None]:
# Per-class probabilities from the L2-regularized model.
y_pred_prob = model_l2.predict(X_test)

# Most probable class id per test sample.
y_pred = y_pred_prob.argmax(axis=1)

# Translate integer ids back to the original label values.
decode_labels = label_encoder.inverse_transform
y_test_original = decode_labels(y_test)
y_pred_original = decode_labels(y_pred)

# Per-class precision / recall / F1 summary.
report = classification_report(y_test_original, y_pred_original)
print(report)

# New Section