In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize

In [None]:
import csv

texts = []
labels_ = []

# Read the complaint texts (first column) and their labels (second column).
# newline="" hands line-ending handling to the csv module; the csv docs ask for
# this so that newlines embedded inside quoted fields are parsed correctly.
with open("/content/complaints.csv", "r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    # Skip the header line; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    for row in csv_reader:
        # csv.reader yields [] for blank lines; row[0] would raise IndexError.
        if not row:
            continue
        # The reader has already removed CSV quoting from each field.
        text = row[0]
        # A row without a second column keeps a None label so that texts and
        # labels_ stay the same length and index-aligned.
        label = row[1] if len(row) > 1 else None
        texts.append(text)
        labels_.append(label)

In [None]:
# word_tokenize needs the Punkt tokenizer data. NLTK 3.8.2 and later load it
# from the "punkt_tab" package, while older releases use "punkt"; fetch both so
# the tokenisation cell works regardless of the installed NLTK version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:

# Lower-case every document, then split it into word tokens.
# Note: this rebinds `texts` from a list of strings to a list of token lists.
texts = list(map(word_tokenize, map(str.lower, texts)))

# Fit the encoder on the raw labels and obtain their integer codes in one step
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_)


In [None]:
# Build the vocabulary from the tokenised corpus and map each document to integer ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad every sequence to the length of the longest one in the corpus
max_sequence_length = max(map(len, sequences))
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sizes derived from the fitted tokenizer and label encoder
embedding_dim = 100
# +1 on top of the number of known words -- presumably to leave room for the
# padding index 0 (confirm against the Tokenizer's indexing).
vocab_size = len(tokenizer.word_index) + 1
num_classes = len(label_encoder.classes_)

# Assemble the CNN one layer at a time:
# embedding -> 1-D convolution -> global max pool -> dense -> softmax over classes
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
model.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs, setting aside 20% of the training data for validation
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the held-out test split; the result unpacks into the loss and the accuracy metric
(loss, accuracy) = model.evaluate(x=X_test, y=y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.5627123713493347, Test Accuracy: 0.8520379662513733


In [None]:
# Per-class scores for each test sample; the highest-scoring column is the predicted class id
y_pred_prob = model.predict(X_test)
y_pred = y_pred_prob.argmax(axis=1)

# Decode the integer class ids (true and predicted) back to the original label strings
y_test_original, y_pred_original = (
    label_encoder.inverse_transform(encoded) for encoded in (y_test, y_pred)
)

# Per-class precision / recall / F1 summary
report = classification_report(y_true=y_test_original, y_pred=y_pred_original)
print(report)

                                      precision    recall  f1-score   support

   Attempts to collect debt not owed       0.68      0.73      0.71       379
               Communication tactics       0.60      0.59      0.60        61
                       Fraud or scam       0.73      0.86      0.79        43
Incorrect information on your report       0.93      0.90      0.91      1230
          Struggling to pay mortgage       0.86      0.83      0.84        78

                            accuracy                           0.85      1791
                           macro avg       0.76      0.78      0.77      1791
                        weighted avg       0.86      0.85      0.85      1791



In [None]:
import matplotlib.pyplot as plt
conv_layers = [layer for layer in model.layers if isinstance(layer, tf.keras.layers.Conv1D)]

# Draw the learned kernel weights of every Conv1D layer, one small panel per filter.
n_cols = 8
for i, layer in enumerate(conv_layers):
    # The first weight array is taken as the kernel; its last axis is indexed per
    # filter below (assumed layout: kernel_size x input_channels x filters).
    filters = layer.get_weights()[0]
    num_filters = filters.shape[2]
    # The subplot grid needs integer dimensions (true division yields a float).
    # Round up so a filter count that is not a multiple of n_cols still fits.
    n_rows = -(-num_filters // n_cols)

    fig = plt.figure(figsize=(10, 10))
    for j in range(num_filters):
        ax = fig.add_subplot(n_rows, n_cols, j + 1)
        # 2-D slice for filter j; plot() draws one line per column of the slice
        ax.plot(filters[:, :, j])
        ax.axis('off')
    fig.suptitle(f'Conv1D Layer {i+1}')
    plt.show()