# Classical ML For Complaints 

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Step 1: Data Loading and Exploration
df = pd.read_csv('complaints.csv')  # Replace 'complaints.csv' with your dataset file name
print(df.head())
print(df.info())

# Step 2: Handle Missing Values
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].fillna('')

# Step 3: Feature Extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Consumer complaint narrative']) 
feature_names = vectorizer.get_feature_names()

# Step 4: Model Training - Classical ML Model (Logistic Regression)
X_train, X_test, y_train, y_test = train_test_split(X, df['Product'], test_size=0.2, random_state=42)

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Step 5: Model Evaluation - Classical ML Model (Logistic Regression)
lr_pred = lr_model.predict(X_test)

# Evaluation Metrics
from sklearn.metrics import classification_report

print("Logistic Regression:")
print(classification_report(y_test, lr_pred))

# Step 6: Prediction Function
def classify_complaint_classical(complaint):
    complaint_vector = vectorizer.transform([complaint])
    product = lr_model.predict(complaint_vector)[0]
    return product


  Date received                                            Product  \
0    2023-05-21  Credit reporting, credit repair services, or o...   
1    2023-05-21  Credit reporting, credit repair services, or o...   
2    2023-05-20  Credit reporting, credit repair services, or o...   
3    2023-05-21  Credit reporting, credit repair services, or o...   
4    2023-05-21  Credit reporting, credit repair services, or o...   

        Sub-product                                              Issue  \
0  Credit reporting               Incorrect information on your report   
1  Credit reporting               Incorrect information on your report   
2  Credit reporting               Incorrect information on your report   
3  Credit reporting  Problem with a credit reporting company's inve...   
4  Credit reporting                        Improper use of your report   

                                           Sub-issue  \
0                           Account status incorrect   
1                     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.59      0.04      0.08     17294
                                                 Checking or savings account       0.69      0.29      0.41     33433
                                                               Consumer Loan       0.34      0.02      0.04      6283
                                                                 Credit card       0.54      0.03      0.07     17788
                                                 Credit card or prepaid card       0.69      0.31      0.43     37664
                                                            Credit reporting       0.49      0.02      0.04     28268
Credit reporting, credit repair services, or other personal consumer reports       0.59      0.98      0.74    376837
                                                       

  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
import joblib

# Save the trained model
# joblib.dump(lr_model, 'lr_model.joblib')
joblib.dump(vectorizer, 'vectorizer.joblib')



['vectorizer.joblib']

In [15]:
complaint = "I am having incorrect information on my credit report. Please help me resolve this issue."
predicted_product = classify_complaint_classical(complaint)
print("Predicted Product:", predicted_product)


Predicted Product: Credit reporting, credit repair services, or other personal consumer reports


# Deep Learning LSTM For Complaints


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten

# Load and preprocess the data
df = pd.read_csv('complaints.csv')  # Replace with your actual file path
df = df.dropna(subset=['Consumer complaint narrative', 'Product'])  # Drop rows with NaN values

# Transform the product names into integers
encoder = LabelEncoder()
df['Product_int'] = encoder.fit_transform(df['Product'])

# Split the data into train and test sets
train_text, test_text, train_labels, test_labels = train_test_split(df['Consumer complaint narrative'], df['Product_int'], test_size=0.2, random_state=42)

# Preprocessing
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_text)
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)

max_sequence_length = 100
train_data = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_data = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Define and train the ANN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))  
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(len(df['Product'].unique()), activation='softmax'))  # Adjust the number of classes

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_data, train_labels, validation_data=(test_data, test_labels), epochs=2, batch_size=128)

# Evaluate the model
loss, accuracy = model.evaluate(test_data, test_labels)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [29]:
# Evaluate the model
loss, accuracy = model.evaluate(test_data, test_labels)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.6497840881347656
Test Accuracy: 0.7968804240226746


In [30]:
def predict_complaint(complaint):
    # Preprocessing
    sequences = tokenizer.texts_to_sequences([complaint])
    data = pad_sequences(sequences, maxlen=max_sequence_length)

    # Prediction
    predictions = model.predict(data)
    predicted_class = encoder.inverse_transform([np.argmax(predictions)])

    return predicted_class[0]

# Test the function with a new complaint
new_complaint = "I have been charged twice for the same transaction in my credit card."
print(predict_complaint(new_complaint))


Credit card or prepaid card


# SOTA Model (State of The Art Model)

In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load and preprocess the data
df = pd.read_csv('complaints.csv')  
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].fillna('')  # Replace NaN values with empty strings

# Split the data into train and test sets
train_text, test_text, train_labels, test_labels = train_test_split(df['Consumer complaint narrative'], df['Product'], test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the data
train_encodings = tokenizer.batch_encode_plus(train_text.tolist(), truncation=True, padding=True)
test_encodings = tokenizer.batch_encode_plus(test_text.tolist(), truncation=True, padding=True)

# Prepare the data as TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels.tolist()
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels.tolist()
))

# Create the BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['Product'].unique()))  # Adjust the number of labels as needed

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model
model.fit(train_dataset.shuffle(1000).batch(16), epochs=1, batch_size=8)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_dataset.batch(16), verbose=1)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)


KeyboardInterrupt: 