Task 6

Perform a Text Classification on consumer complaint dataset and classify into them as Credit reporting, repair, or
other, Debt collection, Consumer Loan, Mortgage

In [1]:
# importing all the necessary libraries

import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


import requests
import pandas as pd
from io import BytesIO
import zipfile

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report, accuracy_score


In [2]:
zip_file_url = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"

try:

    response = requests.get(zip_file_url)
    response.raise_for_status()


    with zipfile.ZipFile(BytesIO(response.content)) as zip_file:

        csv_file_name = zip_file.namelist()[0]
        with zip_file.open(csv_file_name) as csv_file:

            df = pd.read_csv(csv_file)


    print(df.head())

except requests.exceptions.RequestException as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


  Date received                                            Product  \
0    2023-09-04  Money transfer, virtual currency, or money ser...   
1    2023-09-06                                    Debt collection   
2    2023-09-06                        Checking or savings account   
3    2023-09-07                                    Debt collection   
4    2023-09-13  Credit reporting or other personal consumer re...   

                        Sub-product  \
0          Mobile or digital wallet   
1                        Other debt   
2  Other banking product or service   
3                  Credit card debt   
4                  Credit reporting   

                                               Issue  \
0  Trouble accessing funds in your mobile or digi...   
1                  Attempts to collect debt not owed   
2                                Managing an account   
3                              Communication tactics   
4               Incorrect information on your report   

       

In [3]:
# performing EDA on the dataset

print(df.head())
print(df['Product'].value_counts())

  Date received                                            Product  \
0    2023-09-04  Money transfer, virtual currency, or money ser...   
1    2023-09-06                                    Debt collection   
2    2023-09-06                        Checking or savings account   
3    2023-09-07                                    Debt collection   
4    2023-09-13  Credit reporting or other personal consumer re...   

                        Sub-product  \
0          Mobile or digital wallet   
1                        Other debt   
2  Other banking product or service   
3                  Credit card debt   
4                  Credit reporting   

                                               Issue  \
0  Trouble accessing funds in your mobile or digi...   
1                  Attempts to collect debt not owed   
2                                Managing an account   
3                              Communication tactics   
4               Incorrect information on your report   

       

In [None]:
# to check the names of the columns present in the csv file


print(df.columns)
if 'category' in df.columns:
    category_column = df['category']
else:
    print("The 'category' column does not exist in the DataFrame.")

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')
The 'category' column does not exist in the DataFrame.


In [4]:
X = df['Consumer complaint narrative']
y = df['Product']


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:

X_train.fillna('', inplace=True)
X_test.fillna('', inplace=True)




tfidf_vectorizer = TfidfVectorizer(max_features=5000)


X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)


X_test_tfidf = tfidf_vectorizer.transform(X_test)



In [7]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=500)

classifier.fit(X_train_tfidf, y_train_encoded)




y_pred = classifier.predict(X_test_tfidf)


accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6438207552403413
Classification Report:
                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.62      0.05      0.09     17460
                                                 Checking or savings account       0.72      0.34      0.46     36252
                                                               Consumer Loan       0.48      0.04      0.08      6337
                                                                 Credit card       0.59      0.05      0.10     18100
                                                 Credit card or prepaid card       0.71      0.38      0.50     41401
                                                            Credit reporting       0.62      0.04      0.07     28330
                         Credit reporting or other personal consumer reports       0.00      0.00      0.00      6974
Cre

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
new_complaint = ["I have an issue with my credit report."]
new_complaint_tfidf = tfidf_vectorizer.transform(new_complaint)
predicted_category = classifier.predict(new_complaint_tfidf)
predicted_category_label = label_encoder.inverse_transform(predicted_category)
print(f'Predicted Category: {predicted_category_label[0]}')

Predicted Category: Credit reporting, credit repair services, or other personal consumer reports


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Create an SVM classifier
svm_classifier = SVC()

# Fit the SVM classifier to the training data
svm_classifier.fit(X_train_tfidf, y_train_encoded)

# Make predictions on the test data
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Calculate accuracy and generate a classification report
accuracy_svm = accuracy_score(y_test_encoded, y_pred_svm)
report_svm = classification_report(y_test_encoded, y_pred_svm, target_names=label_encoder.classes_)

print("SVM Classifier:")
print(f'Accuracy: {accuracy_svm}')
print('Classification Report:')
print(report_svm)


In [None]:
new_complaint = ["I am trying to dispute an item on my credit report, but the credit bureau is not responding."]
new_complaint_tfidf = tfidf_vectorizer.transform(new_complaint)
predicted_category = classifier.predict(new_complaint_tfidf)
predicted_category_label = label_encoder.inverse_transform(predicted_category)
print(f'Predicted Category: {predicted_category_label[0]}')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100)

# Fit the Random Forest classifier to the training data
rf_classifier.fit(X_train_tfidf, y_train_encoded)

# Make predictions on the test data
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Calculate accuracy and generate a classification report
accuracy_rf = accuracy_score(y_test_encoded, y_pred_rf)
report_rf = classification_report(y_test_encoded, y_pred_rf, target_names=label_encoder.classes_)

print("Random Forest Classifier:")
print(f'Accuracy: {accuracy_rf}')
print('Classification Report:')
print(report_rf)


In [None]:
new_complaint = ["I have an issue with my credit report."]
new_complaint_tfidf = tfidf_vectorizer.transform(new_complaint)
predicted_category = classifier.predict(new_complaint_tfidf)
predicted_category_label = label_encoder.inverse_transform(predicted_category)
print(f'Predicted Category: {predicted_category_label[0]}')