In [4]:
import pandas as pd
# Load at most `row_limit` rows of the complaints CSV. low_memory=False makes
# pandas parse the file in one pass rather than in internal chunks.
csv_path = r"D:\complaints.csv"
row_limit = 200000
df = pd.read_csv(csv_path, nrows=row_limit, low_memory=False)

# Sanity check: dimensions first, then the leading rows.
sample_preview = df.head()
print("Sample shape:", df.shape)
print(sample_preview)

Sample shape: (200000, 18)
  Date received                                            Product  \
0    2020-07-06  Credit reporting, credit repair services, or o...   
1    2025-09-24  Credit reporting or other personal consumer re...   
2    2019-12-26                        Credit card or prepaid card   
3    2020-05-08  Credit reporting, credit repair services, or o...   
4    2025-09-23  Credit reporting or other personal consumer re...   

                                  Sub-product  \
0                            Credit reporting   
1                            Credit reporting   
2  General-purpose credit card or charge card   
3                            Credit reporting   
4                            Credit reporting   

                                               Issue  \
0               Incorrect information on your report   
1               Incorrect information on your report   
2  Advertising and marketing, including promotion...   
3               Incorrect informa

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources if not already present.
# 'punkt_tab' is fetched together with 'punkt': this notebook downloads it in a
# separate cell that sits after the cleaning code, so a top-to-bottom run would
# otherwise reach `nltk.word_tokenize` before the resource is available
# (newer NLTK releases load the tokenizer data from 'punkt_tab').
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Translation table that deletes every character in string.punctuation; built
# once here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def clean_text(text):
    """Normalize a complaint narrative into a space-joined string of stems.

    Steps, in order: lowercase, remove URLs, delete punctuation, delete digit
    runs, tokenize with ``nltk.word_tokenize``, drop tokens present in
    ``stop_words``, stem the remainder with ``stemmer``.

    Args:
        text: Raw narrative. Any value that is not a ``str`` (for example the
            NaN pandas stores for a missing narrative) is treated as empty.

    Returns:
        The cleaned text; ``''`` for non-string input or when an unexpected
        error is reported while processing.

    Raises:
        LookupError: Re-raised instead of being swallowed. NLTK signals a
            missing tokenizer resource this way, and silently returning ``''``
            for every row would hide that setup problem.
    """
    # Handle missing or non-string entries
    if not isinstance(text, str):
        return ''
    try:
        # Lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Remove punctuation
        text = text.translate(_PUNCT_TABLE)
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        # Tokenize
        tokens = nltk.word_tokenize(text)
        # Remove stopwords, then stem what is left
        stemmed = [stemmer.stem(word) for word in tokens if word not in stop_words]
        # Join back to string
        return ' '.join(stemmed)
    except LookupError:
        # Missing NLTK data is an environment error, not a bad row: fail loudly.
        raise
    except Exception as e:
        print(f"Error processing text: {e}")
        return ''

# Apply cleaning to the complaint narrative column
narrative_col = 'Consumer complaint narrative'
if narrative_col in df.columns:
    df['clean_text'] = df[narrative_col].apply(clean_text)
    # Check results. This runs only when the column exists: selecting it in
    # the missing-column case would raise KeyError right after the message
    # printed below.
    print(df[[narrative_col, 'clean_text']].head())
else:
    print("Column 'Consumer complaint narrative' not found in DataFrame.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  Consumer complaint narrative clean_text
0                          NaN           
1                          NaN           
2                          NaN           
3   These are not my accounts.    account
4                          NaN           


In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')

# Check for required column
if 'Consumer complaint narrative' not in df.columns or 'Product' not in df.columns:
    raise ValueError("Required columns not found in DataFrame.")

# Map labels to numbers. This dict is the single source of truth for which
# products are modelled; the filter list is derived from it so the two cannot
# drift apart.
label_map = {
    'Credit reporting, credit repair services, or other personal consumer reports': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3
}
target_products = list(label_map)

# Keep only rows in a target category that have a narrative, then attach the
# numeric label in the same chain.
df = (
    df[df['Product'].isin(target_products)]
    .dropna(subset=['Consumer complaint narrative'])
    .assign(label=lambda frame: frame['Product'].map(label_map))
)
# Every remaining Product is a key of label_map, so no label can be missing.
assert df['label'].notna().all(), "unmapped Product values in label column"

# Text preprocessing
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Lowercase a narrative and strip URLs, punctuation, digits and stopwords.

    Non-string input yields ``''``. Tokens come from whitespace splitting, are
    filtered against the module-level ``stop_words`` set, and are re-joined
    with single spaces. Any exception raised while processing is reported with
    ``print`` and ``''`` is returned instead.

    Note: this definition rebinds the name ``clean_text``, replacing any
    earlier definition in the kernel. No stemming is applied here.
    """
    if not isinstance(text, str):
        return ''
    try:
        lowered = text.lower()
        without_urls = re.sub(r'https?://\S+|www\.\S+', '', lowered)
        punctuation_remover = str.maketrans('', '', string.punctuation)
        stripped = re.sub(r'\d+', '', without_urls.translate(punctuation_remover))
        kept = (token for token in stripped.split() if token not in stop_words)
        return ' '.join(kept)
    except Exception as e:
        print(f"Error processing text: {e}")
        return ''
df['clean_text'] = df['Consumer complaint narrative'].apply(clean_text)

# Feature extraction
# The TF-IDF matrix is kept sparse: train_test_split and LogisticRegression
# both accept scipy sparse input, so there is no need to materialise a dense
# (n_documents x 5000) array with .toarray().
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label'].values

# Train/test split (stratified so each label keeps its share in both parts)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.8879
              precision    recall  f1-score   support

           0       0.89      0.94      0.92      1887
           1       0.87      0.79      0.83       921
           2       0.33      0.04      0.07        25
           3       0.91      0.90      0.90       343

    accuracy                           0.89      3176
   macro avg       0.75      0.67      0.68      3176
weighted avg       0.88      0.89      0.88      3176



In [2]:
# NOTE(review): this cell shows execution count 2 but is placed after cells
# 4-6, so a top-to-bottom run only fetches 'punkt_tab' at the very end.
# Presumably `nltk.word_tokenize` in the cleaning cell depends on this
# resource — confirm, and run this download before that cell.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True