## Libraries

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# !pip install tensorflow

In [23]:
import pandas as pd
import numpy as np
from plotly.offline import iplot
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize

In [24]:
# STOPWORDS = set(stopwords.words('english'))

In [25]:
df = pd.read_csv("/content/drive/MyDrive/complaints_processed.csv")

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [43]:
df1 = df[0:80000]

In [44]:
df1.dtypes

Unnamed: 0     int64
product       object
narrative     object
dtype: object

In [45]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [46]:
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
data = df1[['narrative','product']]
data = data.dropna()
data.head()

Unnamed: 0,narrative,product
0,purchase order day shipping amount receive pro...,credit_card
1,forwarded message date tue subject please inve...,credit_card
2,forwarded message cc sent friday pdt subject f...,retail_banking
3,payment history missing credit report speciali...,credit_reporting
4,payment history missing credit report made mis...,credit_reporting


In [48]:
data['product'].value_counts()

credit_reporting       46073
debt_collection        10870
mortgages_and_loans     8876
credit_card             8021
retail_banking          6156
Name: product, dtype: int64

In [49]:
def example_complain(index):
    example = data[data.index == index][['narrative', 'product']].values[0]
    if len(example) > 0:
        print(example[0])
        print()
        print('Category:', example[1])

In [50]:
example_complain(10)

beginning mortgage held mb financial mb mortgage portfolio purchased bank first quarter mb financial customer began fall behind mortgage payment due unemployment divorce house went foreclosure later sheriff sale home scheduled morning submitted hardship application end documentation specific date letter dated acknowledges application discussed next step never received written verbal confirmation regarding status hardship application day scheduled sheriff sale communication way sale never provided clear determination application made payment brought account current able taking loan group friend would lost home sheriff auction day sale received express mail letter dated date scheduled sheriff sale saying hardship application approved detail trial payment plan received letter timeframe required promised would take full loan friend would made required payment delay review option held numerous call since large payment told trial plan leading loan modification place payment plan fact made br

## **Preprocessing**

In [51]:
data = data.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# STOPWORDS = set(stopwords('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in stopwords) # remove stopwors from text
    return text


data['narrative'] = data['narrative'].apply(clean_text)

In [52]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 600
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['narrative'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 31041 unique tokens.


In [53]:
X = tokenizer.texts_to_sequences(data['narrative'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (79996, 600)


In [54]:
Y = pd.get_dummies(data['product']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (79996, 5)


In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 64)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(71996, 600) (71996, 5)
(8000, 600) (8000, 5)


In [56]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(71996, 600)
(8000, 600)
(71996, 5)
(8000, 5)


# The Model

In [57]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(150, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 600, 100)          5000000   
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 600, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 150)               150600    
                                                                 
 dense_2 (Dense)             (None, 5)                 755       
                                                                 
Total params: 5,151,355
Trainable params: 5,151,355
Non-trainable params: 0
_________________________________________________________________
None


In [59]:
history = model.fit(X_train, Y_train, epochs=5, batch_size=64,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
