<h1 style='color:#00868b'>Read, balance and clean dataset<span class="tocSkip"></span></h1>

# Start

In [91]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Read dataset

In [92]:
df = pd.read_csv("corpus_sprint3_balanced_cleaned_all.csv", encoding="utf-8")

Set index to old index:

In [93]:
df = df.set_index('Row No')

In [94]:
df.shape

(126593, 18)

In [95]:
df.head()

Unnamed: 0_level_0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
Row No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9,06/15/19,"Payday loan, title loan, or personal loan",Installment loan,Problem with the payoff process at the end of ...,,they would not let me pay my loan off days bef...,,"Big Picture Loans, LLC",IN,477XX,,Consent provided,Web,06/15/19,Closed with explanation,Yes,,3276316
12,07/25/19,"Payday loan, title loan, or personal loan",Installment loan,Charged fees or interest you didn't expect,,service finance are liars and are charging me ...,,"Service Finance Holdings, LLC",TX,,,Consent provided,Web,07/25/19,Closed with non-monetary relief,Yes,,3318533
38,06/23/19,Vehicle loan or lease,Loan,Problems at the end of the loan or lease,Problem with paying off the loan,on i signed a car loan agreement to finance my...,,"HUNTINGTON NATIONAL BANK, THE",TX,750XX,,Consent provided,Web,06/23/19,Closed with non-monetary relief,Yes,,3284279
44,08/13/19,"Money transfer, virtual currency, or money ser...",Debt settlement,Fraud or scam,,we hired and debt collection to handle collect...,,ALLIED NATIONAL INC,NY,117XX,,Consent provided,Web,08/13/19,Untimely response,No,,3339246
52,07/31/19,"Payday loan, title loan, or personal loan",Payday loan,Problem with the payoff process at the end of ...,,i borrowed in an financial emergency from offi...,Company believes it acted appropriately as aut...,"Harpeth Financial Services, LLC",TN,,Servicemember,Consent provided,Web,08/02/19,Closed with explanation,Yes,,3324772


Example:

In [96]:
df["Consumer complaint narrative"][52]

'i borrowed in an financial emergency from office in tn on then on at in tn i was told my payoff is so i paid however i have been charged much more than on i paid off the loan in person and my checking account was also double billed charged another in tn today i went into the office in tn and asked for the reimbursement of my overbilled monies and the clerk named refused to refund my over double billed charges! threatened to have me arrested if i did not let the office keep the monies my loan of was just for days i was forced to pay a total of i was overbilled double billed '

## Additional preprocessing

### Remove stop words and stem words

Stemming reduces the variation in text data by converting words to their word stem. Applying stemming allows for LDA to focus much more finely on the base form of a word, rather than focusing on the differences in the various variations of a word. The code below is analagous to sprint 1's code in [<code>lemmatization.py</code>](lemmatization.py).

Stop words include: 
* common English words;
* state names.

In [97]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# use nltk.download('stopwords') to download the list of stop words if this is your first time using nltk
nltk.download('stopwords')

stop_words = nltk.corpus.stopwords.words('english')
my_additional_stop_words = []

# US states, capitalised and lower
states_abbr = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

states_abbr = [item.lower() for item in states_abbr]
states = [item.lower() for item in states]
states.extend(states_abbr)

# add to list of additional stop words
my_additional_stop_words.extend(states)
print("Length of extra list: ", len(my_additional_stop_words))

# add other stop words
my_additional_stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])
print("Length of extra list: ", len(my_additional_stop_words))

# add nltk's stop words
my_stop_words = stop_words + my_additional_stop_words
print("Length of my_stop_words list: ", len(my_stop_words))

Length of extra list:  101
Length of extra list:  143
Length of my_stop_words list:  322


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords

ps = PorterStemmer()

i = 0

for ind, complaint in df["Consumer complaint narrative"].items():
    i = i+1
    # Tokenize the complaint and remove stop words
    words = [word for word in complaint.split(' ')
                            if word not in my_stop_words] 
    new_words = []
    # Stem the words in the complaint
    for word in words:
        new_words.append(ps.stem(word))
    df["Consumer complaint narrative"][ind] = new_words
    if (i % 1000) == 0:
        print(i)

# to csv for later use
df.to_csv("corpus_cleaned_and_stemmed_for_LDA.csv", index=False)

df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000


(126593, 18)

Example:

In [109]:
df['Consumer complaint narrative'][52]

['borrow',
 'financ',
 'emerg',
 'offic',
 'told',
 'payoff',
 'paid',
 'howev',
 'charg',
 'much',
 'paid',
 'loan',
 'person',
 'check',
 'account',
 'doubl',
 'bill',
 'charg',
 'anoth',
 'today',
 'went',
 'offic',
 'ask',
 'reimbur',
 'overbil',
 'moni',
 'clerk',
 'name',
 'refu',
 'refund',
 'doubl',
 'bill',
 'charges!',
 'threaten',
 'arrest',
 'let',
 'offic',
 'keep',
 'moni',
 'loan',
 'day',
 'forc',
 'pay',
 'total',
 'overbil',
 'doubl',
 'bill',
 '']

In [110]:
df['Consumer complaint narrative']

Row No
9         [let, pay, loan, day, next, payment, plu, subs...
12        [servic, financ, liar, charg, interest, hvac, ...
38        [sign, car, loan, agreement, financ, car, loan...
44        [hire, debt, collect, handl, collect, effort, ...
52        [borrow, financ, emerg, offic, told, payoff, p...
                                ...                        
311078    [bank, sever, year, bank, trust, origin, husba...
368588    [file, fraud, alert, bank, week, ago, call, ev...
256815    [previou, trip, bank, america, atm, card, cash...
193394    [account, td, bank, check, save, start, work, ...
257330    [hello, receiv, promot, offer, code, citibank,...
Name: Consumer complaint narrative, Length: 126593, dtype: object

We have now turned every complaint into an array of its words.

### Tokenzation & Document Term Matrix

The two main inputs to our LDA topic model are the Dictionary (id2word) and the corpus. Let's create them. [source](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)

We remove rare words and common words based on their document frequency. Below we remove words that appear in less than 20 documents or in more than 50% of the documents.

In [113]:
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
id2word = Dictionary(df['Consumer complaint narrative'])

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
id2word.filter_extremes(no_below=20, no_above=0.5)

Finally, we transform the documents to a vectorized form. We simply compute the frequency of each word.

In [117]:
# Bag-of-words representation of the documents.
corpus = [id2word.doc2bow(doc) for doc in df['Consumer complaint narrative']]

Let’s see how many tokens and documents we have to train on.

In [119]:
print('Number of unique tokens: %d' % len(id2word))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 6378
Number of documents: 126593


Each of the 126593 complaints is now represented as a 6378-dimensional vector, which means our vocabulary has 6378 words. The frequencies stated above (20 and 50% can be finetuned).

We are now ready to apply LDA.

## LDA

Parameters ([source](https://radimrehurek.com/gensim/models/ldamulticore.html)):
* α: Topic smoothing parameter; can be set to an 1D array of length equal to the number of expected topics that expresses our a-priori belief for the each topics’ probability
* eta: Word/term smoothing parameter; a scalar for a symmetric prior over topic/word probability

Most topic modeling analyses in the literature ([Blei et al, 2003](https://www.researchgate.net/publication/326505884_Latent_Dirichlet_Allocation_LDA_for_Topic_Modeling_of_the_CFPB_Consumer_Complaints); [Blei and Lafferty, 2009](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.463.1205&rep=rep1&type=pdf#page=96); [Kaplan and Vakili, 2015](https://onlinelibrary.wiley.com/doi/abs/10.1002/smj.2294); [Blei, 2012](https://dl.acm.org/doi/pdf/10.1145/2133806.2133826)) suggest a value of 0.1 for both of these hyperparameters. This results in semantically meaningful topics. However, these values can also be set to 'auto', meaning we would automatically learn these two parameters.

* number of topics: The number of topics LDA has to attempt to identify
* iterations: Maximum number of iterations through the corpus when inferring the topic distribution of a corpus
* passes: Number of passes through the corpus during training

According to Rehurek, 2019 [source](https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html):

"First of all, the elephant in the room: how many topics do I need? There is really no easy answer for this, it will depend on both your data and your application. I have used 10 topics here because I wanted to have a few topics that I could interpret and “label”, and because that turned out to give me reasonably good results. You might not need to interpret all your topics, so you could use a large number of topics, for example 100.

<code>chunksize</code> controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory. Chunksize can however influence the quality of the model, as discussed in Hoffman and co-authors [source](https://papers.nips.cc/paper/3902-online-learning-for-latent-dirichlet-allocation.pdf).

<code>passes</code> controls how often we train the model on the entire corpus. Another word for passes might be “epochs”. <code>iterations</code> is somewhat technical, but essentially it controls how often we repeat a particular loop over each document.

We used **parallelized Latent Dirichlet Allocation** which uses multiprocessing to speed up learning ([source](https://radimrehurek.com/gensim/models/ldamulticore.html)).

In [None]:
from gensim import corpora, models
from sklearn import metrics
from gensim.test.utils import common_corpus, common_dictionary

# Training parameters
alpha = 0.1
eta = 0.1
#alpha = 'auto'
#eta = 'auto'
num_topics = [5,10,15,20] # sizes of the topics
chunksize = 2000
iterations = 400
passes = 20
eval_every = None  # Don't evaluate model perplexity, takes too much time.

for topics in num_topics:
    print("Start with number of topics:", topics)
    lda_model = models.LdaMulticore(
                    corpus = corpus,
                    id2word=common_dictionary,
                    num_topics = topics, 
                    passes = passes,
                    iterations = iterations,
                    alpha = alpha, 
                    eta = eta,
                    random_state = 42
                   )
    # Compute Perplexity
    print("Perplexity: ", lda_model.log_perplexity(df_complaints))  # a measure of how good the model is. lower = better.
    # save the trained model
    print("Saving the model...")
    lda_model.save("/runs_Sprint3/LDA/lda_model_" + str(topics) + "topics")

Start with number of topics: 5


Load models:

In [None]:
# load the saved model
lda_model5 = LdaModel.load("/runs_Sprint3/LDA/lda_model_" + 5 +"topics")
lda_model10 = LdaModel.load("/runs_Sprint3/LDA/lda_model_" + 10 +"topics")
lda_model15 = LdaModel.load("/runs_Sprint3/LDA/lda_model_" + 15 +"topics")
lda_model20 = LdaModel.load("/runs_Sprint3/LDA/lda_model_" + 20 +"topics")

Show topics?

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
