<h1 style='color:#00868b'>Read, balance and clean dataset<span class="tocSkip"></span></h1>

# Start

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Read dataset

In [2]:
# Load the raw complaints export into a DataFrame, decoding the file as UTF-8.
df = pd.read_csv(
    "complaints-2020-01-22_08_24.csv",
    encoding="utf-8",
)

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,07/23/19,"Credit reporting, credit repair services, or o...",Credit reporting,Credit monitoring or identity theft protection...,Problem canceling credit monitoring or identif...,I have complained many times that the credit r...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,CA,926XX,,Consent provided,Web,07/23/19,Closed with explanation,Yes,,3315279
1,07/26/19,Debt collection,I do not know,False statements or representation,Attempted to collect wrong amount,please review the current fraud account and al...,Company believes it acted appropriately as aut...,"Ideal Collection Services, Inc.",FL,333XX,,Consent provided,Web,07/26/19,Closed with explanation,Yes,,3319487
2,06/03/19,Debt collection,I do not know,Attempts to collect debt not owed,Debt was paid,Called multiple times over the years for a deb...,,"ONEMAIN FINANCIAL HOLDINGS, LLC.",FL,327XX,,Consent provided,Web,06/07/19,Closed with explanation,Yes,,3262794
3,07/03/19,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft,I sent in a letter to the company to have them...,,"Diversified Consultants, Inc.",VA,232XX,,Consent provided,Web,07/03/19,Closed with explanation,Yes,,3295208
4,07/14/19,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Received unsolicited financial product or insu...,On XX/XX/19 I applied for a Debt Relief Produc...,,"ClearOne Advantage, LLC",PA,191XX,"Older American, Servicemember",Consent provided,Web,07/18/19,Closed with explanation,Yes,,3306130


## Select columns to keep

### Consumer complaint narrative + Product (cp)

In [4]:
# Keep only the free-text narrative column; the result is a pandas Series.
df_complaints = df.loc[:, "Consumer complaint narrative"]

In [5]:
# Show the first five narratives as a sanity check on the selection.
df_complaints.head(n=5)

0    I have complained many times that the credit r...
1    please review the current fraud account and al...
2    Called multiple times over the years for a deb...
3    I sent in a letter to the company to have them...
4    On XX/XX/19 I applied for a Debt Relief Produc...
Name: Consumer complaint narrative, dtype: object

In [6]:
# Number of narratives selected; a one-element tuple because only a single
# column was kept.
df_complaints.shape

(485701,)

## Data preprocessing

### Clean

In [11]:
import re
import string

def clean_document(complaint):
    """Normalise one complaint narrative for downstream text processing.

    The text is lowercased, then URLs, dates (plain or censored such as
    "xx/xx/2019"), censored runs of two or more "x" characters, and a fixed
    set of punctuation characters are each replaced by a space. Finally all
    whitespace (including newlines and tabs) is collapsed to single spaces
    and the result is trimmed.

    Parameters
    ----------
    complaint : str
        Raw narrative text. Any non-string value (e.g. NaN or None for a
        missing narrative) is treated as an empty document.

    Returns
    -------
    str
        The cleaned narrative; an empty string when nothing is left.
    """
    if not isinstance(complaint, str):
        # Missing narratives show up as float NaN / None, which have no
        # .lower(); return an empty document instead of raising.
        return ""

    # turn text to lowercase (every pattern below relies on this)
    text = complaint.lower()

    # remove URLs, tolerating stray spaces around "://"
    text = re.sub(r"https? ?: ?// ?(?:[-\w.]|%[\da-f]{2})+", " ", text)

    # remove normal dates and censored dates; this has to run before the
    # censored-word pass, otherwise "xx/xx/2019" would lose its x's and no
    # longer look like a date
    text = re.sub(r"[\dx]{1,2}/[\dx]{1,2}/[\dx]{2,4}", " ", text)

    # remove censored words: runs of two or more x's that are not embedded in
    # another word, so "xxxx" and "xxxx1234" are stripped but "exxon" is kept
    text = re.sub(r"(?<![a-z])x{2,}(?![a-z])", " ", text)

    # remove special and non-sensical characters; the hyphen is placed first
    # in the class so it is a literal rather than the middle of a range
    text = re.sub(r"[-.,#'():$;?%}{*]", " ", text)

    # normalise every whitespace run (spaces, newlines, tabs) to one space;
    # done last so the substitutions above cannot leave double spaces, and
    # so line breaks separate words instead of gluing them together
    return re.sub(r"\s+", " ", text).strip()


In [12]:
# df_complaints is a Series (one column selected from df), so the cleaner is
# applied to it directly; indexing a Series by the column name raises KeyError.
# Rows with no narrative are dropped first because clean_document expects text.
df_complaints = df_complaints.dropna().apply(clean_document)

### Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# word_tokenize works on one string at a time, so it is mapped over the
# narratives instead of being handed the whole pandas object at once.
# NOTE(review): NLTK's tokenizer data (e.g. "punkt") presumably has to be
# downloaded beforehand for this call to succeed — confirm for this environment.
df_tokens = df_complaints.dropna().apply(word_tokenize)
df_tokens.head()

### Remove stop words

The stop words to remove include:
* common English words;
* company names, which we can obtain from the company column, although this may not cover all companies mentioned in the consumer complaint narrative;
* combinations of two or more x letters to hide personal information (these were already removed in the data cleaning step);
* state names.

### Stemming

This reduces the variation in the text data by converting each word to its stem. With stemming applied, LDA treats the inflected forms of a word (e.g. "charge", "charged", "charging") as a single term instead of spreading their weight across several separate tokens.

### Document Term Matrix

### Dimensionality reduction

### Export to csv

In [13]:
# Write the complaint corpus to disk for later use; the row index is left
# out of the file.
df_complaints.to_csv(
    "corpus_sprint2_LDA.csv",
    index=False,
)