# Capstone 3 - Severity Labels Ground Truth

- Manually creating severity labels on a random sample of complaints to be able to apply supervised model to full dataset

In [179]:
import pandas as pd
import numpy as np
import time as time
import os
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from textblob import TextBlob


In [177]:
save_path = "/Users/joeboardman/Documents/Springboard/GitHub/CapstoneThree/"

In [36]:
large = 'https://files.consumerfinance.gov/ccdb/complaints.csv.zip' # 1.8m rows
banks = ['CITIBANK, N.A.', 
         'JPMORGAN CHASE & CO.', 
         'WELLS FARGO & COMPANY', 
         'BANK OF AMERICA, NATIONAL ASSOCIATION']

#read in larget dataset - 1.8m rows - 19000 from chase
df_gt = pd.read_csv(large)
# only include values from top 4 banks in list banks (exclude credit bureaus and smaller institutions)
df_gt = df_gt[df_gt.Company.isin(banks)]
df_gt.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
29,2020-10-01,Checking or savings account,Checking account,Managing an account,Problem accessing account,,,JPMORGAN CHASE & CO.,NY,12549,,,Referral,2020-10-02,Closed with explanation,Yes,,3879354
56,2020-10-08,Checking or savings account,Checking account,Managing an account,Problem using a debit or ATM card,,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,TX,77071,,,Phone,2020-10-08,Closed with explanation,Yes,,3888551
89,2019-02-16,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,On XX/XX/XXXX I received a letter from the IRS...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,956XX,,Consent provided,Web,2019-02-16,Closed with non-monetary relief,Yes,,3154014
114,2019-03-04,Mortgage,VA mortgage,Struggling to pay mortgage,,I am filing this complaint after receiving a l...,,JPMORGAN CHASE & CO.,GA,305XX,Servicemember,Consent provided,Web,2019-03-04,Closed with explanation,Yes,,3169014
118,2019-03-23,Credit card or prepaid card,General-purpose credit card or charge card,"Other features, terms, or problems",Credit card company forcing arbitration,,Company has responded to the consumer and the ...,"CITIBANK, N.A.",FL,334XX,,Consent not provided,Web,2019-03-23,Closed with explanation,Yes,,3188950


In [37]:
# drop rows without a written complaint
df_gt = df_gt.dropna(subset=['Consumer complaint narrative']).reset_index(drop=True)
df_gt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76868 entries, 0 to 76867
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Date received                 76868 non-null  object
 1   Product                       76868 non-null  object
 2   Sub-product                   68584 non-null  object
 3   Issue                         76868 non-null  object
 4   Sub-issue                     41572 non-null  object
 5   Consumer complaint narrative  76868 non-null  object
 6   Company public response       56525 non-null  object
 7   Company                       76868 non-null  object
 8   State                         76297 non-null  object
 9   ZIP code                      59702 non-null  object
 10  Tags                          14421 non-null  object
 11  Consumer consent provided?    76868 non-null  object
 12  Submitted via                 76868 non-null  object
 13  Date sent to com

In [38]:
# drop needless columns
df_gt.drop(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Company public response', 'Company',
            'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 
            'Company response to consumer', 'Timely response?', 'Consumer disputed?'], 
            axis=1, inplace=True)

In [39]:
# rename columns to remove spaces
df_gt = df_gt.rename(columns={"Consumer complaint narrative": "complaint_text", 
                              "Complaint ID":"complaint_ID"})

In [40]:
# create column for our target labels
df_gt['severe'] = 0
df_gt.head()

Unnamed: 0,complaint_text,complaint_ID,severe
0,On XX/XX/XXXX I received a letter from the IRS...,3154014,0
1,I am filing this complaint after receiving a l...,3169014,0
2,"On XX/XX/XXXX, Bank of America mailed out a ca...",3121663,0
3,I made an appointment at XXXX for myself and a...,3128523,0
4,I was using my chase credit card like a regula...,3288810,0


In [41]:
# take random sample of dataframe to create ground truth labels
df_gt = df_gt.sample(n=100, random_state=34)
df_gt.head()

Unnamed: 0,complaint_text,complaint_ID,severe
70509,on XX/XX/2018 i applied for a credit through w...,2975717,0
14855,I had previously submitted the following compl...,3493387,0
17961,I have a wells fargo account. XXXX XXXX kept ...,3583070,0
18625,The Home Depot/Citibank North America says tha...,3775391,0
50357,"Hello, my name is XXXX XXXX and I am contactin...",1787885,0


In [42]:
# words to remove from stop words
stop_remove = ['no', 'not', "don't"]
stop_add = ['xx', 'xxxx', '00']

def add_or_remove_stop_words(remove_list, add_list):
    '''adding or removing multiple stop words instead of repetitive code'''
    
    global STOP_WORDS
    STOP_WORDS = stopwords.words('english')
    for i in range(len(remove_list)):
        STOP_WORDS.remove(remove_list[i])
    
    for i in range(len(add_list)):
        STOP_WORDS.append(add_list[i])
    
# remove negation stop words
add_or_remove_stop_words(stop_remove, stop_add)

In [43]:
# text preprocessing formulas
def preprocess_text(text):
    '''tokenize, lemmatize and remove stop words from text'''
    
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    bigram = list(ngrams(tokens, 2)) 
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in STOP_WORDS]
    
    #convert text back to string
    text_str = " ".join(keywords)
    return text_str

In [44]:
# Get the polarity score of each complaint
def get_TextBlob_score(text):
    '''Assign polarity score between -1 to 1 for text'''
    
    polarity = TextBlob(text).sentiment.polarity
    return polarity

In [45]:
# Preprocess complaint_text column then run the textblob polarity 
df_gt['preprocessed_complaint'] = df_gt.complaint_text.apply(preprocess_text)

df_gt['polarity'] = [get_TextBlob_score(str(complaint)) for complaint in df_gt.preprocessed_complaint]

In [47]:
df_gt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 70509 to 8285
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   complaint_text          100 non-null    object 
 1   complaint_ID            100 non-null    int64  
 2   severe                  100 non-null    int64  
 3   preprocessed_complaint  100 non-null    object 
 4   polarity                100 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.7+ KB


# Create Labels for Complaints - Severe (1) or Not Severe (0)

### Severe defined as complaints that should be addressed as they would pose reputational risk and could be used as learning examples for customer service moving forward

### Examples:
- claims of fees where there should not be
    - overdraft fees with an account with money in it
    - additional principal
- fraudulent or closed accounts appearing on credit reporting that would affect credit score
- complaints where customer is insistent on wrongdoing
    - maintaining/regaining customer satisfaction by ensuring most angered customers are addressed and situation is explained

In [48]:
df_gt.iloc[0:9,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
70509,on XX/XX/2018 i applied for a credit through w...,2975717,0,2018 apply credit well fargo online site inten...,0.029282
14855,I had previously submitted the following compl...,3493387,0,previously submit follow complaint regard bank...,-0.030556
17961,I have a wells fargo account. XXXX XXXX kept ...,3583070,0,well fargo account keep chagring bank account ...,-0.2
18625,The Home Depot/Citibank North America says tha...,3775391,0,home depot citibank north america say late dat...,0.066667
50357,"Hello, my name is XXXX XXXX and I am contactin...",1787885,0,hello name contact find government agency may ...,0.053444
74545,The reported JPM Chase accounts on my credit r...,3042956,0,report jpm chase account credit report duplica...,0.054464
37960,"To my knowledge, I was never late on my Chase ...",3177404,0,knowledge never late chase credit card account...,-0.090702
13482,This account does not belong to me. I called M...,3877825,0,account not belong call macys say unable locat...,-0.066667
36475,"Last night, XX/XX/XXXX, I checked my Bank of A...",2785597,0,last night check bank america check account li...,0.056585


In [59]:
# rows 1-10
df_gt.iloc[0,2] = 0 #rejected for credit card due to credit score
df_gt.iloc[1,2] = 0 #can't send mail to BofA
df_gt.iloc[2,2] = 1 #overdraft fees, closed account, causing many life issues
df_gt.iloc[3,2] = 0 #customer not sure if they paid late
df_gt.iloc[4,2] = 1 #customer thinks theyre owed 35k more than they should
df_gt.iloc[5,2] = 0 #accounts being shown on credit report
df_gt.iloc[6,2] = 0 #customer claiming never late payments
df_gt.iloc[7,2] = 0 #fraudulent macys account
df_gt.iloc[8,2] = 1 #unjustified? overdraft fee of $35 and poor customer service
df_gt.iloc[9,2] = 0 #customer unable to pay credit line - wants reduction in charges


In [60]:
df_gt.iloc[10:19,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
23860,Wife received a letter that wanted personal in...,2876335,0,wife receive letter want personal information ...,0.0
52313,My situation is very clear and simple. I have ...,1534576,0,situation clear simple make cash deposit atm c...,-0.020771
24734,I 'm in bankruptcy I pay mortgage company and ...,2499450,0,bankruptcy pay mortgage company bankruptcy cou...,-0.4
11802,The following account is not mine JPMCB CARD A...,3571895,0,follow account not mine jpmcb card account num...,0.0
75071,Wells Fargo is misapplying payments across two...,3059830,0,well fargo misapply payments across two studen...,-0.0625
12916,I have not received confirmation not certified...,3570994,0,not receive confirmation not certify testimony...,-0.06
72723,"In XX/XX/XXXX, my husband and I refinanced our...",2909978,0,husband refinance home mortgage chase bank low...,0.045524
61569,In XXXX we filed a chapter XXXX bankruptcy in ...,1377573,0,file chapter bankruptcy order try keep home cr...,-0.0111
68802,I have communicated with this organization num...,2921695,0,communicate organization numerous time demand ...,-0.086364


In [79]:
# rows 11-20
df_gt.iloc[10,2] = 1 #card fraudulently closed - will affect credit score
df_gt.iloc[11,2] = 0 #customer claims ATM stole money
df_gt.iloc[12,2] = 0 #late payments - customer disagrees
df_gt.iloc[13,2] = 0 #not the customer's account on credit report
df_gt.iloc[14,2] = 1 #inaccurate payment allocation to student loan accounts causing late fees
df_gt.iloc[15,2] = 1 #fraudulent, inaccurate reporting on credit report - affecting credit
df_gt.iloc[16,2] = 1 #fee assessed on $0 balance
df_gt.iloc[17,2] = 1 #bank adding credit demerits when they were already covered in bakruptcy filing
df_gt.iloc[18,2] = 0 #customer had account opened fraudulently
df_gt.iloc[19,2] = 0 #clerical error to be fixed

In [80]:
df_gt.iloc[20:29,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
59113,I had a levy from a debt collector and law fir...,1837858,0,levy debt collector law firm base place bank a...,-0.160185
29564,My business account is being manipulated by th...,2691620,0,business account manipulate bank check come ge...,0.0
39528,I made a charge on my credit card and later di...,2341120,0,make charge credit card later discover scam ge...,-0.2
74069,I was selling furniture on XXXX and had a buye...,3035566,0,sell furniture buyer send cashier check larger...,0.08625
46880,My apt was closed bad on what i believe to be ...,2123753,0,apt close bad believe fraudulent activity tell...,-0.033673
14748,"Today, XX/XX/2020 I needed to withdraw funds f...",3599554,0,today 2020 need withdraw fund well fargo new j...,0.047971
43953,"I got a Heloc loan for XXXX, ( which is a line...",1579500,0,get heloc loan line credit citi bank notsure a...,0.320015
40425,M y line of credit that I had with Citibank ( ...,2500370,0,line credit citibank best buy suddenly close d...,-0.025781
28964,On the evening of XXXX I received a text from ...,2702964,0,even receive text advise hard credit inquiry m...,-0.291667


In [92]:
#rows 21-30
df_gt.iloc[20,2] = 1 #customer not returned full fee levied illegally
df_gt.iloc[21,2] = 0 #banks freezing checks for security reasons
df_gt.iloc[22,2] = 0 #customer made a charge on card that was scam - police issue
df_gt.iloc[23,2] = 0 #customer fell victim to scam - police issue
df_gt.iloc[24,2] = 0 #alleging fraudulent activity on card
df_gt.iloc[25,2] = 0 #customer wants to walk up to drive in ATM
df_gt.iloc[26,2] = 0 #too many documents to fill out for HELOC
df_gt.iloc[27,2] = 1 #identify theft causing credit issues leading to account closure
df_gt.iloc[28,2] = 0 #customer inquiry of an application for credit
df_gt.iloc[29,2] = 0 #customer asking about loan mod

In [93]:
df_gt.iloc[30:39,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
38721,Made a deposit to my Chase account on XX/XX/XX...,3117027,0,make deposit chase account check pay clear iss...,0.3
47095,I work with XXXX on behalf of the seller to he...,2011456,0,work behalf seller help facilitate short sale ...,-0.116667
70150,In XX/XX/XXXX I made arrangements with XXXX XX...,2907254,0,make arrangements take dollars monthly account...,0.0625
36435,Chase financial holds my mgt@6.75 for 10 yrs. ...,1565467,0,chase financial hold mgt 6 75 10 yrs advise st...,0.13
74780,Please consider this my formal complaint again...,3093637,0,please consider formal complaint sear cbna rep...,0.147222
38655,A man came to fix my furnace. He told me to pa...,3186219,0,man come fix furnace tell pay advance service ...,-0.157143
72226,I have already put in a complaint about this. ...,2988989,0,already put complaint im not hear back back se...,0.048052
67084,Citibank refused to honor me welcome bonus XXX...,2705526,0,citibank refuse honor welcome bonus miles meet...,0.286147
47020,I started the application for a mortgage with ...,1969584,0,start application mortgage bank america since ...,-0.11303


In [104]:
#rows 31-40
df_gt.iloc[30,2] = 1 #$230 overdraft fee after a check deposit
df_gt.iloc[31,2] = 0 #someone working on short sale not satisfied with speed of transaction
df_gt.iloc[32,2] = 0 #customer upset with sears selling info
df_gt.iloc[33,2] = 0 #not clear
df_gt.iloc[34,2] = 1 #derogatory account appearing on customer credit report
df_gt.iloc[35,2] = 0 #man didn't fix furnace
df_gt.iloc[36,2] = 1 #CD disappeared, been harrassed by some family
df_gt.iloc[37,2] = 0 #customer not satisfied with terms of welcome bonus
df_gt.iloc[38,2] = 0 #loan closing keeps getting delayed
df_gt.iloc[39,2] = 0 #customer victim of fraud - conflicting $ amounts in complaint

In [63]:
df_gt.iloc[40:49,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
38222,"Citibank will not print, display on line, or e...",3156681,0,citibank not print display line email statemen...,0.35
73756,Bank of America granted me two -- -three month...,3036887,0,bank america grant two three month deferrals p...,-0.169444
11878,"First and foremost, There are multiple complai...",3551370,0,first foremost multiple complaints like submit...,-0.051354
387,"On a statement sent on XX/XX/2019, Wells Fargo...",3304680,0,statement send 2019 well fargo send escrow sta...,-0.090476
3294,i was denied a home loan modification by bank ...,3285451,0,deny home loan modification bank america loan ...,0.003125
38003,I have been working with XXXX to save my home....,2811947,0,work save home lender well fargo submit paperw...,-0.067424
40368,My Best buy credit card was stolen in XX/XX/20...,3655528,0,best buy credit card steal 2020 ship even rece...,0.012617
14048,"On XX/XX/XXXX at XXXX, XXXX XXXX , with XXXX X...",3478608,0,dba help center reach via text loan forgivenes...,-0.046411
52354,i keep getting credit card offers from Chase b...,1733264,0,keep get credit card offer chase bank act beha...,0.0


In [115]:
#rows 41-50
df_gt.iloc[40,2] = 0 #bank won't provide statement if no activity for that month
df_gt.iloc[41,2] = 1 #customer given payment relief due to hurricane, not reflected in account
df_gt.iloc[42,2] = 1 #overdraft fees (okay), ATM didn't deposit money appropriately, customer without money
df_gt.iloc[43,2] = 0 #disagreement on escrow payment
df_gt.iloc[44,2] = 0 #customer unable to modify loan - in bankruptcy
df_gt.iloc[45,2] = 0 #want loan mod but has been delinquent on payments for 37 months - partial payments
df_gt.iloc[46,2] = 0 #stolen credit card, charges refunded except $13
df_gt.iloc[47,2] = 0 #scam from other company - bank stopped charges - police contacted
df_gt.iloc[48,2] = 0 #keep getting credit card offers
df_gt.iloc[49,2] = 0 #customer had to file for bankruptcy

In [116]:
df_gt.iloc[50:59,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
1456,XX/XX/2019 {$780.00} XXXX XXXX XXXX XX/XX/2019...,3206805,0,2019 780 2019 780 2019 540 receive fraud alert...,-0.05625
67871,"On XX/XX/XXXX, a series of fraudulent charges ...",2943284,0,series fraudulent charge make account one clea...,0.034091
25420,I received a check for amount of {$6000.00} fr...,2552798,0,receive check amount 6000 bank check make sign...,0.033766
48381,I have home equity loan with Bank of America. ...,1971336,0,home equity loan bank america loan 10 year ter...,-0.072619
25527,Myself and my wife are first-time home-buyers....,2555150,0,wife first time home buyers bank well fargo se...,0.063524
27809,I filed a homeowner insurance claim with XXXX ...,2575524,0,file homeowner insurance claim insurance compa...,0.014286
18690,Please help!! My name is XXXX XXXX and I obtai...,3755809,0,please help name obtain mortgage 140000 sell r...,-0.011894
23418,I opened up a free checking account at Wells F...,1985305,0,open free check account well fargo bank months...,-0.00625
16527,I have a had multiple accounts of money being ...,3855961,0,multiple account money take account place call...,0.0


In [127]:
#rows 51-60
df_gt.iloc[50,2] = 1 #credit card compromised - customer on hook for charges
df_gt.iloc[51,2] = 0 #issue still ongoing. bank still trying to fix
df_gt.iloc[52,2] = 1 #lost check was cashed fraudulently
df_gt.iloc[53,2] = 1 #customer's credit score is being impacted while modifying HE loan
df_gt.iloc[54,2] = 1 #hidden closing cost items in mortgage during application - only disclose 3 days before closing
df_gt.iloc[55,2] = 1 #issue with depositing check - not correct name on account for who check was for?
df_gt.iloc[56,2] = 1 #mortgage sold and added 51k in new money owed
df_gt.iloc[57,2] = 0 #inactivity charge on bank account
df_gt.iloc[58,2] = 0 #company called hidden listings taking money out of account
df_gt.iloc[59,2] = 0 #merchant charged subscription customer didn't want

In [128]:
df_gt.iloc[60:69,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
53426,I am posting this complaint to alert CitiMortg...,1904827,0,post complaint alert citimortgage fact althoug...,0.012326
25391,I currently bank with Wells Fargo and am using...,2535325,0,currently bank well fargo use app call order a...,0.166667
13544,I applied for a Macy 's card financed through ...,3555448,0,apply macy card finance dsnb lack home loan ac...,0.0
33990,I recently checked my credit score with Chase....,2807413,0,recently check credit score chase com show sco...,0.107273
31926,Bank of America through their Trustee Attorney...,2764659,0,bank america trustee attorney subsidiary misre...,0.113333
63030,I opened a wells Fargo loan at XXXX XX/XX/2016...,2262640,0,open well fargo loan 2016 never make late paym...,-0.143182
34930,"On XX/XX/XXXXor XX/XX/2018, I requested my ban...",2845758,0,xxxxor 2018 request bank well fargo place stop...,-0.0125
54072,I have a credit card from Band of America and ...,2210724,0,credit card band america set automatic payment...,0.05
38912,Chase closed my bank accounts in XX/XX/2019. T...,3143125,0,chase close bank account 2019 problem online a...,0.025


In [139]:
#rows 61-70
df_gt.iloc[60,2] = 1 #customers mortgage is with defunct company
df_gt.iloc[61,2] = 0 #customer uses app that runs slow
df_gt.iloc[62,2] = 0 #customer's credit history not strong enough for credit card
df_gt.iloc[63,2] = 0 #confused by credit report
df_gt.iloc[64,2] = 1 #needs title transfer - homeless
df_gt.iloc[65,2] = 1 #retaliatory closed loan account?
df_gt.iloc[66,2] = 0 #ongoing handling by bank
df_gt.iloc[67,2] = 0 #automatic payment mix-up
df_gt.iloc[68,2] = 0 #cannot access closed accounts
df_gt.iloc[69,2] = 0 #customer didn't notice a charge on account

In [66]:
df_gt.iloc[70:79,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
36073,Returns made for purchases that were made at X...,2550133,0,return make purchase make pay chase visa not a...,0.183333
14257,"JP Morgan Chase, back on my accounts how the m...",3658160,0,jp morgan chase back account mess things get p...,-0.058163
18358,I recently had my mortgage loan refinanced by ...,3893813,0,recently mortgage loan refinance another compa...,0.0625
61191,I opened a CITIGOLD Interest Checking Account ...,1968661,0,open citigold interest check account citibank ...,0.019723
11282,The credit card statements fail to indicate th...,3531561,0,credit card statements fail indicate correct b...,-0.152778
8823,My checking account with Bank of America was e...,3479444,0,check account bank america excessively charge ...,-0.069298
12073,"Upon reviewing my credit report, I noticed an ...",3516434,0,upon review credit report notice outstanding b...,0.116667
40356,I ssued a Macy 's s tore card in return for on...,2071355,0,ssued macy tear card return one day discount p...,-0.071429
48894,Bait & Switch predatory lending from Wells Far...,1953521,0,bait switch predatory lend well fargo mortgage...,0.041667


In [150]:
#rows 71-80
df_gt.iloc[70,2] = 0 #refund in cash instead of on card
df_gt.iloc[71,2] = 1 #confusion about payment on loan
df_gt.iloc[72,2] = 0 #haven't gotten letter about payoff of loan
df_gt.iloc[73,2] = 0 #confusion about checking account promotion
df_gt.iloc[74,2] = 0 #question about policy
df_gt.iloc[75,2] = 0 #questioning getting assessed fees
df_gt.iloc[76,2] = 1 #derogatory mark stays on credit report even when customer tried to pay in full and close account
df_gt.iloc[77,2] = 0 #needed to make payment on credit card
df_gt.iloc[78,2] = 0 #didn't qualify for $1k first-time home buyer promotion
df_gt.iloc[79,2] = 1 #deceived customer lost down payment

In [67]:
df_gt.iloc[80:89,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
7003,I had a mileage plus credit card and I lost my...,3410443,0,mileage plus credit card lose job unable pay o...,0.166667
24719,This letter summarizes our current situation. ...,2499024,0,letter summarize current situation thank boa b...,0.180564
36615,1. BANK OF AMERICA- They reported negative inf...,2791533,0,1 bank america report negative information cre...,-0.099
9543,On XX/XX/2020 I tried calling Citibank after s...,3608703,0,2020 try call citibank spend several hours onl...,-0.033929
30000,I received an alert from XXXX XXXX about an in...,2717263,0,receive alert inquiry think mistake fraud aler...,0.083333
14265,On XX/XX/2020 I received a voice mail from Cha...,3512623,0,2020 receive voice mail chase voice mail trans...,0.140476
19188,Tried to open an additional savings account wi...,3826057,0,try open additional save account merrill edge ...,0.0
5516,I wrote a check out to myself for {$8600.00} f...,3381169,0,write check 8600 citibank account fund remove ...,0.016667
58688,When speaking to my Chase rep who is supposed ...,1594245,0,speak chase rep suppose help submit modificati...,-0.065714


In [161]:
#rows 81-90
df_gt.iloc[80,2] = 1 #account should be removed from credit file
df_gt.iloc[81,2] = 0 #not satisfied with new offer
df_gt.iloc[82,2] = 0 #customer had late payments and credit went down
df_gt.iloc[83,2] = 1 #customer on hold for 2 hours, call dropped - result of missed paper statements
df_gt.iloc[84,2] = 0 #customer notified of a credit application that was denied
df_gt.iloc[85,2] = 0 #customer doesn't want to be called
df_gt.iloc[86,2] = 1 #accounts closed and won't release funds
df_gt.iloc[87,2] = 1 #funds pulled from account - check returned for insufficient funds but never returned funds
df_gt.iloc[88,2] = 0 #questioning advice of agent
df_gt.iloc[89,2] = 0 #airline wouldn't allow them on the plane

In [162]:
df_gt.iloc[90:99,:]

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
57992,Please consider the additional information set...,1635027,0,please consider additional information set ame...,-0.00441
23208,I have a checking account with Chase Bank. The...,1655612,0,check account chase bank deceptive ways show a...,0.186667
11770,"Closed credit card with Citicard in XXXX, XXXX...",3650079,0,close credit card citicard citi keep bill send...,-0.177778
8931,This is a follow up complaint Referring to my ...,3619985,0,follow complaint refer prior complaint company...,-0.194444
2569,Step 1 : What is this complaint about? - Payme...,3059864,0,step 1 complaint payment allocation step 2 typ...,-0.034211
63384,We have been solicited by our current servicer...,2208570,0,solicit current servicer jp morgan chase harp ...,0.097222
14588,Bank of America provides different level of se...,3497922,0,bank america provide different level service c...,-0.15
71938,I'm writting to dispute an unauthorized charge...,3035496,0,writting dispute unauthorized charge amount 10...,0.165
13266,My checking account was closed by Citibank wit...,3592574,0,check account close citibank without warn noti...,0.132846


In [172]:
# read complaints
df_gt.iloc[99,0]

"Vehicle was financed on XXXX XXXX and last payment was XXXX balance was XXXX. Chase auto never release the Title of the vehicle. I called chase Auto finance division in yearXXXX their answer was last payment was not clear. '' I asked them if they sent any written notice by mail if they did it send that but they never replied back ''"

In [173]:
#rows 91-100
df_gt.iloc[90,2] = 1 #credit issue data changed when added wife to account - hurt credit score
df_gt.iloc[91,2] = 0 #doesn't understand the balance and how pending charges affect
df_gt.iloc[92,2] = 1 #still contacted on account closed a while ago
df_gt.iloc[93,2] = 0 #customer still owes some fees after settlement
df_gt.iloc[94,2] = 0 #doesn't agree with the way the balance is paid
df_gt.iloc[95,2] = 1 #refi miscommunication
df_gt.iloc[96,2] = 0 #doesn't want to speak on telephone
df_gt.iloc[97,2] = 1 #unauthorized charge hitting account
df_gt.iloc[98,2] = 1 #account closed without notification, customer can't access branch
df_gt.iloc[99,2] = 0 #title not released, final payment hadn't cleared

In [174]:
df_gt.describe()

Unnamed: 0,complaint_ID,severe,polarity
count,100.0,100.0,100.0
mean,2799804.0,0.36,0.004429
std,685252.0,0.482418,0.123192
min,1377573.0,0.0,-0.4
25%,2235563.0,0.0,-0.066856
50%,2893425.0,0.0,0.0
75%,3427484.0,1.0,0.062756
max,3893813.0,1.0,0.35


In [176]:
df_gt

Unnamed: 0,complaint_text,complaint_ID,severe,preprocessed_complaint,polarity
70509,on XX/XX/2018 i applied for a credit through w...,2975717,0,2018 apply credit well fargo online site inten...,0.029282
14855,I had previously submitted the following compl...,3493387,0,previously submit follow complaint regard bank...,-0.030556
17961,I have a wells fargo account. XXXX XXXX kept ...,3583070,1,well fargo account keep chagring bank account ...,-0.200000
18625,The Home Depot/Citibank North America says tha...,3775391,0,home depot citibank north america say late dat...,0.066667
50357,"Hello, my name is XXXX XXXX and I am contactin...",1787885,1,hello name contact find government agency may ...,0.053444
...,...,...,...,...,...
63384,We have been solicited by our current servicer...,2208570,1,solicit current servicer jp morgan chase harp ...,0.097222
14588,Bank of America provides different level of se...,3497922,0,bank america provide different level service c...,-0.150000
71938,I'm writting to dispute an unauthorized charge...,3035496,1,writting dispute unauthorized charge amount 10...,0.165000
13266,My checking account was closed by Citibank wit...,3592574,1,check account close citibank without warn noti...,0.132846


In [180]:
save = os.path.join(save_path, 'data/processed', 'complaints_ground_truth.csv')
df_gt.to_csv(save)

# Model to predict severity

In [182]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score
from sklearn import metrics

In [185]:
load_path = "/Users/joeboardman/Documents/Springboard/GitHub/CapstoneThree/data/processed/"
os.chdir(load_path)
tfidf_full = pd.read_csv('tfidf_train.csv', index_col=0)
tfidf_full.head()

Unnamed: 0.1,Unnamed: 0,10,10 business,10 business days,10 day,10 days,10 years,100,1000,1000 almost,...,yet,yet receive,yet still,yet take,york,young,young park,yrs,zero,zero balance
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078769,0.093218
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.047248,0.0,0.0,0.0,0.0,0.0,0.0,0.157611,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.064674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
rf = RandomForestClassifier(random_state=34)
model = cross_validate(rf,df_gt.drop('severe', axis=1), df_gt.severe)
y_pred = model.predict(tfidf_full)