# Chatbot Conversations From Customer Service Scripts

In [297]:
import numpy as np
import pandas as pd
import sys, os, re, itertools, collections, string, time
from io import BytesIO
from collections import Counter
from time import time
import datetime
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
   


In [349]:
# https://catalog.data.gov/dataset/consumer-complaint-database 
complaints_df_raw = pd.read_csv("consumer_complaints.csv", 
                usecols=('Product','Consumer complaint narrative', 'Sub-issue'),
                dtype={'consumer_complaint_narrative': object})
# Only interested in data with consumer complaints
complaints_df_raw=complaints_df_raw[complaints_df_raw['Consumer complaint narrative'].notnull()]
complaints_df_raw=complaints_df_raw[complaints_df_raw['Product'].notnull()]

# remove XXXX from narratives
complaints_df_raw['Consumer complaint narrative'] =  complaints_df_raw['Consumer complaint narrative'].replace({'X':''}, regex=True)

# always seed your random generators for reporducilibity 
complaints_df_raw = complaints_df_raw.sample(200000, replace=False, random_state=1)

# basic sentence prep
# set to lower
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.lower()
# remove special characters
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.replace('\W', ' ')

# remove elements with no text
complaints_df_raw= complaints_df_raw[complaints_df_raw['Consumer complaint narrative'] != '']

# any dups
complaints_df_raw = complaints_df_raw.drop_duplicates(subset=['Consumer complaint narrative'])

In [365]:
complaints_df_raw.head()

Unnamed: 0,Product,Sub-issue,Consumer complaint narrative
332635,Bank account or service,,Wells Fargo //2015 took {$16000.00} from my cr...
992324,"Credit reporting, credit repair services, or o...",Credit inquiries on your report that you don't...,computer base system pre approve me for a cred...
902590,Debt collection,Debt is not yours,I have a dept on my Credit report which is not...
618944,Credit reporting,Information is not mine,"Upon getting my credit report, I noted an addr..."
834643,"Credit reporting, credit repair services, or o...",Information belongs to someone else,"I never consented to be a co signer, account d..."


# Clean Up Data

In [301]:
complaints_df = complaints_df_raw.copy()
complaints_df['Consumer complaint narrative'] = word_data

In [302]:
word_similarity=complaints_df['Consumer complaint narrative'].str.split(' ').map(Counter)
word_similarity_ratio = []
complaints_df.shape 

(194490, 3)

In [303]:
for wu in word_similarity:
    word_similarity_ratio.append(np.sum([x[1] for x in wu.items()])/np.float(len(wu)))
    
complaints_df['narrative_similarity_ratio'] = word_similarity_ratio
complaints_df['narrative_similarity_ratio'].describe()

count    194490.000000
mean          1.662741
std           0.469002
min           1.000000
25%           1.361111
50%           1.600000
75%           1.889831
max          55.782609
Name: narrative_similarity_ratio, dtype: float64

In [304]:
# thin out some entries that contain too much duplicated lines within

complaints_df = complaints_df[complaints_df['narrative_similarity_ratio'] <= 1.5]
complaints_df.reset_index(drop=True,inplace=True)
complaints_df.shape

(77503, 4)

In [306]:
list(complaints_df['Consumer complaint narrative'])[0]

'computer base system pre approve me for a credit limit then decline my application'

# Get Key Verbs And Nouns

In [307]:
# find most common verbs and measure coverage 
import spacy
# Load English tokenizer, tagger, parser, NER and word vectors
#nlp = spacy.load('en')

# just load what we need to avoid taxing memory
nlp = spacy.load('en', parser=False, entity=False)


In [308]:
# create one big blob of text to process things a bit faster
blob_complaints = ''.join(list(complaints_df['Consumer complaint narrative']))

# Max text of length of 1000000
n = 900000
blog_chunks = [blob_complaints[i:i+n] for i in range(0, len(blob_complaints), n)]
len(blog_chunks)

31

In [309]:
just_verbs = []
just_nouns = []
counter_=len(blog_chunks)
for sentence in blog_chunks:
    counter_ -= 1
    if (counter_ % 10 == 0): print(counter_)
    print(counter_)
    doc = nlp(sentence.decode('utf-8'))
    temp_verb = []
    temp_noun = []
    for token in doc: 
        if (token.pos_ == u'VERB'): 
            temp_verb.append(token.text)
        if (token.pos_ == u'NOUN'):
            temp_noun.append(token.text)
            

    just_verbs.append(' '.join(temp_verb).encode('utf-8'))
    just_nouns.append(' '.join(temp_noun).encode('utf-8'))
    
    

30
30
29
28
27
26
25
24
23
22
21
20
20
19
18
17
16
15
14
13
12
11
10
10
9
8
7
6
5
4
3
2
1
0
0


In [358]:
just_verbs[0].split()[0:10]

['approve',
 'decline',
 'have',
 'is',
 'have',
 'spoke',
 'being',
 'reported',
 'do',
 'know']

In [359]:
just_nouns[0].split()[0:10]

['computer',
 'base',
 'system',
 'pre',
 'credit',
 'limit',
 'applicationi',
 'dept',
 'credit',
 'report']

In [310]:
print('count just_verbs: %i' % len(just_verbs))
print('count just_nouns: %i' % len(just_nouns))
    

count just_verbs: 31
count just_nouns: 31


In [311]:
# pickle both objects so you don't have to re-run spacy 
import pickle
pickle_file = "verbs_nouns.p"

overwrite_old_pickle = True
if overwrite_old_pickle:
    with open(pickle_file, "wb") as f:
        pickle.dump([just_verbs, just_nouns], f)
    
# read in saved pickle
with open(pickle_file, "rb") as f:
    backup_pos = pickle.load(f)

## Sorting Out Verbs

In [312]:
all_verbs = backup_pos[0]
len(all_verbs)

# append all verbs together so we can run frequency counts
verbs = []
for verb_set in all_verbs:
    verbs.append(verb_set.split())
    #verbs = [verb for verb in verb_set[0].split()]

len(verbs)
verbs_master = [val for sublist in verbs for val in sublist]
len(verbs_master)

1130054

In [360]:
# what is your upper and lower cut offs?
from collections import Counter
verbs_df = pd.DataFrame(Counter([verb for verb in verbs_master]).most_common(), columns = ['verb', 'count'])
verbs_df.head(20)

Unnamed: 0,verb,count
0,have,70226
1,is,58620
2,was,55698
3,are,23786
4,has,23560
5,been,21781
6,be,20973
7,do,17471
8,had,16062
9,am,15028


In [314]:
len(verbs_df[verbs_df['count'] > 1000])
verbs_df = verbs_df[verbs_df['count'] > 1000]

## Sorting Out Nouns

In [315]:
all_nouns = backup_pos[1]

# append all verbs together so we can run frequency counts
nouns = []
for noun_set in all_nouns:
    nouns.append(noun_set.split())

nouns_master = [val for sublist in nouns for val in sublist]
len(nouns_master)

1140118

In [316]:
# what is your upper and lower cut offs?
from collections import Counter
nouns_df = pd.DataFrame(Counter([noun for noun in nouns_master]).most_common(), columns = ['noun', 'count'])
nouns_df.head()

Unnamed: 0,noun,count
0,credit,58934
1,account,35048
2,report,23740
3,debt,22419
4,information,19445


In [317]:
len(nouns_df[nouns_df['count'] > 1000])
nouns_df = nouns_df[nouns_df['count'] > 1000]

## Binarize DataFrame With Official Verb & Noun List

In [318]:
# create new data frame with key verbs and nouns as features
key_words = list(nouns_df['noun']) + list(verbs_df['verb'])
row_bools = []
counter_ = len(complaints_df['Consumer complaint narrative'])
for sentence in complaints_df['Consumer complaint narrative']:
    counter_ -= 1
    if (counter_ % 10000 == 0): print(counter_)
    row_bool = []
    words = sentence.split()
    for kw in key_words:
        row_bool.append(kw in words)
    row_bools.append(row_bool)
    
print('length:', len(row_bools))
row_bools = pd.DataFrame(row_bools, columns=key_words)    
row_bools = row_bools.astype(int)
row_bools.shape

    

70000
60000
50000
40000
30000
20000
10000
0
('length:', 77503)


(77503, 395)

In [361]:
row_bools.head()

Unnamed: 0,credit,account,report,debt,information,company,loan,payment,bank,card,...,charging,lived,lost,disputing,mailed,needed,came,contacting,keeps,cluster
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,35
2,1,0,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,13
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5


# Cluster of popular sentences

In [319]:
from sklearn.cluster import KMeans

TOTAL_CLUSTERS = 50

# Number of clusters
kmeans = KMeans(n_clusters=TOTAL_CLUSTERS)
# Fitting the input data
kmeans = kmeans.fit(row_bools)
# Getting the cluster labels
labels = kmeans.predict(row_bools)
# Centroid values
centroids = kmeans.cluster_centers_

# add cluster back to data frame 
row_bools['cluster'] = labels

row_bools['cluster'].value_counts().head()

10    4400
15    3000
22    2653
2     2518
19    2492
38    2298
3     2249
5     2101
23    2100
27    2044
35    2029
13    2011
48    1983
45    1923
1     1896
16    1857
4     1840
20    1777
32    1756
7     1635
30    1565
6     1556
25    1556
47    1505
12    1452
11    1444
46    1439
28    1372
8     1371
14    1337
42    1317
9     1314
29    1297
26    1225
33    1207
0     1201
21    1144
37    1110
44    1084
31    1081
41    1068
40    1040
17     981
34     909
43     855
49     825
39     292
36     209
18     111
24      74
Name: cluster, dtype: int64

In [362]:
row_bools['cluster'].value_counts().head()

10    4400
15    3000
22    2653
2     2518
19    2492
Name: cluster, dtype: int64

In [334]:

# add cluster number back to orginal corpus
complaints_df['Cluster'] = labels
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
import itertools
from collections import Counter
import nltk
from nltk.util import ngrams

unique_complaints_2grams = []
unique_complaints_3grams = []
unique_complaints_4grams = []
unique_complaints_5grams = []
unique_complaints_6grams = []
# loop through each cluster
for cluster_to_search in range(min(row_bools['cluster']), max(row_bools['cluster'])+1):
    # cluster-level research
    print('Cluster: %i' % cluster_to_search)
    df_tmp = complaints_df[complaints_df['Cluster']==cluster_to_search].copy()
    print('data cluster shape: %s' % len(df_tmp))
    
    bigrams = []
    trigrams = []
    fourgrams = []
    fivegrams = []
    sixgrams = []
    
    for index, row in df_tmp.iterrows(): 
        token = nltk.word_tokenize(row['Consumer complaint narrative'].decode('utf-8'))
        bigrams.append([' '.join(pair) for pair in list(ngrams(token,2)) if len(set(pair))==2])
        trigrams.append([' '.join(pair) for pair in list(ngrams(token,3)) if len(set(pair))==3])
        fourgrams.append([' '.join(pair) for pair in list(ngrams(token,4)) if len(set(pair))==4])
        fivegrams.append([' '.join(pair) for pair in list(ngrams(token,5)) if len(set(pair))==5])
        sixgrams.append([' '.join(pair) for pair in list(ngrams(token,6)) if len(set(pair))==6])
        
    bigrams = [val for sublist in bigrams for val in sublist]
    trigrams = [val for sublist in trigrams for val in sublist]
    fourgrams = [val for sublist in fourgrams for val in sublist]
    fivegrams = [val for sublist in fivegrams for val in sublist]
    sixgrams = [val for sublist in sixgrams for val in sublist]
    
    # find top x most popular grams per size
    # 2 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in bigrams]).most_common(50), columns=['bigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_2grams.append(freqx)
    # 3 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in trigrams]).most_common(50), columns=['trigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_3grams.append(freqx)
    # 4 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_4grams.append(freqx)
    # 5 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fivegrams]).most_common(50), columns=['fivegrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_5grams.append(freqx)
    # 6 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in sixgrams]).most_common(50), columns=['sixgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_6grams.append(freqx)
 
  

Cluster: 0
data cluster shape: 1201
Cluster: 1
data cluster shape: 1896
Cluster: 2
data cluster shape: 2518
Cluster: 3
data cluster shape: 2249
Cluster: 4
data cluster shape: 1840
Cluster: 5
data cluster shape: 2101
Cluster: 6
data cluster shape: 1556
Cluster: 7
data cluster shape: 1635
Cluster: 8
data cluster shape: 1371
Cluster: 9
data cluster shape: 1314
Cluster: 10
data cluster shape: 4400
Cluster: 11
data cluster shape: 1444
Cluster: 12
data cluster shape: 1452
Cluster: 13
data cluster shape: 2011
Cluster: 14
data cluster shape: 1337
Cluster: 15
data cluster shape: 3000
Cluster: 16
data cluster shape: 1857
Cluster: 17
data cluster shape: 981
Cluster: 18
data cluster shape: 111
Cluster: 19
data cluster shape: 2492
Cluster: 20
data cluster shape: 1777
Cluster: 21
data cluster shape: 1144
Cluster: 22
data cluster shape: 2653
Cluster: 23
data cluster shape: 2100
Cluster: 24
data cluster shape: 74
Cluster: 25
data cluster shape: 1556
Cluster: 26
data cluster shape: 1225
Cluster: 27
dat

In [339]:
    df = pd.concat(unique_complaints_4grams)
    # freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
    df = df.drop_duplicates(subset=['fourgrams'], keep=False)
    df.head()

Unnamed: 0,fourgrams,frequency,Cluster
17,my loan was sold,11,0
18,loan was sold to,11,0
19,i was denied a,11,0
21,loan was paid off,10,0
23,i was not notified,10,0


In [346]:
# find top x most popular grams per size
see_grams = 6


if see_grams==2:
    df = pd.concat(unique_complaints_2grams)
    df = df.drop_duplicates(subset=['bigrams'], keep=False)
elif see_grams==3:
    df = pd.concat(unique_complaints_3grams)
    df = df.drop_duplicates(subset=['trigrams'], keep=False)
elif see_grams==4:
    df = pd.concat(unique_complaints_4grams)
    df = df.drop_duplicates(subset=['fourgrams'], keep=False)
elif see_grams==5:
    df = pd.concat(unique_complaints_5grams)
    df = df.drop_duplicates(subset=['fivegrams'], keep=False)
elif see_grams==6:
    df = pd.concat(unique_complaints_6grams)
    df = df.drop_duplicates(subset=['sixgrams'], keep=False)
 
df = df.sort_values('Cluster')
df[df['frequency'] > 10]  



Unnamed: 0,sixgrams,frequency,Cluster
1,i had a credit card with,14,1
3,my bank of america credit card,14,1
4,a credit card in my name,11,1
0,i applied for a credit card,18,1
14,from my credit reports that do,23,3
31,deleted item . i did not,11,3
5,remove accounts from my credit reports,31,3
7,accounts from my credit reports that,27,3
11,credit reports that do not belong,24,3
13,reports that do not belong to,24,3


## Tie It Back To Complaint

In [364]:
# tie it back to look into a couple of actual complaints
keywords = "attempting to collect a debt from"
 
for index, row in complaints_df.iterrows():
    txt = row['Consumer complaint narrative'] 
    if (keywords in txt):
        print(txt)
        print('------')
    
 

stellar recovery inc is attempting to collect a debt from me that i do n't owe them. i never established a contract with this company and they have reported negative items onto my and credit reports which is in violation of the fcra.
------
i received a message from global credit and collection company who said it was attempting to collect a debt i owe. i called the number back who said it was a non working number for . this same company left a message on other phone numbers that have never been associated with any account i have. it was disclosed on answering machines voicemail that they were attempting to collect a debt from me.
------
trident asset management is attempting to collect a debt from me that i do n't owe them. i never established a contract with this company and they have reported negative items onto my and credit reports which is in violation of the fcra.
------
wakefield and associates of , colorado, has been attempting to collect a debt from me for one year. i have ne