In [2]:
# Please install gensim & NLTK packages beforehand
# https://pypi.python.org/pypi/gensim
# http://www.nltk.org/install.html

import pandas as pd
import numpy as np
import gensim

# This program takes too long to execute :( 
# Gensim constructs a vector for a document
# which takes a long time since it creates a 'model'
# which is trained on a corpus

In [3]:
# read the dataset
print("Reading data.csv")
data = pd.read_csv('Consumer_Complaints.csv')
print("number of records - ", len(data))
print(data.columns)

Reading data.csv
number of records -  891360
Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')


In [6]:
df = data.iloc[:,[5,7]]
df.columns
df=df.dropna(subset = ['Consumer complaint narrative'])
df

Unnamed: 0,Consumer complaint narrative,Company
1,I have outdated information on my credit repor...,"TRANSUNION INTERMEDIATE HOLDINGS, INC."
2,I purchased a new car on XXXX XXXX. The car de...,"CITIZENS FINANCIAL GROUP, INC."
7,An account on my credit report has a mistaken ...,Experian Information Solutions Inc.
12,This company refuses to provide me verificatio...,"The CBE Group, Inc."
16,This complaint is in regards to Square Two Fin...,SQUARETWO FINANCIAL CORPORATION
25,Started the refinance of home mortgage process...,AMERICAN NEIGHBORHOOD MORTGAGE
26,"In XXXX, I and my ex-husband applied for a ref...",HSBC NORTH AMERICA HOLDINGS INC.
28,I have disputed several accounts on my credit ...,"EQUIFAX, INC."
29,Mortgage was transferred to Nationstar as of X...,NATIONSTAR MORTGAGE
36,"Was a happy XXXX card member for years, in lat...",BARCLAYS BANK DELAWARE


In [7]:
# Grouping Complaints by Company

all_companies = set(df['Company'])

com_comp = {}
for index, company, complaint in df.itertuples():
    if com_comp.get(company):
        com_comp[company].add(complaint)
    else:
        com_comp[company] = set()
        com_comp[company].add(complaint)

print(len(com_comp))


187520


In [None]:
# Simple string operations to merge into one long complaint list per company

extracted = pd.DataFrame()
extracted['complaints'] = com_comp.keys()
extracted['company'] = ['\n'.join(value) for value in com_comp.values()]


In [None]:
# words concatenated to form one long string 

complaint_set={}
count=0
for c in set(extracted['company']):
    temp=extracted[extracted['company']==c]['complaints']
    ts=""
    for t in temp:
        ts+=str(t)
    
    complaint_set[c]=ts

    count+=1
print("done")

In [None]:
full_complaint_set = complaint_set
c_set = {}
count=0
for k, v in complaint_set.items():
    c_set[k]=v
    


In [23]:

# use gensim to make TaggedDocuments
from gensim.models.doc2vec import TaggedDocument
tagged_docs=[]

for k,v in complaint_set.items():
    td = TaggedDocument(gensim.utils.to_unicode(complaint_set[k]).split(), k)
    tagged_docs.append(td)
print("done")



done


In [24]:
# Just printing last tagged document to verify
print(td)

TaggedDocument(['rec', 'in', 'the', 'mail', 'offer', '6.29', 'apr,', 'if', 'approved,', 'was', 'approve', 'but', 'came', 'back', '11.5', 'apr', 'their', 'adv', 'is', 'misleading'], OpenRoad Lending)


In [28]:
model = gensim.models.Doc2Vec(tagged_docs, dm = 0, alpha=0.2, size= 20, min_alpha=0.050, min_count=0, workers=2)
print("creatd")

creatd


In [29]:
# start training

for epoch in range(100):
    if epoch % 10 == 0:
        print ('Now training epoch %s'%epoch)
    model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.2  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
 
print("All done")

Now training epoch 0
Now training epoch 10
Now training epoch 20
Now training epoch 30
Now training epoch 40
Now training epoch 50
Now training epoch 60
Now training epoch 70
Now training epoch 80
Now training epoch 90
All done


In [31]:
# Try clustering with NLTK

import nltk
from gensim.models import Doc2Vec
from nltk.cluster import KMeansClusterer
from nltk.corpus import stopwords
import re

In [32]:
def preprocess(str):
    # remove links
    str = re.sub(r'http(s)?:\/\/\S*? ', "", str)
    return str

In [33]:
def preprocess_document(text):
    text = preprocess(text)
    return ''.join([x if x.isalnum() or x.isspace() else " " for x in text ]).split()

In [37]:
vectors = []
used_objects = []
for k,v in complaint_set.items():
    vectors.append(model.infer_vector(preprocess_document(v)))
    used_objects.append(k)
#vectors



In [45]:
# K-means clustering
NUM_CLUSTERS=19

kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
print("done")

done


In [46]:
assigned_clusters[0:10]

[16, 2, 2, 10, 9, 18, 15, 13, 11, 11]

In [47]:
def get_objects_by_cluster(id):
    list = []
    for x in range(0, len(assigned_clusters)):
        if (assigned_clusters[x] == id):
            list.append(used_objects[x])
    return list

In [56]:
f=open("Clusters.txt","w+")
for i in range(NUM_CLUSTERS):
    objects = get_objects_by_cluster(i)
    print("Cluster ",i, " ",len(objects))
    f.write("Cluster "+ str(i)+"\n\n")
    f.write("Companies - "+ str(objects)+"\n\n")
    for o in objects:
        f.write(complaint_set[o]+"\n\n---\n\n")
    f.write("================================================\n\n")
f.close()

Cluster  0   60
Cluster  1   36
Cluster  2   179
Cluster  3   117
Cluster  4   51
Cluster  5   28
Cluster  6   68
Cluster  7   19
Cluster  8   68
Cluster  9   60
Cluster  10   73
Cluster  11   131
Cluster  12   75
Cluster  13   116
Cluster  14   68
Cluster  15   12
Cluster  16   121
Cluster  17   52
Cluster  18   77
