In [2]:
import gzip
import numpy as np
import pandas as pd
import random
import string
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
tqdm.pandas(desc='progress-bar')
from gensim import corpora, models
from sklearn import model_selection
from sklearn import cross_validation
from tabulate import tabulate
import pyLDAvis.gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn import svm



In [2]:
api = "https://data.consumerfinance.gov/resource/jhzv-w97w.json"
query = '?&$limit=1000000'
#$where=date%20between%20%272014-01-01T00:00:00%27%20and%20%272015-01-01T00:00:00%27'
    
dataset_identifier = 'jhzv-w97w'
APP_TOKEN = '48ozcpj4nCO3mqgJOl8GoIJgF'
token = '?$$app_token='
full_query = api+query
cfpb = pd.read_json(full_query)

In [3]:
cfpb.shape

(780644, 18)

In [141]:
cfpb.loc[1,:]

company                                                     Alpine Credit, Inc
company_public_response      Company disputes the facts presented in the co...
company_response                                       Closed with explanation
complaint_id                                                           2383241
complaint_what_happened      I have not been contacted about this negative ...
consumer_consent_provided                                     Consent provided
consumer_disputed                                                           No
date_received                                          2017-03-13T00:00:00.000
date_sent_to_company                                   2017-03-15T00:00:00.000
issue                                    Cont'd attempts collect debt not owed
product                                                        Debt collection
state                                                                       AR
sub_issue                                           

In [142]:
cfpb.head()

Unnamed: 0,company,company_public_response,company_response,complaint_id,complaint_what_happened,consumer_consent_provided,consumer_disputed,date_received,date_sent_to_company,issue,product,state,sub_issue,sub_product,submitted_via,tags,timely,zip_code
0,PORTFOLIO RECOVERY ASSOCIATES INC,,Closed with non-monetary relief,2443295,,,No,2017-04-19T00:00:00.000,2017-04-19T00:00:00.000,Communication tactics,Debt collection,NJ,Frequent or repeated calls,Credit card,Web,,Yes,083XX
1,"Alpine Credit, Inc",Company disputes the facts presented in the co...,Closed with explanation,2383241,I have not been contacted about this negative ...,Consent provided,No,2017-03-13T00:00:00.000,2017-03-15T00:00:00.000,Cont'd attempts collect debt not owed,Debt collection,AR,Debt was paid,I do not know,Web,,No,729XX
2,"Collection Bureau Services, Inc.",,Closed with explanation,2371772,I paid off another account a couple of years a...,Consent provided,Yes,2017-03-04T00:00:00.000,2017-03-04T00:00:00.000,Cont'd attempts collect debt not owed,Debt collection,ID,Debt is not mine,"Other (i.e. phone, health club, etc.)",Web,,Yes,837XX
3,AES/PHEAA,,Closed with explanation,2442800,,,No,2017-04-19T00:00:00.000,2017-04-19T00:00:00.000,Dealing with my lender or servicer,Student loan,CA,Received bad information about my loan,Federal student loan servicing,Web,,Yes,914XX
4,Ditech Financial LLC,Company believes it acted appropriately as aut...,Closed with explanation,2371574,,Consent not provided,No,2017-03-04T00:00:00.000,2017-03-04T00:00:00.000,"Loan servicing, payments, escrow account",Mortgage,MI,,Conventional fixed mortgage,Web,,Yes,48328


In [143]:
cfpb['product'].value_counts()/sum(cfpb['product'].value_counts())


Mortgage                                                                        0.292985
Debt collection                                                                 0.190328
Credit reporting                                                                0.180103
Credit card                                                                     0.114405
Bank account or service                                                         0.110578
Student loan                                                                    0.042477
Consumer Loan                                                                   0.040521
Payday loan                                                                     0.007110
Money transfers                                                                 0.006869
Credit reporting, credit repair services, or other personal consumer reports    0.005169
Prepaid card                                                                    0.004895
Credit card or prepai

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(cfpb['product'])
cfpb['le_product'] = le.transform(cfpb['product'])

In [5]:
data = cfpb[['le_product', 'complaint_what_happened', 'date_received','company']]
data = data.dropna()
data['product'] = data['le_product'].map(int)
print('{} complaints'.format(len(data)))
data = data[['complaint_what_happened','product','date_received','company']]
data.head()

158868 complaints


Unnamed: 0,complaint_what_happened,product,date_received,company
1,I have not been contacted about this negative ...,7,2017-03-13T00:00:00.000,"Alpine Credit, Inc"
2,I paid off another account a couple of years a...,7,2017-03-04T00:00:00.000,"Collection Bureau Services, Inc."
9,disputed incorrect information and accounts wi...,5,2017-03-21T00:00:00.000,EXPERIAN DELAWARE GP
20,Capital One reported my auto loan payments lat...,3,2017-03-04T00:00:00.000,CAPITAL ONE FINANCIAL CORPORATION
32,I contacted a representative of Key Bank about...,0,2017-03-04T00:00:00.000,KEYCORP


In [18]:
additional_stop_words = ['xxx','xxxx','00','xx','xx xx','000','00and','xxxxxxxx']
additional_stop_words = [unicode(i,'utf-8') for i in additional_stop_words]
stop = set(stopwords.words('english') + list(string.punctuation) + additional_stop_words)
stemmer = PorterStemmer()
re_punct = re.compile('[' + ''.join(string.punctuation) + ']')

In [19]:
def preprocess(text):
    try:
        text = text.lower()
        tokens = nltk.word_tokenize(text)
        tokens = [t for t in tokens if not t in stop]
        tokens = [re.sub(re_punct, '', t) for t in tokens]
        tokens = [re.sub(r'\d','',t) for t in tokens] # there are random digits in this dataset
        tokens = [re.sub(r'xx+','',t) for t in tokens] # there are also random xs all through it for anonymization
        tokens = [t for t in tokens if len(t) > 2]
        tokens = [stemmer.stem(t) for t in tokens]
        if len(tokens) == 0:
            return None
        else:
            return ' '.join(tokens)
    except:
        return None

In [20]:
print preprocess('Meag is going grocery shopping')

meag go groceri shop


In [38]:
re.sub(r'x+','','I have xxxxxxxxfunny')

'I have funny'

In [152]:
data['tokens'] = data['complaint_what_happened'].progress_map(preprocess)
print data.shape

progress-bar: 100%|██████████| 158690/158690 [15:48<00:00, 167.28it/s]


(158690, 4)


In [9]:
data = data[data['tokens'].notnull()]
print data.shape
data.reset_index(inplace=True)
data.drop('index', inplace=True, axis=1)
print('{} complaints'.format(len(data)))
data.head()

KeyError: 'tokens'

In [3]:
data = pd.read_csv('data/sent_data.csv')

In [4]:
data.head(2)

Unnamed: 0,complaint_what_happened,product,date_received,tokens,company
0,I paid off another account a couple of years a...,7,2017-03-04,paid anoth account coupl year ago colbrsrv dec...,
1,disputed incorrect information and accounts wi...,5,2017-03-21,disput incorrect inform account experian thitr...,


### Build a dictionary and construct a corpus

In [5]:
texts = [tokens.split() for tokens in data.tokens]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

In [29]:
data['complaint_what_happened'][139]

'XXXX filed a lawsuit against me and on XX/XX/XXXX levied my entire bank account without notifying me of any of this. This is from a debt that is over 7 years old I believe, and that I had believed to have been settled years ago. I had no idea this debt was still active and no idea any lawsuit had been filed. The docket number I found out after I called because my bank account was levied is XXXX.'

In [42]:
data['tokens'][139]

'file lawsuit levi entir bank account without notifi debt year old believ believ settl year ago idea debt still activ idea lawsuit file docket number found call bank account levi'

In [33]:
print texts[139]

['file', 'lawsuit', 'levi', 'entir', 'bank', 'account', 'without', 'notifi', 'debt', 'year', 'old', 'believ', 'believ', 'settl', 'year', 'ago', 'idea', 'debt', 'still', 'activ', 'idea', 'lawsuit', 'file', 'docket', 'number', 'found', 'call', 'bank', 'account', 'levi']


In [41]:
print id2word.doc2bow(texts[139]) # Equivalent to corpus

[(4, 1), (9, 1), (15, 2), (17, 2), (18, 2), (29, 2), (49, 1), (124, 1), (197, 2), (219, 2), (237, 1), (351, 1), (399, 1), (526, 1), (615, 1), (616, 1), (638, 1), (698, 2), (789, 2), (1828, 2), (1829, 1)]


In [37]:
print corpus[139]

[(9, 3.0), (46, 4.0), (54, 1.0), (60, 1.0), (65, 1.0), (67, 1.0), (146, 1.0), (147, 1.0), (155, 1.0), (161, 1.0), (165, 1.0), (174, 1.0), (197, 1.0), (204, 1.0), (209, 1.0), (219, 1.0), (323, 1.0), (337, 1.0), (345, 1.0), (351, 1.0), (354, 1.0), (469, 1.0), (470, 1.0), (753, 1.0), (792, 4.0), (824, 1.0), (1005, 1.0), (1064, 1.0), (1150, 1.0), (1327, 1.0), (1561, 1.0), (1825, 1.0), (1826, 1.0), (1827, 1.0)]


In [38]:
print(model[corpus[139]])

[(0, 0.02368134408469132), (13, 0.025867681075064632), (19, 0.023206996848775967), (56, 0.023408861416139029), (62, 0.072653116618223393), (73, 0.14114481895578182), (88, 0.024974081577051121), (121, 0.029500032941771521), (135, 0.32310772542644994), (162, 0.023476379478931442), (163, 0.068289683105249049), (193, 0.024664241467245739), (196, 0.025131393354620991), (227, 0.14894015527791477)]


In [36]:
query = data.loc[139,'complaint_what_happened']
query = preprocess(query).split()
new_id2word = corpora.Dictionary()
_ = new_id2word.merge_with(id2word) 
query = new_id2word.doc2bow(query)
topic_relevance = list(sorted(model[query], key=lambda x: -x[1]))
print topic_relevance

[(122, 0.33400000000004471), (224, 0.33399999999996427), (20, 0.16733333333336395)]


In [40]:
model.print_topic(224)

u'0.243*"practic" + 0.093*"decept" + 0.079*"believ" + 0.044*"use" + 0.041*"standard" + 0.035*"lead" + 0.032*"staff" + 0.032*"engag" + 0.030*"illeg" + 0.029*"intent"'

In [163]:
corpora.MmCorpus.serialize('complaints_full.mm', corpus)
id2word.save('complaints_full.dict')

### Latent Dirichlet Allocation (LDA)

The LDA implementation shamelessly taken from Evann Smith at Thrasher. Annotations mine.

docs = https://radimrehurek.com/gensim/models/ldamodel.html

In [164]:
def ldaCV(n_topics, corpus, id2word, test_size=0.2, sample=None):
    if not isinstance(n_topics, list):
        n_topics = [n_topics]
    obs = len(corpus)
    corpus = np.array(corpus) # draws from corpus created above
    
    if sample is not None:
        if sample < 1:
            sample_idx = random.sample(range(obs), int(obs*sample)) # inital pass assigns each sample to a random
        else:
            sample_idx = random.sample(range(obs), int(sample))
        corpus = corpus[sample_idx]
    
    train, test = cross_validation.train_test_split(corpus, test_size=test_size) # cross validated
    perplexities = []
    
    for n in n_topics:
        print('{} topics'.format(n))
        model = models.ldamodel.LdaModel(train, num_topics=n, id2word=id2word)
        perplexity = model.log_perplexity(test)       
        print(' - Perplexity: {}'.format(round(perplexity, 3)))
        perplexities.append((n, perplexity))
    
    return perplexities    

In [None]:
%time p = ldaCV(list(range(50,450,50)),corpus,id2word)

In [None]:
x, y = zip(*p)
plt.plot(x, y)
plt.scatter(x, y)
plt.show()

### Full Model

In [None]:
topicnum = min(y)

In [165]:
model = models.ldamodel.LdaModel(corpus, num_topics=250, id2word=id2word)

In [166]:
model.save('complaints_full_lda.model')

In [167]:
model.print_topics(num_topics = 10)

[(231,
  u'0.711*"home" + 0.061*"equiti" + 0.020*"line" + 0.019*"summon" + 0.016*"sell" + 0.016*"googl" + 0.015*"foreclos" + 0.010*"washington" + 0.008*"inspector" + 0.007*"mutual"'),
 (72,
  u'0.125*"illeg" + 0.120*"fals" + 0.100*"real" + 0.069*"estat" + 0.054*"case" + 0.029*"defend" + 0.026*"plaintiff" + 0.026*"dismiss" + 0.026*"enforc" + 0.022*"court"'),
 (19,
  u'0.189*"concern" + 0.117*"mistak" + 0.092*"best" + 0.087*"may" + 0.077*"honor" + 0.044*"abil" + 0.041*"fault" + 0.035*"suffici" + 0.032*"count" + 0.030*"definit"'),
 (238,
  u'0.213*"adjust" + 0.175*"california" + 0.149*"oper" + 0.089*"usual" + 0.050*"extort" + 0.047*"doubt" + 0.046*"construct" + 0.030*"afni" + 0.030*"demonstr" + 0.024*"builder"'),
 (168,
  u'0.207*"correspond" + 0.118*"determin" + 0.064*"feder" + 0.055*"attent" + 0.045*"suspici" + 0.043*"navi" + 0.042*"identif" + 0.033*"regard" + 0.028*"slip" + 0.024*"personnel"'),
 (180,
  u'0.077*"get" + 0.049*"tri" + 0.029*"time" + 0.026*"work" + 0.024*"help" + 0.020*"b

### Query the model

In [62]:
print data.loc[3332,'complaint_what_happened']

i filed a complaint # XXXX against capital one on XXXX/XXXX/17 re : paying XXXX, a scam artist who falsely claimed they would sell my timeshare after paying them {$2800.00} in title fees up front. They did n't sell my timeshare, as a matter of fact, once they got my credit card # they never called me again. Within capital ones dispute time frame, i sent them evidence that this outfit was a scam. In their response to my complaint they did n't acknowlege any of the facts i had backing my case, including a cease and desist order from XXXX XXXX, the attorney general of North Dakota, against premier timeshare solutions. An expose on XXXX tv identifying XXXX as crooks. An F rating on XXXX from the Better Business Burea. The fact that XXXX XXXX the owner of XXXX Is in jail now for this very crime. On top of that, the lawyer they sent after me, XXXX and XXXX backed off as soon as i sent them all this evidence. Capital one 's response to my complaint is, '' our research confirmed our actions ta

In [64]:
query = data.loc[3332,'complaint_what_happened']
query = preprocess(query).split()

In [65]:
new_id2word = corpora.Dictionary()
_ = new_id2word.merge_with(id2word) # this doesn't return us anything of use right now

In [66]:
query = new_id2word.doc2bow(query)
print(query)

[(890, 1), (1313, 1), (1446, 5), (2160, 1), (2440, 1), (2562, 2), (3791, 1), (6192, 1), (8311, 1), (10924, 1), (11045, 1), (11280, 1), (11562, 1), (12833, 1), (13348, 1), (13495, 1), (14498, 1), (15874, 5), (16960, 3), (17643, 3), (18142, 1), (18795, 1), (18798, 1), (19316, 1), (19663, 1), (19743, 1), (19835, 1), (20326, 3), (20730, 1), (20984, 2), (21614, 3), (21961, 1), (23162, 1), (23195, 1), (23309, 1), (24004, 1), (24134, 1), (24639, 1), (25895, 1), (27135, 1), (28004, 1), (28041, 2), (28865, 2), (29864, 1), (30323, 1), (30485, 1), (31811, 1), (32378, 1), (33512, 1), (34623, 1), (35262, 2), (37164, 1), (38270, 3), (38682, 1), (39312, 1), (40524, 1), (41167, 1), (41544, 1), (41975, 1), (42936, 3), (43330, 1), (44616, 1), (45264, 1), (45385, 1), (50613, 1), (52819, 1), (53559, 1), (53881, 1), (54004, 1), (54600, 3), (57191, 1), (58560, 1), (60661, 1), (61663, 3), (62009, 1), (62895, 1), (63494, 2), (64475, 1), (65420, 1), (66327, 1), (66519, 5), (68833, 1)]


In [67]:
topic_relevance = list(sorted(model[query], key=lambda x: -x[1]))
print('Most relevant: {}'.format(topic_relevance[0]))
print('Least relevant: {}'.format(topic_relevance[-1]))

Most relevant: (111, 0.38492307692309558)
Least relevant: (13, 0.077230769230760832)


In [68]:
model.print_topic(topic_relevance[0][0]) # most related

u'0.737*"person" + 0.043*"caller" + 0.032*"inform" + 0.017*"embarrass" + 0.017*"solicit" + 0.017*"aggress" + 0.014*"cheat" + 0.014*"look" + 0.013*"conclus" + 0.010*"call"'

In [69]:
model.print_topic(topic_relevance[-1][0]) # least related

u'0.084*"deal" + 0.066*"extrem" + 0.064*"frustrat" + 0.057*"better" + 0.028*"quickli" + 0.026*"sens" + 0.022*"poor" + 0.021*"experi" + 0.021*"time" + 0.018*"wast"'

In [70]:
topic_relevance

[(111, 0.38492307692309558),
 (242, 0.15415384615384026),
 (238, 0.077230769230785049),
 (169, 0.077230769230775181),
 (202, 0.077230769230768756),
 (159, 0.077230769230766189),
 (13, 0.077230769230760832)]

In [174]:
topic_relevance

NameError: name 'topic_relevance' is not defined

### Visualize the model

In [6]:
id2word = corpora.Dictionary.load('complaints_full.dict')
corpus = corpora.MmCorpus('complaints_full.mm')
model = models.ldamodel.LdaModel.load('complaints_full_lda.model')

In [177]:
complaints_vis = pyLDAvis.gensim.prepare(model, corpus, id2word)
pyLDAvis.display(complaints_vis)

In [178]:
pyLDAvis.save_html(complaints_vis, 'complaints_full_lda_graphic.html')

This is so fucking cool.

### Distribution of topics over documents

In [22]:
data['product'].value_counts()

7     38970
10    32107
5     30461
3     18350
0     14549
15    10312
2      9089
12     1699
9      1448
14     1408
11      275
17       16
Name: product, dtype: int64

In [24]:
id2word[1773]

u'personnel'

In [108]:
len(corpus[77898])

70

In [12]:
model[corpus[77898]]

[(15, 0.16052538700083965),
 (46, 0.022824705960543805),
 (48, 0.12319712598342604),
 (49, 0.023995979236658194),
 (58, 0.025486656706479482),
 (61, 0.023842034521884086),
 (72, 0.083589973601356435),
 (101, 0.023068543297684312),
 (129, 0.022971773444891732),
 (139, 0.023032413238025132),
 (164, 0.014089963186193717),
 (172, 0.11378467790553896),
 (179, 0.023109966718590697),
 (181, 0.051870235628727723),
 (193, 0.048522801250990517),
 (200, 0.077837014594648243),
 (207, 0.045700660724444252),
 (208, 0.023815495887752405),
 (246, 0.038978331069735271)]

In [26]:
corpus[17860]

[(8, 2.0),
 (22, 4.0),
 (107, 1.0),
 (125, 1.0),
 (181, 1.0),
 (190, 1.0),
 (192, 2.0),
 (207, 1.0),
 (213, 1.0),
 (246, 2.0),
 (260, 1.0),
 (357, 1.0),
 (459, 2.0),
 (530, 1.0),
 (692, 1.0),
 (794, 1.0),
 (886, 1.0),
 (1009, 1.0),
 (1835, 1.0),
 (2766, 1.0)]

In [134]:
for i in corpus[17860]:
    print id2word[i[0]]

score
credit
home
card
time
damag
inquiri
mortgag
mani
appli
multipl
someon
amount
make
creditor
caus
financ
much
difficult
decreas


In [189]:
# data.to_csv('data/sent_data.csv',encoding='utf-8',header=True,index=False)

In [15]:
# all_topics.to_csv('data/all_topics.csv',encoding='utf-8',header=False)

### Doc2Vec 

In [16]:
data = pd.read_csv('data/sent_data.csv')
data['date_received'] = pd.to_datetime(data['date_received'])

In [17]:
sent_data = data.copy()
data = pd.read_csv('data/sent_data.csv')
# sent_data.drop('level_0',inplace=True,axis=1)
# sent_data.drop('index',inplace=True,axis=1)
sent_data.head()


Unnamed: 0,complaint_what_happened,product,date_received,tokens,company
0,I paid off another account a couple of years a...,7,2017-03-04,paid anoth account coupl year ago colbrsrv dec...,
1,disputed incorrect information and accounts wi...,5,2017-03-21,disput incorrect inform account experian thitr...,
2,Capital One reported my auto loan payments lat...,3,2017-03-04,capit one report auto loan payment late sever ...,"Collection Bureau Services, Inc."
3,I contacted a representative of Key Bank about...,0,2017-03-04,contact repres key bank mortgag first thing to...,
4,I bought a money order from XXXX XXXX to pay a...,11,2017-03-31,bought money order pay bill found payment cred...,


In [24]:
# Prep data; doing this without a predict value for the time being, since I'll need to add this in later
sent_data = data.copy()
# sent_data.drop('level_0',inplace=True,axis=1)  
# sent_data.loc[sent_data['overall'] < 3, 'overall'] = 0
# sent_data.loc[sent_data['overall'] > 3, 'overall'] = 1
y = []
doc_vectors = []
for i, row in sent_data.iterrows():
    doc_vectors.append(TaggedDocument(row['tokens'].split(), ['doc_' + str(i)]))
    y.append(row['product'])
print(len(y), len(doc_vectors))

(158684, 158684)


In [6]:
def shuffle_docs(docs):
    random.shuffle(docs)
    return docs

In [194]:
model = Doc2Vec(size=100, window=10, min_count=1, workers=4)
model.build_vocab(doc_vectors)
for epoch in range(40):
    print('Epoch {}'.format(epoch)),
    model.train(shuffle_docs(doc_vectors))
d2v = {d: vec for d, vec in zip(model.docvecs.offset2doctag, model.docvecs.doctag_syn0)}
X = []
for d in range(len(doc_vectors)):
    X.append(d2v['doc_' + str(d)])
X = np.array(X)

Epoch 0 Epoch 1 Epoch 2 Epoch 3 Epoch 4 Epoch 5 Epoch 6 Epoch 7 Epoch 8 Epoch 9 Epoch 10 Epoch 11 Epoch 12 Epoch 13 Epoch 14 Epoch 15 Epoch 16 Epoch 17 Epoch 18 Epoch 19 Epoch 20 Epoch 21 Epoch 22 Epoch 23 Epoch 24 Epoch 25 Epoch 26 Epoch 27 Epoch 28 Epoch 29 Epoch 30 Epoch 31 Epoch 32 Epoch 33 Epoch 34 Epoch 35 Epoch 36 Epoch 37 Epoch 38 Epoch 39


In [195]:
model.save('complaints_full_doc2vec.model')

In [43]:
model = Doc2Vec.load('complaints_full_doc2vec.model')

In [26]:
d2v = {d: vec for d, vec in zip(model.docvecs.offset2doctag, model.docvecs.doctag_syn0)}
X = []
for d in range(len(doc_vectors)):
    X.append(d2v['doc_' + str(d)])
X = np.array(X)

In [27]:
model.most_similar('titl')

[(u'mortgag', 0.5278648138046265),
 (u'readvertis', 0.46978434920310974),
 (u'mortag', 0.466302752494812),
 (u'lean', 0.4471130073070526),
 (u'sitespecif', 0.43911993503570557),
 (u'endors', 0.43268442153930664),
 (u'employeeto', 0.43175238370895386),
 (u'scamembarass', 0.4251365065574646),
 (u'insur', 0.42509591579437256),
 (u'wire', 0.4224180579185486)]

In [10]:
model.most_similar('mortgag')

[(u'mortag', 0.7029309272766113),
 (u'loan', 0.699162483215332),
 (u'morgag', 0.6175943613052368),
 (u'mtg', 0.5883992910385132),
 (u'heloc', 0.5447637438774109),
 (u'titl', 0.5278648138046265),
 (u'dba', 0.47409069538116455),
 (u'mortagag', 0.4625827670097351),
 (u'financ', 0.45193812251091003),
 (u'aka', 0.4502801299095154)]

In [200]:
model.most_similar('fee')

[(u'feethank', 0.556215763092041),
 (u'feessurcharg', 0.5554369688034058),
 (u'penalti', 0.5338647961616516),
 (u'expens', 0.4798528254032135),
 (u'carecreditgecrb', 0.4658757448196411),
 (u'cost', 0.4657619893550873),
 (u'feesexpens', 0.4608253240585327),
 (u'pape', 0.45263051986694336),
 (u'prioir', 0.45236194133758545),
 (u'specialistthat', 0.4386531412601471)]

In [201]:
model.most_similar(positive=['woman','husband'],negative=['man'])

[(u'wife', 0.7806158661842346),
 (u'father', 0.7096401453018188),
 (u'mother', 0.670414388179779),
 (u'mom', 0.6671707034111023),
 (u'son', 0.6444637775421143),
 (u'spous', 0.609491229057312),
 (u'exhusband', 0.6069841384887695),
 (u'daughter', 0.5768710374832153),
 (u'parent', 0.5752948522567749),
 (u'dad', 0.5628846287727356)]

In [51]:
model.most_similar(positive=['bank'],negative=['mortgage'])

[(u'atm', 0.40179917216300964),
 (u'branch', 0.3923277258872986),
 (u'express', 0.38614311814308167),
 (u'billview', 0.3822770118713379),
 (u'store', 0.38126140832901),
 (u'kolh', 0.3750460743904114),
 (u'financialinc', 0.3747432827949524),
 (u'backreward', 0.37255075573921204),
 (u'vendor', 0.36216551065444946),
 (u'taper', 0.35721737146377563)]

In [49]:
model.most_similar(positive=['compani'],negative=['loan'])

[(u'agencieson', 0.47409629821777344),
 (u'vice', 0.4695762097835541),
 (u'agenc', 0.46380293369293213),
 (u'usbank', 0.4628336727619171),
 (u'collector', 0.43785393238067627),
 (u'friendssupervisor', 0.43704307079315186),
 (u'freedom', 0.4364060163497925),
 (u'offic', 0.42728549242019653),
 (u'carrier', 0.4197801351547241),
 (u'vanderbilt', 0.41828247904777527)]

In [12]:
print texts[55582]

NameError: name 'texts' is not defined

In [11]:
doc_vectors[23]

TaggedDocument(words=['current', 'serv', 'appli', 'barclay', 'bank', 'luxuri', 'gold', 'card', 'time', 'barclay', 'great', 'reput', 'program', 'offer', 'militari', 'member', 'went', 'well', 'beyond', 'scra', 'standard', 'offer', 'apr', 'part', 'scra', 'agreement', 'also', 'waiv', 'transact', 'fee', 'well', 'annual', 'fee', 'militari', 'durat', 'servic', 'explicit', 'reason', 'appli', 'card', 'past', 'barclay', 'decid', 'chang', 'term', 'condit', 'offer', 'new', 'militari', 'member', 'state', 'tri', 'bring', 'selv', 'line', 'everyon', 'els', 'term', 'scra', 'everi', 'card', 'servic', 'repres', 'talk', 'state', 'previou', 'card', 'holder', 'would', 'affect', 'new', 'account', 'holder', 'month', 'card', 'receiv', 'notic', 'mail', 'state', 'could', 'longer', 'honor', 'origin', 'term', 'would', 'begin', 'access', 'account', 'annual', 'fee', 'understand', 'barclay', 'bank', 'went', 'well', 'beyond', 'creditor', 'regard', 'scra', 'howev', 'honor', 'origin', 'term', 'agreement', 'current', 'ca

### Train and test a classifier

In [41]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3)

In [20]:
classifier = svm.LinearSVC()
classifier.fit(x_train, y_train)
print('Accuracy: {}'.format(round(classifier.score(x_test, y_test), 3)))

Accuracy: 0.205


In [None]:
from sklearn.svm import SVC
clf = svm.SVC(kernel='rbf',C=10)
clf.fit(x_train,y_train)
print('Accuracy: {}'.format(round(clf.score(x_test, y_test), 3)))

### Test the SVM classification
Sanity check it

0 Bank account or service
2 Consumer Loan
3 Credit card
5 Credit reporting
7 Debt collection
9 Money transfers
10 Mortgage
11 Other financial service
12 Payday loan
14 Prepaid card
15 Student loan
17 Virtual currency

In [209]:
model.infer_vector(preprocess('I tried to reach the phone company to cancel my bill but was directed to customer service'))

array([ 0.17907736, -0.18046661, -0.09831718,  0.0829933 ,  0.37157869,
       -0.63550055, -0.0283353 , -0.1866217 ,  0.14354159, -0.21643676,
        0.2708956 , -0.1409138 ,  0.43431783, -0.04355631,  0.09619011,
        0.06040679, -0.19532369,  0.31307355,  0.19544083, -0.22759935,
       -0.26166505, -0.12196559,  0.16174534, -0.38839123,  0.37992996,
       -0.02993813, -0.26422694, -0.28137329,  0.2439533 ,  0.13481113,
        0.04330436,  0.03381678, -0.18146871,  0.17992218,  0.01423303,
       -0.27879339,  0.2483215 ,  0.13126862,  0.32891789, -0.3172676 ,
       -0.00737016, -0.26359296,  0.11225697, -0.17602052, -0.16413976,
        0.19309391, -0.16677693, -0.05901907, -0.12515211, -0.21373965,
        0.17389262,  0.06960808,  0.11227234, -0.38014016,  0.11621318,
       -0.4175688 ,  0.39145246,  0.20752962, -0.12521632, -0.16266043,
       -0.24660715, -0.37105349,  0.13669388,  0.67263263, -0.16480532,
       -0.15671536,  0.37880298,  0.17520456, -0.52742356, -0.26

In [44]:
preprocess("I tried to reach the phone company")

u'tri reach phone compani'

In [29]:
classifier.predict(model.infer_vector(preprocess('I tried to reach the phone company to cancel my bill but was directed to customer service')))

TypeError: can only join an iterable

In [21]:
def test_svm(text):
    test_y = classifier.predict(model.infer_vector(preprocess(text)))
    print test_y, le.inverse_transform(sum(test_y))

In [264]:
def test_rbf_svm(text):
    test_y = clf.predict(model.infer_vector(preprocess(text)))
    print test_y, le.inverse_transform(sum(test_y))

In [24]:
def test_stories(num):
    complaint = data.loc[num,'complaint_what_happened']
    actual = data.loc[num,'product']
    lin_svm = test_svm(complaint)
#     rbf_svm = test_rbf_svm(complaint)
    print '------'
    print "Actual:", actual, le.inverse_transform(actual)
    print "Linear Predicted:", test_svm(complaint)
#     print "RBF Predicted:",test_rbf_svm(complaint)

In [27]:
test_stories(100)

TypeError: can only join an iterable

In [233]:
test_svm(data.loc[42,'complaint_what_happened'])



u'Credit reporting'

In [228]:
test_text = 'transunion sold me a credit reporting software that I thought would alert me that there was a bad threat to my credit, but it never worked.'
test_svm(test_text)



u'Debt collection'

In [236]:
le.inverse_transform(7)

u'Debt collection'

In [210]:
clf

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [30]:
from sklearn.linear_model import LogisticRegression
Y = y
x_train, x_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3)
clf = LogisticRegression()
clf.fit(x_train,y_train)
print('Accuracy: {}'.format(round(clf.score(x_test, y_test), 3)))


Accuracy: 0.243


In [31]:
y = [1 if i==7 else 0 for i in Y]
x_train, x_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3)
clf = LogisticRegression()
clf.fit(x_train,y_train)
print('Accuracy: {}'.format(round(clf.score(x_test, y_test), 3)))

Accuracy: 0.755


In [71]:
clf.predict_proba(X)

array([[ 0.75805449,  0.24194551],
       [ 0.75539218,  0.24460782],
       [ 0.74851319,  0.25148681],
       ..., 
       [ 0.75086622,  0.24913378],
       [ 0.76737174,  0.23262826],
       [ 0.72992227,  0.27007773]])

In [72]:
for i in prod_7_pred:
    if i !=0:
        print i

In [73]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(x_train,y_train)
print('Accuracy: {}'.format(round(clf.score(x_test, y_test), 3)))

Accuracy: 0.703


In [136]:
a = clf.predict(X)

KeyboardInterrupt: 

In [None]:
sum(a)

In [None]:
# Load pickled multi-class linear svm
with open('../cfpb_actions/gcloud_model/mult_lin_svm.pkl', 'rb') as f:
    classifier = pickle.load(f)

In [None]:
print 1