In [93]:
#for semantic data. uses readability nltk

In [24]:
import json, pickle
import numpy as np
from sklearn.feature_extraction import text
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
import nltk
#import raw_data_utils as utils
#import extract_features as extract
#import clean_data, qm_clean
import readability

In [25]:
with open('results_delivery_test.json') as f:
    tokens_mini = json.load(f)
with open('stopwords.txt') as f:
    stopwords = set(f.read().split())
    
reducedDim = 5

In [26]:
def get_doc_name_iterator(doc_token_collection, by_sentence=False, indiv_tokens = False, stemmed=False,
                     stopwords=None, filter_fn = lambda x: True, get_key = False, skip_form_letters=False, 
                     condense_form_letters=False):
    form_letter_appearance=set()
    for item in doc_token_collection:
        k = item['isFormLetter']
        if skip_form_letters:
            if k[0:3]!="nfl":
                continue
        if condense_form_letters:
            if k[0:3]!="nfl": 
                if k[0:3] not in form_letter_appearance:
                    form_letter_appearance.add(k[0:3])
                else:
                    continue
        yield item
        

In [27]:
from __future__ import division
with open('hedges_hyland2005.txt') as f:
    hedges = f.read().split('\n')
modals = [['may', 'might'], ['can', 'cant', 'could', 'couldnt'], ['would', 'wouldnt'], ['should', 'shouldnt'], ['will', 'wont'],
          ['must'], ['wish', 'wished'], ['need', 'needed'], ['want', 'wanted']]
self_singular = ['i', 'ive', 'im', 'me', 'my', 'mine', 'myself']
self_plural = ['we', 'weve', 'were', 'our', 'ours', 'ourselves', 'us']
other_singular = ['you', 'your', 'youre']
other_plural = ['they', 'theyre', 'their']
negations =['no', 'not', 'none', 'nobody', 'nothing', 'neither', 'nowhere', 'never', 'doesnt', 'isnt', 'wasnt', 'shouldnt', 'wouldnt', 'couldnt', 'wont', 'cant', 'dont']

'''ADD: 
ALL THE OTHER FEATURES FROM THE LIST LOL
negation
average length
 total length
 contains keyword
 readability ease
 swearwords corpus
 if form letter
 sentiment words, 
 capitalization
 punctuation
 POS tags?
##ratio of xth pronoun over all pronouns.

percent of sentences with 1st singular, 1st plural, 2nd singular pronouns 
average number of words per sentence, along with number of sentences
percent of sentences with negation, with imperatives, with must, should, will, may, can etc. that identify various attitudes
Flesch-Kincaid Reading Ease score, and frequency of swear words (fairly powerful in identifying legal terminology)
Mention of critical people and organizations – while these were hard coded, it should be possible to extract form the NER tags.

'''


def word_freq_sentence(word, sentence):
    return sentence.count(word) / total_words(sentence)

def word_freq_comment(word, comment):
    count = 0
    total = 0
    for sentence in comment:
        count += sentence.count(word)
        total += total_words(sentence)
    return count / total
    
def num_words(sentence):
    return sentence.count(' ') + 1

def pronoun_freq_self(comment):
    singular = 0
    plural = 0
    total_words = 0
    for sentence in comment:
        words = sentence.split()
        total_words += num_words(sentence)
        for word in words:
            if word in self_singular:
                singular += 1
            if word in self_plural:
                plural += 1
    total = singular + plural
    if total == 0:
        return [0, 0]
    return [(singular - plural) / total, total / total_words]

def pronoun_freq_others(comment):
    singular = 0
    plural = 0
    total_words = 0
    for sentence in comment:
        total_words += num_words(sentence)
        words = sentence.split()
        for word in words:
            if word in other_singular:
                singular += 1
            if word in other_plural:
                plural += 1
    total = singular + plural
    if total == 0:
        return [0, 0]
    return [(singular - plural) / total, total / total_words]
    
def hedge_freq(comment):
    count = 0
    total = 0
    hedge_counts = [0] * len(hedges)
    for sentence in comment:
        words = sentence.split()
        total += num_words(sentence)
        for word in words:
            if word in hedges:
                count += 1
                hedge_counts[hedges.index(word)] += 1
    return [count / total, hedge_counts]

def negation_freq(comment):
    count = 0
    total = 0
    neg_counts = [0] * len(negations)
    for sentence in comment:
        words = sentence.split()
        total += num_words(sentence)
        for word in words:
            if word in negations:
                count += 1
                neg_counts[negations.index(word)] += 1
    return [count / total, neg_counts]

def modal_freq(comment):
    modal_counts = [0] * len(modals)
    total = 0
    count_tot = 0
    for sentence in comment:
        words = sentence.split()
        total += num_words(sentence)
        for word in words:
            for i, modal_type in enumerate(modals):
                if word in modal_type:
                    count_tot += 1
                    modal_counts[i] += 1
    for i, count in enumerate(modal_counts):
        modal_counts[i] = count/total
    return [count_tot/total, modal_counts]
    

In [119]:
titles="""Product,Sub-product,Issue,Sub-issue,Company public response,Company,State,ZIP code,Tags,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?"""
titles = titles.split(",")

#title preprocessing:
#for each title, count every single possible response and numerize it. 

#struct title to list of titles
one_hot_lookup={}
one_hot_dims={}
dims_sum=0
names = []
for title in titles:
    answers=[]
    for doc in get_doc_name_iterator(tokens_mini, stemmed=False, condense_form_letters=True):
        res = doc[title]
        if res not in answers:
            answers.append(res)
            names.append((title, res))
    one_hot_lookup[title]=dict(zip(answers,range(len(answers))))
    one_hot_dims[title]=len(answers)
    dims_sum+=len(answers)

    

In [120]:
feature_d_no_topics = {}
name_array=[""]* dims_sum
for k in get_doc_name_iterator(tokens_mini, stemmed=False, condense_form_letters=True):
    v = k['narrative']
    features=[]
# deal with linguistic features later.
#     features = pronoun_freq_self(v)
#     features += pronoun_freq_others(v)
#     features.append(hedge_freq(v)[0])
#     features.append(negation_freq(v)[0])
#     features.append(modal_freq(v)[0])
    dims_so_far = 0
    feature_vec = [0] * dims_sum
    for t in titles:
        features.append(one_hot_lookup[t][k[t]]) 
        feature_vec[dims_so_far+one_hot_lookup[t][k[t]]]=1
        name_array[dims_so_far+one_hot_lookup[t][k[t]]]=t+" = "+k[t]
        dims_so_far+=one_hot_dims[t]
     
    #features.append(fl_keys.index(k[k.rfind('#') + 1:]))
    #flesch kincaid?
    #feature_d_no_topics[k] = features
    i += 1
    feature_d_no_topics[k['complaintID']]=feature_vec

In [121]:
feat_matrix=[]
for k in feature_d_no_topics:
    feat_matrix.append(feature_d_no_topics[k])
    
tsvd = TruncatedSVD(reducedDim)
tsvd.fit(feat_matrix)
V_tf = tsvd.components_
U_tf = tsvd.transform(feat_matrix)
print V_tf.shape
print U_tf.shape

(5, 687)
(497, 5)


In [130]:
topic_words={}
for i in range(reducedDim):
    row = tsvd_tfidf.components_[i,:]
    argsorted = np.argsort(row)[::-1][:15]
    scores = row[argsorted]
    top_words = [name_array[i] for i in argsorted]
    topic_words[i]=' , '.join(top_words)
for i in topic_words:
    print i
    print topic_words[i]

18
Sub-issue =  , Product = Mortgage , Sub-product = Conventional fixed mortgage , Issue = Loan servicing, payments, escrow account , Consumer disputed? = Yes , Issue = Loan modification,collection,foreclosure , Company = Bank of America , Product = Credit card , Sub-product = Conventional adjustable mortgage (ARM) , Company response to consumer = Closed with explanation , Product = Bank account or service , Tags = Older American , Product = Consumer Loan , Company = Ocwen , Sub-product = FHA mortgage
3
Submitted via = Web , Timely response? = Yes , Tags =  , Company response to consumer = Closed with explanation , Consumer disputed? = No , Company public response =  , Sub-issue =  , Sub-product =  , Consumer disputed? = Yes , Product = Debt collection , Product = Mortgage , Company public response = Company chooses not to provide a public response , Product = Credit reporting , State = CA , Product = Credit card
52
Sub-product =  , Company public response = Company chooses not to prov

In [135]:
ref_docs = list(get_doc_name_iterator(tokens_mini, indiv_tokens=False, stemmed=False, condense_form_letters=True))

for i, row in enumerate(U_tf.T):
    argsort = np.argsort(row)[::-1]
    topic_word_sorted = np.argsort(V_tf[i,:])[::-1]
    print i
    print 'Topic words: '
    print ' , '.join([name_array[i] for i in topic_word_sorted[:20]])
#     print 'Example 1: ' #for every topic, the highest ranking document.
#     print ref_docs[argsort[0]]
#     print 'Example 2: '
#     print ref_docs[argsort[1]]
#     print ' '

0
Topic words: 
Submitted via = Web , Timely response? = Yes , Tags =  , Company response to consumer = Closed with explanation , Consumer disputed? = No , Company public response =  , Sub-issue =  , Sub-product =  , Consumer disputed? = Yes , Product = Debt collection , Product = Mortgage , Company public response = Company chooses not to provide a public response , Product = Credit reporting , State = CA , Product = Credit card , Sub-product = Conventional fixed mortgage , Issue = Loan servicing, payments, escrow account , Company response to consumer = Closed with non-monetary relief , Issue = Cont'd attempts collect debt not owed , State = FL
1
Topic words: 
Sub-issue =  , Product = Mortgage , Sub-product = Conventional fixed mortgage , Issue = Loan servicing, payments, escrow account , Consumer disputed? = Yes , Issue = Loan modification,collection,foreclosure , Company = Bank of America , Product = Credit card , Sub-product = Conventional adjustable mortgage (ARM) , Company respo

In [138]:
def tsne_viz(
        mat,
        rownames,
        colors=None,
        output_filename=None,
        figheight=40,
        figwidth=50):     
    """2d plot of `mat` using t-SNE, with the points labeled by `rownames`, 
    aligned with `colors` (defaults to all black).
    
    Parameters
    ----------    
    mat : 2d np.array
        The matrix to visualize.
        
    rownames : list of str
        Names of the points to visualize.
                
    colors : list of colornames or None (default: None)
        Optional list of colors for rownames. The color names just need to 
        be interpretable by matplotlib. If they are supplied, they need to 
        have the same length as rownames, or indices if that is not None. 
        If `colors=None`, then all the words are displayed in black.
      
    output_filename : str (default: None)
        If not None, then the output image is written to this location. The 
        filename suffix determines the image type. If None, then 
        `plt.plot()` is called, with the behavior determined by the 
        environment.
        
    figheight : int (default: 40)
        Height in display units of the output.
            
    figwidth : int (default: 50)
        Width in display units of the output.
        
    """
    indices = list(range(len(rownames)))
    # Colors:
    if not colors:
        colors = ['black' for i in indices]    
    # Recommended reduction via PCA or similar:
    n_components = 50 if mat.shape[1] >= 50 else mat.shape[1]
    dimreduce = PCA(n_components=n_components)
    mat = dimreduce.fit_transform(mat)
    
    # t-SNE:
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)    
    tsnemat = tsne.fit_transform(mat) 
   
    # Plot values:
    vocab = np.array(rownames)[indices]
    xvals = tsnemat[indices, 0] 
    yvals = tsnemat[indices, 1]
    # Plotting:
    fig, ax = plt.subplots(nrows=1, ncols=1)
#     fig.set_figheight(40)
#     fig.set_figwidth(50)
    ax.plot(xvals, yvals, marker='', linestyle='')
    # Text labels:
    for word, x, y, color in zip(vocab, xvals, yvals, colors):
        ax.annotate(word, (x, y), fontsize=8, color=color)
    # Output:
    if output_filename:
        plt.savefig(output_filename, bbox_inches='tight')
    else:
        plt.show()
        
    

def plot_embedding(X, doc_to_topic_matrix, title=None):
    #try tsne on clustering
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    plt.figure()
    ax = plt.subplot(111)
    labels =  np.argmax(doc_to_topic_matrix, axis=1)
    for i in range(X.shape[0]):
        
        plt.text(X[i, 0], X[i, 1], labels[i],
                 fontdict={'weight': 'bold', 'size': 9})

    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)

In [None]:
for row in U_tf:
    row[0]=-100 #just found that the 0th dimension is garbage - everything falls under it. 
print U_tf
rownames = np.argmax(U_tf, axis=1)#zip(np.argmax(U_tf, axis=1),[item[0] for item in ref_docs])
dists = pdist(feat_matrix,'cosine') # or U_tf toarray? no
tsne_viz(squareform(dists), rownames)