In [1]:
# clean these up
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#import re

from __future__ import print_function

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix
from fuzzywuzzy import fuzz
import pandas as pd
from pyemd import emd
import tensorflow as tf

In [2]:
#http://stackoverflow.com/questions/20154303/pandas-read-csv-expects-wrong-number-of-columns-with-ragged-csv-file

def get_fuzzy(data_path):
    """
    Compute the four fuzzywuzzy scores for each pair of strings in a CSV file
    and save them as fuzzy.csv in the same directory as that file.

    On every input line, comma-separated fields 1 and 2 are the two strings
    that are compared (field 0 is not used here).

    Args:
        data_path: path of the input CSV file.

    Returns:
        None. The scores are written to fuzzy.csv with the header
        ``ratio,partial,sort,set`` and one row per input line.
    """
    # The file is ragged (lines can contain different numbers of commas), so
    # read each line as one field using a separator that is not expected in
    # the data, then split on commas. Without a header the single column is
    # labelled 0.
    raw_lines = pd.read_csv(data_path,sep='^',header=None)
    fields = raw_lines[0].str.split(',',expand=True)

    scorers = [('ratio',fuzz.ratio),
               ('partial',fuzz.partial_ratio),
               ('sort',fuzz.token_sort_ratio),
               ('set',fuzz.token_set_ratio)]

    # Build all rows first and create the frame once; assigning through
    # df[col][row] is chained indexing, which pandas does not guarantee to
    # write back into the frame.
    rows = [[scorer(first,second) for _, scorer in scorers]
            for first, second in zip(fields[1],fields[2])]
    df = pd.DataFrame(rows,columns=[name for name, _ in scorers])

    # dirname is '' for a bare file name; fall back to the current directory
    # instead of writing to the filesystem root.
    out_dir = os.path.dirname(data_path) or '.'
    df.to_csv(os.path.join(out_dir,'fuzzy.csv'),index=False)

In [3]:
# http://vene.ro/blog/word-movers-distance-in-python.html

def get_dist(s_1,s_2,word_embeds,vocab_d):
    """Word Mover's Distance between the strings s_1 and s_2.

    word_embeds holds the word embeddings and vocab_d is the dictionary that
    maps each word to its row index in word_embeds."""

    pair = [s_1, s_2]

    # vocabulary made of the words of these two strings only
    # (default CountVectorizer, no stop-word list)
    vectorizer = CountVectorizer().fit(pair)

    # per-string word counts, scaled to sum to one ('flow' vectors)
    counts_1, counts_2 = vectorizer.transform(pair)
    flow_1 = counts_1.toarray().ravel().astype(np.float64)
    flow_2 = counts_2.toarray().ravel().astype(np.float64)
    flow_1 = flow_1 / flow_1.sum()
    flow_2 = flow_2 / flow_2.sum()

    # pairwise euclidean distances between the embeddings of the vocabulary
    # words, scaled by the largest distance
    rows = [vocab_d[word] for word in vectorizer.get_feature_names()]
    pairwise = euclidean_distances(word_embeds[rows]).astype(np.float64)
    pairwise = pairwise / pairwise.max()

    return emd(flow_1,flow_2,pairwise)

In [4]:
def data_generator(data_path,word_embeds,index_dict,fuzzy_path,
                   distance_func=get_dist,shuffle=True,seed=42):
    """Build the feature matrix and one-hot labels for a file of string pairs.

    Every kept observation has 7 features: the score returned by
    distance_func, the four fuzzywuzzy scores from fuzzy_path (simple ratio,
    partial ratio, token sort ratio, token set ratio) and the character
    length of each of the two strings.

    Observations are dropped when distance_func raises ValueError or KeyError
    for the pair, or when its score is not >= 0 (which includes nan).

    Args:
        data_path: CSV file; column 0 is the label (1 = minor, 2 = major),
            columns 1 and 2 are the two strings.
        word_embeds: word embeddings, passed through to distance_func.
        index_dict: word -> embedding row mapping, passed through to
            distance_func.
        fuzzy_path: CSV file with one header line followed by one row of
            four scores per row of data_path.
        distance_func: callable(str_1, str_2, word_embeds, index_dict)
            returning one number. The default is bound when this function
            is defined, so a get_dist defined afterwards is not picked up.
        shuffle: shuffle the kept observations (default True because the
            data are sorted).
        seed: seed for numpy's global random generator, used when shuffling.

    Returns:
        X: array of shape (n_kept, 7).
        Y: float array of shape (n_kept, 2), one-hot: [1, 0] for label 1
            (minor), [0, 1] for label 2 (major).
        indices: list of positions that was used to order X and Y.
        X_in: array of shape (n_rows, 2) with all string pairs as read from
            data_path (including the ones that were dropped).

    Raises:
        ValueError: if a kept label is neither 1 nor 2.
    """
    # genfromtxt squeezes its result, so a one-row file comes back 1-D;
    # force one row per pair so the code below always sees (str_1, str_2).
    X_in = np.genfromtxt(data_path,
                  delimiter=',',usecols=(1,2),dtype=str).reshape((-1,2))
    Y_in = np.genfromtxt(data_path,
                  delimiter=',',usecols=(0)).reshape((-1,1))
    # fuzzy_file columns are: simple ratio, partial ratio,
    # token sort ratio, and token set ratio
    fuzzy_file = np.genfromtxt(fuzzy_path,
              delimiter=',',dtype=float,skip_header=1)
    if fuzzy_file.ndim == 1:
        fuzzy_file = fuzzy_file.reshape((1,-1))

    features = []
    labels = []
    for i, (str_1, str_2) in enumerate(X_in):
        try:
            score = distance_func(str_1,str_2,word_embeds,index_dict)
            # nan compares False with everything, so this also drops the
            # pairs whose distance could not be computed
            if score >= 0:
                features.append([score,
                                 # additional features from fuzzywuzzy
                                 fuzzy_file[i][0],fuzzy_file[i][1],
                                 fuzzy_file[i][2],fuzzy_file[i][3],
                                 # string lengths as features
                                 len(str_1),len(str_2)])
                labels.append(Y_in[i])
        except (ValueError, KeyError):
            # the pair could not be scored; leave it out
            continue
    X = np.asarray(features).reshape((-1,7))
    Y = np.asarray(labels).reshape((-1,1))

    # a real list, so that it can be shuffled in place on Python 3 as well
    indices = list(range(X.shape[0]))
    # randomly shuffle the data
    if shuffle:
        np.random.seed(seed)
        np.random.shuffle(indices)
        X = X[indices]
        Y = Y[indices]

    # transform Y from either 1 or 2 to a one-hot vector
    # indicating the index: 0 for minor, 1 for major
    one_hot = np.zeros((Y.shape[0],2))
    for i, label in enumerate(Y[:,0]):
        if label == 1:
            one_hot[i,0] = 1
        elif label == 2:
            one_hot[i,1] = 1
        else:
            raise ValueError("Y label must be either 1 (minor) or 2 (major). "
                             "Problem at index {}".format(i))

    return X,one_hot,indices,X_in


In [9]:
# Shape of the embedding matrix: (number of words, vector length).
vocab_size, embedding_dim = 3000000, 300

# word2vec files: the original binary, the memory-mapped copy of the vectors
# and the matching word list.
binary_file = "/Users/Rutherford/Desktop/data/GoogleNews-vectors-negative300.bin"
w2v_dat = "/Users/Rutherford/Desktop/data/embed.dat"
w2v_vocab = "/Users/Rutherford/Desktop/data/embed.vocab"


In [10]:
# Map the cached embedding matrix read-only rather than loading it in memory.
embeddings = np.memmap(w2v_dat,
                       mode="r",
                       dtype=np.float64,
                       shape=(vocab_size, embedding_dim))

# One word per line; a word's position in the file is used as its row index
# when looking it up in `embeddings`.
with open(w2v_vocab) as f:
    vocab_list = map(str.strip, f.readlines())
vocab_dict = {w: i for i, w in enumerate(vocab_list)}

assert len(embeddings) == vocab_size

In [6]:
# Score every string pair in test.csv; get_fuzzy writes the result to
# fuzzy.csv in the same directory as the input file.
get_fuzzy('/users/Rutherford/desktop/test.csv')

In [13]:
# Build the model inputs for test.csv: x = feature matrix, y = one-hot labels,
# ind = ordering applied to x and y, raw = the string pairs read from the file.
x,y,ind,raw = data_generator('/users/Rutherford/desktop/test.csv',embeddings,vocab_dict,'/users/Rutherford/desktop/fuzzy.csv')

In [14]:
# Build the classifier in its own graph `g`: a feed-forward network
# 7 -> 64 -> 128 -> 16 -> 2 with batch normalisation, ReLU and dropout after
# each of the first three matrix multiplications.
tf.reset_default_graph() 
g = tf.Graph() 
with g.as_default():
    
    # 7 inputs (WMD, 4 FuzzyWuzzy calculations, length of each string)
    # 2 outputs (one-hot vector of index of prediction)
    X = tf.placeholder(tf.float32, shape=[None, 7])
    Y = tf.placeholder(tf.float32, shape=[None, 2])
    
    # step counter handed to the optimizer's minimize() below
    glob_step = tf.Variable(0,dtype=tf.float32,trainable=False)
    # dropout keep probability, supplied through feed_dict at run time
    keep_prob = tf.placeholder(tf.float32)
    # (disabled) exponentially decaying learning rate
    #lr = tf.train.exponential_decay(learning_rate=.1, 
    #                               global_step=glob_step, 
    #                               decay_steps=100, 
    #                               decay_rate=0.96, 
    #                               staircase=True)
    
    
    # [rows, columns] of the four weight matrices
    weight_shape1 = [7,64]
    weight_shape2 = [64,128]
    weight_shape3 = [128,16]
    weight_shape4 = [16,2]
    

    # layer sizes used in the initialisation ranges below:
    # 7, 64, 128, 16 and 2 respectively
    [n_inputs1,n_outputs1,n_inputs3,n_outputs3,n_outputs_final] = \
        weight_shape1[0],weight_shape1[1],weight_shape3[0], \
        weight_shape3[1],weight_shape4[1]
    
    # Weights initialized a la Glorot & Bengio paper
    # but with batch normalization this may be irrelevant
    # (uniform in [-r, r] with r = sqrt(6 / (inputs + outputs)) of the layer)
    init_range1 = tf.sqrt(6.0/(n_inputs1+n_outputs1))
    init_range2 = tf.sqrt(6.0/(n_outputs1+n_inputs3))
    init_range3 = tf.sqrt(6.0/(n_inputs3+n_outputs3))
    init_range4 = tf.sqrt(6.0/(n_outputs3+n_outputs_final))
    w1 = tf.Variable(tf.random_uniform(weight_shape1,
                                       -init_range1,init_range1),name='w1')
    w2 = tf.Variable(tf.random_uniform(weight_shape2,
                                       -init_range2,init_range2),name='w2')
    w3 = tf.Variable(tf.random_uniform(weight_shape3,
                                       -init_range3,init_range3),name='w3')
    w4 = tf.Variable(tf.random_uniform(weight_shape4,
                                       -init_range4,init_range4),name='w4')
    # no need for biases after batch normalizing
    #b1 = tf.Variable(tf.constant(.1,shape=[n_outputs1]))
    #b2 = tf.Variable(tf.constant(.1,shape=[n_inputs3]))
    #b3 = tf.Variable(tf.constant(.1,shape=[n_outputs3]))
    # bias of the output layer only (that layer is not batch normalised)
    b = tf.Variable(tf.constant(.1,shape=[n_outputs_final]))
        
    # Network -- 3 batch normalized dropout layers
    # NOTE(review): batch_norm is called with its default arguments and the
    # same ops serve for training and prediction -- confirm the intended
    # training/inference behaviour against the TensorFlow version in use.
    batch_normed1 = tf.contrib.layers.batch_norm(tf.matmul(X,w1)) 
    rel1 = tf.nn.relu(batch_normed1)
    rel1_drop = tf.nn.dropout(rel1,keep_prob)
    
    batch_normed2 = tf.contrib.layers.batch_norm(tf.matmul(rel1_drop,w2)) 
    rel2 = tf.nn.relu(batch_normed2)
    rel2_drop = tf.nn.dropout(rel2,keep_prob)
    
    batch_normed3 = tf.contrib.layers.batch_norm(tf.matmul(rel2_drop,w3)) 
    rel3 = tf.nn.relu(batch_normed3)
    rel3_drop = tf.nn.dropout(rel3,keep_prob)
    
    # unnormalised class scores, one row of 2 values per input row
    logits = tf.matmul(rel3_drop,w4)+b
    
    # Predictions
    # class probabilities and the index of the larger one
    probs_x = tf.nn.softmax(logits)
    y_pred = tf.argmax(probs_x,dimension=1)
    
    # Cost
    # per pair
    # NOTE(review): logits and labels are passed positionally; the accepted
    # argument order depends on the TensorFlow version -- confirm.
    rows_of_cost = \
        tf.nn.softmax_cross_entropy_with_logits(logits,Y,name='rows_of_cost')
    # average over all pairs; loss
    cost = tf.reduce_mean(rows_of_cost,reduction_indices=None,
                          keep_dims=False,name='cost')

    # gradients and training
    # only the four weight matrices and the output bias are listed in
    # var_list; presumably any variables created inside batch_norm are
    # therefore not updated by train_op -- TODO confirm
    opt = tf.train.AdagradOptimizer(learning_rate=.02)
    train_op = opt.minimize(cost,global_step=glob_step,var_list=[w1,w2,w3,w4,b])
    
    # save model
    saver = tf.train.Saver()
    
# Restore the trained model from its checkpoint and print the predicted class
# index (0 or 1) for every row of x.
with tf.Session(graph=g) as sess:
    # NOTE(review): the values set by this initialisation are presumably
    # overwritten by the restore below -- confirm it is still needed.
    sess.run(tf.initialize_all_variables())
    
    saver.restore(sess, "/Users/Rutherford/Desktop/scribie data/model.ckpt")
    print("Model restored.")
    
    # keep_prob of 1 keeps every unit, i.e. dropout is off for prediction
    print(sess.run(y_pred,feed_dict={X:x,keep_prob:1.}))
    # The string literal below holds the disabled training loop (mini-batch
    # training, periodic test accuracy, saving the checkpoint); as a bare
    # string it is not executed.
    """mini_batch_size = 32
    start_end = zip(range(0,len(x_train),mini_batch_size), 
                   range(mini_batch_size,len(x_train)+1,mini_batch_size))
    cost_list = []
    num_passes = 401
    for pass_i in range(num_passes):
        for (s,e) in start_end:
            cost_val,_ = sess.run([cost,train_op], #need a backslash here?
                feed_dict={X: x_train[s:e,],Y: y_train[s:e],keep_prob:.8})
            cost_list.append(cost_val)
        if pass_i % 50 == 0: 
            test_result = sess.run([y_pred],feed_dict={X:x_test,keep_prob:1.})
            # OOS accuracy
            print(pass_i,np.mean(np.argmax(y_test,axis=1) == test_result[0]))
    save_path = saver.save(sess,'{}/model.ckpt'.format(os.path.dirname(data_path)))
    print("Model saved in file: {}".format(save_path))
"""

Model restored.
[]


In [56]:
def get_dist(s_1,s_2):
    """Return Word Mover's Distances between strings s_1 and s_2.

    The distance is computed once for each of four CountVectorizer
    configurations, in this order:
        1. default settings
        2. lowercase=False
        3. stop_words='english'
        4. stop_words='english', lowercase=False

    Words that are not in the module-level vocab_dict are dropped; the
    vectors of the remaining words are read from the module-level embeddings
    and the distance is calculated with emd (Earth Mover's Distance) from
    PyEMD.

    Returns:
        A list of four numbers, one per configuration in the order above.
        A configuration whose computation raises ValueError contributes 0,
        so a failure in one configuration does not discard the distances of
        the others.
    """
    # The options have to be passed as keyword arguments. A string such as
    # "lowercase=False" passed positionally is taken as CountVectorizer's
    # first parameter (input), so the option never takes effect and all four
    # results come out identical.
    cv_types = [{},
                {'lowercase': False},
                {'stop_words': 'english'},
                {'stop_words': 'english', 'lowercase': False}]

    results_ = []
    for cv_kwargs in cv_types:
        try:
            vect = CountVectorizer(**cv_kwargs).fit([s_1, s_2])
            # for getting rid of items not in Google vectors
            features = np.asarray(vect.get_feature_names())
            bad_indices = [idx for (idx,word) in enumerate(features)
                           if word not in vocab_dict]

            # get 'flow' vectors (word counts of each string)
            v_1, v_2 = vect.transform([s_1, s_2])
            v_1 = v_1.toarray().ravel().astype(np.float64)
            v_2 = v_2.toarray().ravel().astype(np.float64)

            # eliminate OOV items from vectors
            features = np.delete(features,bad_indices)
            v_1 = np.delete(v_1,bad_indices)
            v_2 = np.delete(v_2,bad_indices)

            # neither the count vectors nor the distance matrix are
            # normalised in this version

            # distance matrix for the words that are left in both strings
            W_ = embeddings[[vocab_dict[w] for w in features]]
            D_ = euclidean_distances(W_).astype(np.float64)

            # using emd (Earth Mover's Distance) from PyEMD
            results_.append(emd(v_1,v_2,D_))
        except ValueError:
            # this configuration could not be scored; 0 marks the failure
            # (callers test the result with >= 0)
            results_.append(0)

    return results_


In [50]:
# Check whether "and" has an entry in the vocabulary (it does not: the lookup
# raises KeyError).
vocab_dict["and"]

KeyError: 'and'

In [58]:
# A pair for which get_dist hits its ValueError fallback and returns zeros.
get_dist("i'm",'i')

[0, 0, 0, 0]

In [59]:
# Similar-looking word pairs used to spot-check get_dist.
dingy = np.asarray([
    ['tiger', 'liger'],
    ['man', 'ham'],
    ['chinaman', 'heineman'],
    ['beagle', 'eagle'],
])

In [61]:
# Print the four distances returned by get_dist for each pair in dingy.
for s1,s2 in dingy:
    print(get_dist(s1,s2))

[3.3650433100162997, 3.3650433100162997, 3.3650433100162997, 3.3650433100162997]
[3.5730327376204007, 3.5730327376204007, 3.5730327376204007, 3.5730327376204007]
[0.0, 0.0, 0.0, 0.0]
[4.521330026974679, 4.521330026974679, 4.521330026974679, 4.521330026974679]


In [114]:
# Worked example: two short phrases that share no words.
s_1, s_2 = 'charles in charge', 'of a life'
vect = CountVectorizer()
vect.fit([s_1,s_2])

In [115]:
# Vocabulary found by the vectorizer, and the positions of the words that
# have no entry in vocab_dict (out-of-vocabulary words).
features = np.asarray(vect.get_feature_names())
bad_indices = [idx for (idx,word) in \
               enumerate(features) if word not in vocab_dict]


In [116]:
# Positions of the out-of-vocabulary words in `features`.
bad_indices

[4]

In [117]:
# Word-count vector of each string over the shared vocabulary, flattened to
# 1-D float64 arrays.
v_1, v_2 = vect.transform([s_1, s_2])
v_1 = v_1.toarray().ravel().astype(np.float64)
v_2 = v_2.toarray().ravel().astype(np.float64)


In [120]:
# Number of word occurrences in s_2 that fall on out-of-vocabulary positions.
sum(v_2[bad_indices])

1.0

In [109]:
# Count vector of s_2 (the output shown has one entry per vocabulary word,
# i.e. before the out-of-vocabulary entry is removed).
v_2

array([ 0.,  0.,  0.,  1.,  1.])

In [110]:
# Count vector of s_1 (the output shown has one entry per vocabulary word,
# i.e. before the out-of-vocabulary entry is removed).
v_1

array([ 1.,  1.,  1.,  0.,  0.])

In [111]:
# Drop the out-of-vocabulary positions from the vocabulary and both count
# vectors.
# NOTE: the three names are overwritten, so running this cell a second time
# deletes the positions in bad_indices again from the already-shortened arrays.
features = np.delete(features,bad_indices)    
v_1 = np.delete(v_1,bad_indices)
v_2 = np.delete(v_2,bad_indices)


In [113]:
# Count vector of s_2 (the output shown is the shortened vector, after the
# out-of-vocabulary entry was removed).
v_2

array([ 0.,  0.,  0.,  1.])

In [112]:
# Count vector of s_1 (the output shown is the shortened vector, after the
# out-of-vocabulary entry was removed).
v_1

array([ 1.,  1.,  1.,  0.])

In [121]:
# Words that make predict() return 0 (minor) when one of them occurs in a
# first string of one to three words.
stop_words_set = set([
    'and',
    'so',
    'but',
    'like',
    'kinda',
    'laughter',
    'chuckle',
    'pause',
])

In [133]:
def predict(str_1,str_2):
    """
    Predicts the type of error between the two strings.

    Only the rule-based shortcuts are implemented; when none of them applies
    the fixed string 'model prediction' is returned (presumably a placeholder
    for the model's output -- TODO confirm).

    Returns:
        'No error' for identical strings,
        0 (minor) when str_1 has one to three words and either one of them
            is in stop_words_set, or its first word parses as an integer and
            is 5 or 6 characters long (treated as a timestamp),
        'model prediction' otherwise.
    """
    if str_1 == str_2:
        return 'No error'

    words = str_1.split()
    if 0 < len(words) < 4:
        if any(word in stop_words_set for word in words):
            return 0

        first_word = words[0]
        # The length is checked on the text rather than on the parsed number,
        # because leading zeros (e.g. "000123") are lost after conversion.
        if 4 < len(first_word) < 7:
            try:
                int(first_word)
            except ValueError:
                pass
            else:
                return 0

    return 'model prediction'

In [134]:
# Print the prediction for every pair in `testing`.
# NOTE: `testing` is assigned in a cell further down the notebook, so on a
# fresh kernel that cell has to be run before this one.
for pair in testing:
    print(predict(pair[0],pair[1]))

model prediction
model prediction
model prediction
0
model prediction
model prediction
model prediction
model prediction
model prediction
0
0
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
0
model prediction
model prediction
0
model prediction
model prediction
model prediction
0
model prediction
model prediction
model prediction
model prediction
0
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction
model prediction


In [123]:
# String pairs used to exercise predict(); each row is [str_1, str_2].
testing = np.asarray([['tutoring', 'torturing'],
       ["i don't", "i'm trying to"],
       ['s', 's3'],
       ['102302', '102315'],
       ['bike bicycle', 'bike in that bicycle'],
       ['____', 'planar'],
       ['we are', "that we're"],
       ['increased', 'increased 001514 s1'],
       ['data', ''],
       ['026142', '026152'],
       ['expertise like', 'expertise'],
       ['bits of', 'feeds twitter'],
       ['idea', 'that they did'],
       ['more', 'different'],
       ['indoctrined yeah', 'indoctrine yeah'],
       ['them 010021 s1 yeah 010034 s2', 'them'],
       ['were taking', 'had taken'],
       ['knew 000047 s ____ 000054 s', 'knew'],
       ['money because', "money 'cause"],
       ['front', 'front laughter 005572 s3'],
       ['connatate', 'connotate'],
       ['wba', 'wnba'],
       ["that's why", 'so'],
       ['flesh', 'fleshing'],
       ["s1 well you've", "s2 you've"],
       ['____', 'this would be'],
       ['004195 s1', ''],
       ['you', 'sure to'],
       ["we tell them don't run it", "something's gone wrong"],
       ['was like', 'was'],
       ['side is', "side it's"],
       ['came', "came that's great"],
       ['guess', 'get'],
       ['and so what', '09468 s what'],
       ['flirt', 'flitter'],
       ['pick', 'picked'],
       ['____', 'our incumbents'],
       ['tonight yeah', 'tonight'],
       ['like', 'it'],
       ['within', 'within the'],
       ['____', 'proprietary'],
       ['is photo creds so', 'is photo creds so that'],
       ['____', '____ mike ____'],
       ['sixty five hundred', '6500'],
       ['input', 'inputted'],
       ['integral differences which ____',
        'individual differences if healthy'],
       ['100000-dollar', '100000']])

## 10/11

In [None]:
from __future__ import print_function
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances###
from sklearn.metrics import confusion_matrix
from fuzzywuzzy import fuzz
import pandas as pd
from pyemd import emd
import tensorflow as tf


# Directory holding the word2vec files, and the three files used below: the
# original binary, the memory-mapped copy of the vectors and the word list.
data_path = '/Users/Rutherford/Desktop/data'
# Defined here so that this cell does not depend on an earlier cell having
# been run; same location as used elsewhere in the notebook.
binary_file = os.path.join(data_path,'GoogleNews-vectors-negative300.bin')
w2v_dat = os.path.join(data_path,'embed.dat')
w2v_vocab = os.path.join(data_path,'embed.vocab')

# Build the cache when either cached file is missing (the two files are
# written together and are only usable as a pair).
if not (os.path.exists(w2v_dat) and os.path.exists(w2v_vocab)):
    print("Caching word embeddings in memmapped format. "
          "Please be patient...")
    # imported here because it is only needed when the cache is built
    from gensim.models.word2vec import Word2Vec
    wv = Word2Vec.load_word2vec_format(
        binary_file,binary=True)
    # copy the vectors into a file-backed array
    fp = np.memmap(w2v_dat, dtype=np.double,
                   mode='w+', shape=wv.syn0.shape)
    fp[:] = wv.syn0[:]
    # write the words one per line, ordered by their index, so that a
    # word's line number matches its row in the array written above
    with open(w2v_vocab, "w") as f:
        for _, w in sorted((voc.index, word)
                           for word, voc in wv.vocab.items()):
            print(w, file=f)
    del fp, wv

# create word embeddings and mapping of vocabulary item to index
embeddings = np.memmap(w2v_dat, dtype=np.float64,
                            mode="r", shape=(3000000, 300))
with open(w2v_vocab) as f:
    # a list (not a one-shot iterator) so it can be reused after the
    # dictionary has been built
    vocab_list = [line.strip() for line in f]
vocab_dict = {w: i for i, w in enumerate(vocab_list)}


In [None]:
# Labels (column 0, as a column vector) and the two strings of every pair
# (columns 1 and 2) from the dataset file.
Y_in = np.genfromtxt('/Users/Rutherford/Desktop/data/dataset.csv',
                     usecols=(0),delimiter=',').reshape((-1,1))
X_in = np.genfromtxt('/Users/Rutherford/Desktop/data/dataset.csv',
                     dtype=str,usecols=(1,2),delimiter=',')
# Scores from the fuzzy file; its first line is a header and is skipped.
fuzzy_file = np.genfromtxt('/Users/Rutherford/Desktop/data/fuzzy.csv',
                           skip_header=1,dtype=float,delimiter=',')

# Length in characters of the first and second string of every pair.
str1_len = [len(pair[0]) for pair in X_in]
str2_len = [len(pair[1]) for pair in X_in]

# Number of whitespace-separated words in the first and second string.
str1_count = [len(pair[0].split()) for pair in X_in]
str2_count = [len(pair[1].split()) for pair in X_in]
