### tensorflow

In [4]:
import tensorflow as tf
def sim_exact_cross(embs1, embs2):
    """Score every row of ``embs1`` against every row of ``embs2``.

    Both inputs are L2-normalised along axis 1 and then contracted over that
    axis, so for 2-D inputs entry ``[i, j]`` of the similarity output is the
    cosine similarity between ``embs1[i]`` and ``embs2[j]``.

    Written against the TensorFlow 1.x graph/session API
    (``tf.placeholder`` / ``tf.Session``).

    Args:
        embs1: Array-like batch of embeddings, fed as float64. Needs at least
            two dimensions because normalisation runs along axis 1.
        embs2: Second batch of embeddings, same size along axis 1 as ``embs1``.

    Returns:
        Tuple ``(angdist, cossim)`` as evaluated by ``session.run``:
        ``cossim`` is the (unclipped) pairwise cosine similarity and
        ``angdist`` is ``1 - arccos(clip(cossim, -1, 1))``.
    """
    # Build in a private graph so repeated calls do not keep adding nodes to
    # the process-wide default graph. The graph holds no variables or tables,
    # so no initialisers need to be run before evaluating it.
    with tf.Graph().as_default():
        input1 = tf.placeholder(tf.float64, shape=None)
        input2 = tf.placeholder(tf.float64, shape=None)

        norm1 = tf.nn.l2_normalize(input1, axis=1)
        norm2 = tf.nn.l2_normalize(input2, axis=1)
        # Contract axis 1 of both operands -> all-pairs dot products.
        cosine_similarities = tf.tensordot(norm1, norm2, axes=[[1], [1]])
        # Clip before acos: rounding can push values just outside [-1, 1],
        # where acos would return NaN.
        clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
        # NOTE(review): acos is in radians, so this score spans [1 - pi, 1].
        # Angular similarity is usually defined as 1 - acos(x) / pi; confirm
        # whether the missing normalisation is intended before changing it.
        angular_dist = 1.0 - tf.acos(clip_cosine_similarities)

        with tf.Session() as session:
            angdist, cossim = session.run(
                [angular_dist, cosine_similarities],
                feed_dict={input1: embs1, input2: embs2},
            )

    return angdist, cossim

In [1]:
import tensorflow as tf
def sim_exact(embs1, embs2):
    """Score each row of ``embs1`` against the matching row of ``embs2``.

    Both inputs are L2-normalised along axis 1, multiplied element-wise and
    summed over axis 1 with ``keepdims=True``; for two 2-D inputs of equal
    shape this yields one cosine similarity per row, as a column.

    Written against the TensorFlow 1.x graph/session API
    (``tf.placeholder`` / ``tf.Session``).

    Args:
        embs1: Array-like batch of embeddings, fed as float64. Needs at least
            two dimensions because normalisation runs along axis 1.
        embs2: Second batch, multiplied element-wise with ``embs1`` after
            normalisation (so the shapes must be compatible).

    Returns:
        Tuple ``(angdist, cossim)`` as evaluated by ``session.run``:
        ``cossim`` is the (unclipped) row-wise cosine similarity and
        ``angdist`` is ``1 - arccos(clip(cossim, -1, 1))``.
    """
    # Build in a private graph so repeated calls do not keep adding nodes to
    # the process-wide default graph. The graph holds no variables or tables,
    # so no initialisers need to be run before evaluating it.
    with tf.Graph().as_default():
        input1 = tf.placeholder(tf.float64, shape=None)
        input2 = tf.placeholder(tf.float64, shape=None)

        norm1 = tf.nn.l2_normalize(input1, axis=1)
        norm2 = tf.nn.l2_normalize(input2, axis=1)
        # Row-wise dot product of the unit vectors.
        cosine_similarities = tf.reduce_sum(tf.multiply(norm1, norm2), 1, keepdims=True)
        # Clip before acos: rounding can push values just outside [-1, 1],
        # where acos would return NaN.
        clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
        # NOTE(review): acos is in radians, so this score spans [1 - pi, 1].
        # Angular similarity is usually defined as 1 - acos(x) / pi; confirm
        # whether the missing normalisation is intended before changing it.
        angular_dist = 1.0 - tf.acos(clip_cosine_similarities)

        with tf.Session() as session:
            angdist, cossim = session.run(
                [angular_dist, cosine_similarities],
                feed_dict={input1: embs1, input2: embs2},
            )

    return angdist, cossim

### plotting

In [5]:
import seaborn as sns
def plot_similarity(labels, corr):
    """Render a similarity matrix as a labelled seaborn heatmap.

    Args:
        labels: Tick labels applied to both the x and y axes.
        corr: Matrix of similarity values handed to ``sns.heatmap``; the
            colour scale is pinned to the range 0..1.
    """
    sns.set(font_scale=1.2)

    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    axes = sns.heatmap(corr, **heatmap_options)

    # Re-apply the x labels rotated so long labels stay readable.
    axes.set_xticklabels(labels, rotation=90)
    axes.set_title("Semantic Textual Similarity")

In [7]:
from sklearn.metrics import precision_recall_curve,average_precision_score
import matplotlib.pyplot as plt
from sklearn.utils.fixes import signature

In [5]:
def plot_precision_recall_curve(y_score, y_true, display_plot=True):
    """Compute average precision and optionally draw the precision-recall curve.

    Note the argument order: scores first, labels second (the reverse of the
    scikit-learn functions this wraps).

    Args:
        y_score: Target scores - probability estimates of the positive class,
            confidence values, or non-thresholded decision values.
        y_true: True binary labels (``{0, 1}`` or ``{-1, 1}``).
        display_plot: When true, draw the step curve with a shaded area on the
            current matplotlib figure. ``plt.show()`` is not called, so the
            caller can keep decorating the figure.

    Returns:
        The value of ``average_precision_score(y_true, y_score)``.
    """
    from sklearn.metrics import average_precision_score, precision_recall_curve

    average_precision = average_precision_score(y_true, y_score)

    if display_plot:
        # Standard-library signature(): scikit-learn no longer ships the
        # `sklearn.utils.fixes.signature` shim.
        from inspect import signature

        # Imported here so the score-only path does not need a plotting backend.
        import matplotlib.pyplot as plt

        # The curve itself is only needed for drawing.
        precision, recall, _ = precision_recall_curve(y_true, y_score)

        # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
        step_kwargs = ({'step': 'post'}
                       if 'step' in signature(plt.fill_between).parameters
                       else {})
        plt.step(recall, precision, color='b', alpha=0.2,
                 where='post')
        plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

    return average_precision

In [6]:
def plot_auc_curve(y_score, y_true, display_plot=True):
    """Compute the area under the ROC curve and optionally plot the curve.

    Note the argument order: scores first, labels second.

    Args:
        y_score: Target scores - probability estimates of the positive class,
            confidence values, or non-thresholded decision values.
        y_true: True binary labels. If they are not ``{-1, 1}`` or ``{0, 1}``
            scikit-learn needs an explicit ``pos_label``, which this wrapper
            does not pass.
        display_plot: When true, draw the ROC curve with a diagonal "Luck"
            reference line and show the figure.

    Returns:
        The ROC AUC computed from the curve's false/true positive rates.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc

    # The thresholds returned alongside the rates are not needed here.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    if not display_plot:
        return roc_auc

    plt.plot(false_pos_rate, true_pos_rate, lw=1, label='ROC (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    # Small margin so the curve is not drawn on top of the frame.
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

    return roc_auc

### pandas

In [2]:
def apply_by_chunks(df_big, chunk_size, chunk_delegate):
    """Apply ``chunk_delegate`` to consecutive row slices and concatenate the results.

    Expects pandas to be available as ``pd`` in the notebook namespace.

    Args:
        df_big: Frame to process; it is cut into ``df_big[i:i + chunk_size]``
            slices in row order.
        chunk_size: Number of rows per slice; must be at least 1. The last
            slice may be shorter.
        chunk_delegate: Callable that takes one slice and returns the
            processed frame for it.

    Returns:
        ``pd.concat`` of the delegate's outputs, in slice order. For an input
        with no rows the delegate is applied once to the empty input, so the
        result still carries whatever columns the delegate produces.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1, got %r" % (chunk_size,))

    n_rows = df_big.shape[0]
    if n_rows == 0:
        # No slices would be produced and pd.concat rejects an empty list.
        return chunk_delegate(df_big)

    df_results = [
        chunk_delegate(df_big[start:start + chunk_size])
        for start in range(0, n_rows, chunk_size)
    ]
    return pd.concat(df_results)

In [None]:
import pandas as pd
def df_crossjoin(df1, df2, **kwargs):
    """Return the Cartesian product of the rows of two DataFrames.

    The join is done by merging on a constant helper column. The helper is
    added to copies of the inputs, so ``df1`` and ``df2`` are never modified
    (which also makes ``df_crossjoin(df, df)`` safe).

    Args:
        df1: Left frame.
        df2: Right frame.
        **kwargs: Extra keyword arguments forwarded to ``pd.merge``
            (for example ``suffixes``).

    Returns:
        A new DataFrame with one row per (df1 row, df2 row) pair, indexed by
        the ``MultiIndex`` product of ``df1.index`` and ``df2.index``. The
        helper column ``_tmpkey`` is not part of the result.
    """
    # assign() returns new frames, leaving the caller's objects untouched
    # even if the merge below raises.
    left = df1.assign(_tmpkey=1)
    right = df2.assign(_tmpkey=1)

    res = pd.merge(left, right, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))

    return res

### mongodb

In [2]:
def iterate_by_chunks(collection, chunksize=1, start_from=0, query={}, projection={}):
    """Yield the documents matching ``query`` as consecutive windows of results.

    The matching documents are counted once, then for every window
    ``[start, stop)`` the query is issued again and the result is sliced with
    ``[start:stop]`` (for a PyMongo cursor, slicing restricts it to that
    window). The final window is truncated at the total count.

    Args:
        collection: Collection-like object providing ``find`` and either
            ``count_documents`` or a cursor with ``count()``.
        chunksize: Number of documents per window; converted with ``int()``
            and required to be at least 1.
        start_from: Offset of the first document to return.
        query: Filter passed to ``find`` and to the count.
        projection: Projection passed to ``find``.
            NOTE(review): PyMongo appears to treat an empty projection as
            "``_id`` only" - confirm that the ``{}`` default is intended.

    Yields:
        ``collection.find(query, projection=projection)[start:stop]`` for each
        window. Nothing is yielded when ``start_from`` is at or past the
        number of matching documents.

    Raises:
        ValueError: If ``chunksize`` is smaller than 1.
    """
    step = int(chunksize)
    if step < 1:
        raise ValueError("chunksize must be at least 1, got %r" % (chunksize,))

    # Cursor.count() no longer exists in current PyMongo; count_documents is
    # its replacement. The method is looked up on the class because PyMongo
    # collections resolve unknown attribute names to sub-collections, which
    # would make a plain hasattr() check succeed on old driver versions too.
    if getattr(type(collection), "count_documents", None) is not None:
        total = collection.count_documents(query)
    else:
        total = collection.find(query).count()

    # NOTE(review): no sort is applied, so windows rely on the server
    # returning documents in a stable order between queries - confirm.
    for start in range(start_from, total, step):
        stop = min(start + step, total)
        yield collection.find(query, projection=projection)[start:stop]