In [1]:
import os
from sklearn.neighbors import NearestNeighbors
from math import log
import pandas as pd
import numpy as np

In [2]:
# Use the kernel's current working directory as the base for every project path.
working_dir = os.getcwd()
print("Working directory: %s" % (working_dir,))

Working directory: C:\Study\CS102\project\project2\repro\CS109Project


# Calculate term vectors and document clusters

Generating clusters is done in two steps.

1. First, for each document, rank the set of terms for the document with G2 scores. 
2. Then, for each document, generate clusters of K most similar documents.

By default the term-ranking stage uses MetaMap CUIs as terms; a different column can be selected through the `column` parameter of the respective functions.

In [3]:
def g2(a, b, c, d):
    """Dunning's log-likelihood (G2) score for a 2x2 table of term counts.

    Each of the four counts is incremented by one before scoring, so a count
    of zero does not lead to ``log(0)``.

    :param a: frequency of the term in corpus 1
    :type a: double
    :param b: frequency of the term in corpus 2
    :type b: double
    :param c: frequency of all other terms in corpus 1 (size - a)
    :type c: double
    :param d: frequency of all other terms in corpus 2 (size - b)
    :type d: double
    :return: log-likelihood score
    :rtype: double
    """
    def xlogx(x):
        return x * log(x)

    # Add-one smoothing of the four cells.
    a, b, c, d = a + 1.0, b + 1.0, c + 1.0, d + 1.0

    # Cell terms, minus the four marginal terms, plus the grand-total term.
    return 2 * (
        xlogx(a) + xlogx(b) + xlogx(c) + xlogx(d)
        - xlogx(a + b) - xlogx(a + c) - xlogx(b + d) - xlogx(c + d)
        + xlogx(a + b + c + d)
    )

In [4]:
def term_frequencies(data, column='cui'):
    """Count how often each distinct value occurs in ``data[column]``.

    :param data: data frame containing the term column
    :param column: name of the column whose values are counted
    :return: the ``value_counts()`` of that column (a Series indexed by term)
    """
    terms = data[column]
    return terms.value_counts()

def aggregate_data(data, column='cui'):
    """Collect the values of ``column`` into one list per ``nct_id``.

    :param data: data frame with an ``nct_id`` column and the term column
    :param column: name of the column whose values are gathered
    :return: data frame indexed by ``nct_id`` whose single ``column`` cell
        holds the list of that group's values, in row order
    """
    def collect(values):
        return values.tolist()

    per_document = data.groupby(data['nct_id'])
    return per_document.agg({column: collect})

def rank_terms(freq_map, aggregated, column='cui', limit=100, score_fn=None):
    """Score every corpus term against each document.

    For one document and one term the scoring function receives
    ``a`` (count of the term in the document), ``b`` (count of the term in
    the corpus), ``c`` (document size - a) and ``d`` (corpus size - b).

    :param freq_map: corpus-wide term counts, a Series indexed by term
    :param aggregated: data frame with one row per document; the ``column``
        cell of a row holds the list of terms of that document
    :param column: name of the column in ``aggregated`` holding the term lists
    :param limit: maximum number of documents to score, taken from the top of
        ``aggregated``; ``None`` scores every document. The default of 100 is
        the cap that was previously hard-coded.
    :param score_fn: callable ``(a, b, c, d) -> score``; ``None`` selects
        :func:`g2`
    :return: 2-D float array with one row per scored document and one column
        per term, in the order of ``freq_map``
    """
    if score_fn is None:
        score_fn = g2

    # The corpus side of the table is identical for every document, so the
    # (term, count) pairs are materialised once. index/values is used instead
    # of iteritems(), which no longer exists in pandas 2.x.
    corpus_terms = list(zip(freq_map.index, freq_map.values))
    corpus_size = freq_map.sum()

    # .values replaces get_values(), which was removed in pandas 1.0.
    documents = aggregated[column].values[:limit]

    scores = []  # one list of per-term scores per document
    for doc_terms in documents:
        uniq, counts = np.unique(doc_terms, return_counts=True)
        doc_freqs = dict(zip(uniq, counts))
        doc_size = int(counts.sum())
        doc_vector = []
        for term, corpus_freq in corpus_terms:
            doc_freq = int(doc_freqs.get(term, 0))
            doc_vector.append(
                score_fn(doc_freq, corpus_freq,
                         doc_size - doc_freq, corpus_size - corpus_freq))
        scores.append(doc_vector)

    # Explicit reshape keeps the result 2-D even when no document was scored.
    return np.array(scores, dtype=float).reshape(len(scores), len(corpus_terms))

def generate_clusters(term_vectors, k=5):
    """Find the ``k`` nearest neighbours of every row of ``term_vectors``.

    The neighbour index is built from ``term_vectors`` and then queried with
    the same rows, so a row's closest match is normally the row itself at
    distance 0.

    :param term_vectors: 2-D array, one row per document
    :param k: number of neighbours returned per document
    :return: ``(distances, indices)`` as returned by
        ``NearestNeighbors.kneighbors``
    """
    model = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
    model.fit(term_vectors)
    return model.kneighbors(term_vectors)

In [5]:
# Load the MetaMap-tagged table from <working_dir>/data/mm.pckl.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
data_dir = os.path.join(working_dir, 'data')
df = pd.read_pickle(os.path.join(data_dir, 'mm.pckl'))

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,nct_id,criteria_id,ngram_index,score,term,cui,stype,cid
0,NCT00001149,0,0,11.49,Historical aspects qualifier,C0019665,inpr,[x.x.x]
1,NCT00001149,0,0,8.34,History,C0019664,ocdi,[K01.400]
2,NCT00001149,0,1,17.8,Seizures,C0036572,sosy,"[C10.228.140.490.631, C10.597.742, C23.888.592..."
3,NCT00001149,0,3,8.34,Time,C0040223,tmco,[G01.910]
4,NCT00001149,0,7,17.8,Seizures,C0036572,sosy,"[C10.228.140.490.631, C10.597.742, C23.888.592..."


*term_frequencies(data_frame)* returns the counts for each term (CUI) across the whole dataset.

In [6]:
# Corpus-wide occurrence count per term; the column defaults to 'cui'.
freq_map = term_frequencies(df)
freq_map.head()

C0030705    11315
C0036572    10535
C0019664     7651
C0019665     7651
C0012634     7284
Name: cui, dtype: int64

*aggregate_data(data_frame)* groups the MetaMap-tagged data (one row per MetaMap tag) by the NCT ID of the original document, collecting each document's terms into a list.

In [7]:
# One row per nct_id; the 'cui' cell holds that document's list of CUIs.
agg = aggregate_data(df)
agg.head()

Unnamed: 0_level_0,cui
nct_id,Unnamed: 1_level_1
NCT00001149,"[C0019665, C0019664, C0036572, C0040223, C0036..."
NCT00001192,"[C0030705, C0042960, C3661466, C0019665, C0019..."
NCT00001205,"[C0030705, C0039798, C0008059, C0042153, C0021..."
NCT00001218,"[C0031206, C0018684, C0031206, C0018684, C1708..."
NCT00001262,"[C0021289, C0021270, C0012634, C0021289, C0021..."


*rank_terms(frequency_map, aggregated_terms)* calculates a score for every term in every document. It returns a 2-D array in which each row corresponds to a document in *aggregated* and each column corresponds to a term in *frequency_map*. Note that only the first 100 documents of *aggregated* are scored by the call below.

In [8]:
term_vectors = rank_terms(freq_map, agg)
# Call form of print with a single argument behaves the same on Python 2 and 3.
print(term_vectors[:10])

[[  1.45022006e+01   3.96392436e+01   1.28777739e+01 ...,   1.21767907e+01
    1.21767907e+01   1.21767907e+01]
 [  8.58362224e-02   9.26124912e-01   1.01247500e+01 ...,   1.30902074e+01
    1.30902074e+01   1.30902074e+01]
 [  7.16466057e+00   1.45620625e+01   9.29422268e+00 ...,   1.00233137e+01
    1.00233137e+01   1.00233137e+01]
 ..., 
 [  5.28966357e-01   4.05534714e-01   6.81057321e-02 ...,   1.35979165e+01
    1.35979165e+01   1.35979165e+01]
 [  1.10678277e+00   1.18678705e+01   7.43337903e+00 ...,   1.03260907e+01
    1.03260907e+01   1.03260907e+01]
 [  1.35655608e-03   2.92904885e+00   1.16642162e-01 ...,   1.48219351e+01
    1.48219351e+01   1.48219351e+01]]


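*generate_clusters(term_vectors, k_neighbours)* finds the *k* nearest neighbours of each document. It returns (1) an array of distances between each document and its neighbours (the smaller, the more similar) and (2) an array with the indices of those neighbours, given as row positions in *term_vectors*. Each document is returned as its own first neighbour, which is why the first column of the distances below is 0.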

In [9]:
distances, indices = generate_clusters(term_vectors)
# One print call per array: print(a, b) would print a tuple on Python 2,
# whereas single-argument calls give the same output on Python 2 and 3.
print(distances)
print(indices)

[[   0.          123.43665517  126.98453058  128.74422355  131.58699936]
 [   0.          159.98126099  164.24578892  166.94152577  167.03454542]
 [   0.          369.72215681  377.16360152  378.94414316  381.79516892]
 [   0.          132.8194023   134.40514032  136.87465117  141.28660178]
 [   0.          116.18613413  120.52871196  123.80496218  129.39347071]
 [   0.          193.48010625  195.1504781   198.7936668   200.29022659]
 [   0.           96.60769074  103.24285222  108.95863413  115.93608256]
 [   0.          133.50801158  134.88174669  136.57904711  138.33775187]
 [   0.          256.11704843  261.5567016   267.33249578  270.06915452]
 [   0.          104.16659682  107.87277958  108.7198662   113.19401301]
 [   0.          153.74047497  201.89755648  219.8198197   220.17700123]
 [   0.          198.44221673  210.08180529  215.26854593  217.05989916]
 [   0.          139.17011042  139.35144945  143.72153302  146.5049335 ]
 [   0.          133.44372917  151.85073243  153.76