#### Learning Structure
* k: # of groups, or # of containers
* one buffer
* a member in buffer will be examined against all of newly learned distrance, all quantified distance metrics will be ranked by p-value in descending order. The top distance metrics is the one chosen for the user.
* member of a container will be examined for the distance metrics learned for the population. If the member is not cosider enjoy a good fit of the distance metrics, the member will be checked with other alternative distance metris. Until at least one of qualified distance metrics found, the member is sent to the buffer.
* overall fit is measured by the following metrics
$$
f = \sum_{j = 1}^{k} (\sum_{i \in U_{j}} \text{P-value}(u_i,D_j)_{i} / |U_{j}|) + C \cdot |\text{buffer}|
$$
* $u_i$: $i$th user
* $D_j$: the distance metrics learned for $j$th user group
* $U_j$: $j$th user group
* $|\cdot|$: size of users in the container
* $C$: the parameter determines the strength of penalty for more members in the buffer

In [12]:
import numpy as np
import scipy as sp
import pandas as pd

In [35]:
""" wrap up the learning iteration

functions:
----------
a. assigning member to fitted group
b. test members against all distance metrics rather than its previous group's
   [(group_index, pval)]
c. sort all 


Parameters:
-----------
profile_df: {pandas.DataFrame}
networkx: {networkx.Graph}
k: {integer}, the target number of groups to learn
min_delta_f: {float}, the minimal decrease in f score to continue learning

Returns:
--------
"""
def init_embed_list(n):
    """
    """
    ls = []
    for i in range(n):
        ls.append([])
    return ls

def init_dict_list(k):
    """ create dictionary with k items, each
        item is a empty list
    """
    res_dict = {}
    for i in range(k):
        res_dict[i] = []
    return res_dict

k = 2
min_delta_f = 0.001

dist_metrics = init_dict_list(k)
fit_group = init_dict_list(k)
unfit_group = init_dict_list(k)
buffer_group = []
fit_pvals = init_dict_list(k)
unfit_pvals = init_dict_list(k)

# *. distance has been learned
# *. group composite is inherited from the previous iteration

dist_metrics = {0: [0.1, 0.3, 0.6, 0], 1: [0.5, 0.1, 0.4, 0], 2: [0.25, 0.25, 0.25, 0.25]}
fit_group = {0:[1, 2, 4, 5], 1:[3, 6, 7], 2:[8, 9]}
fit_pvals = {0:[0.2, 0.12, 0.04, 0.21], 1: [0.31, 0.22, 0.17], 2: [0.02, 0.05]}
buffer_group = [10, 11]

In [22]:
# examine user aginst the group distance metrics
# retain members bearing fit
# reassign users to unfit_group
for i, g in enumerate(fit_group):
    for j, u in enumerate(g):
        # print "(%d, %d)" % (i, u)
        pval = np.random.uniform(0, 1, 1)[0]
        print pval
        if pval > 0.5:
            fit_group[i].remove(u)
            unfit_group[i].append(u)

0.812647087371
0.420528851955
0.16035846456
0.362154524405
0.750724034385


In [27]:
# fit_group
# unfit_group
for i, g in enumerate(fit_group):
    comp_dist = dist_metrics[:i] + dist_metrics[(i + 1):]
    for j, u in enumerate(g):
        print comp_dist

[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.1, 0.3, 0.6, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.1, 0.3, 0.6, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.1, 0.3, 0.6, 0], [0.1, 0.1, 0.1, 0.7]]


In [76]:
def find_fit_group(uid, dist_metrics, threshold,  current_group = []):
    """ calculate user p-value for the distance metrics of
        each group
        
    Parameters:
    ----------
    uid: {integer}, user id
    current_group: {integer}, group index
    dist_metrics: {dictionary}, all {index: distance_metrics}
    threshold: {float}, threshold for qualifying pvalue of ks-test
    
    Resutls:
    --------
    res: {list}, [group_idx, pvalue]
    """
    if current_group == []:
        other_group = dist_metrics.keys()
        other_dist_metrics = dist_metrics.values()
    else:
        other_group = [i for i in dist_metrics.keys() if i != current_group]
        other_dist_metrics = [d for g, d in dist_metrics.iteritems() if g != current_group]
    
    pvals = []

    for d in other_dist_metrics:
        # loop through all distance metrics and calculate
        # p-value of ks-test by applying it to the user
        # relationships
        pval = np.random.uniform(0, 1, 1)
        pvals.append(pval)
        
    min_pval = min(pvals)[0]
    min_index = [i for i, p in enumerate(pvals) if p == min_pval][0]
    best_group = other_group[min_index]
    
    if min_pval >= threshold:
        # if min_pval >= threshold, user is not considered 
        # to have a good fit by any of distance metrics
        best_group = np.nan
        min_pval = np.nan
    
    return (best_group, min_pval)

def get_fit_score(fit_pvals, buffer_group, c):
    """ calculate the fit score given the member composite
        and its pvalues with its group distance metrics, with
        c determinng the strength of penalty for keeping a 
        larger number of users in buffer_group
        
    Parameters:
    -----------
    fit_pvals: {dict}, {index: [pvalues]}
    buffer_group: {list}, [userid, ...]
    c: {float}, 
    
    Returns:
    --------
    fit_score: {float}, fit score, a smaller value indidcate
                a overall better fit
    
    Examples:
    ---------
    fit_group = fit_group
    fit_pvals = fit_pvals
    buffer_group = buffer_group
    c = 0.1
    fscore = get_fit_score(fit_group, fit_pvals, buffer_group, c)
    """
    
    # weighted sum of pvalues 
    wsum_pval = 0
    for g, v in fit_pvals.iteritems():
        wsum_pval += sum(np.array(v) * 1.0 / len(v))

    penalty = c * len(buffer_group)
    fit_score = wsum_pval + penalty # smaller value indicates a better overall fit
    
    return fit_score

In [105]:
dist_metrics = {0: [0.1, 0.3, 0.6, 0], 1: [0.5, 0.1, 0.4, 0], 2: [0.25, 0.25, 0.25, 0.25]}
fit_group = {0:[1, 2, 4, 5], 1:[3, 6, 7], 2:[8, 9]}
unfit_group = {}
fit_pvals = {0:[0.2, 0.12, 0.04, 0.21], 1: [0.31, 0.22, 0.17], 2: [0.02, 0.05]}
buffer_group = []

threshold = 0.5
c = 0.1

In [106]:
# step 01: learn distance metrics
for g, uids in fit_group.iteritems():
    # function learn
    dist = [np.random.uniform(0, 1, 1)[0] for i in range(4)]
    dist_metrics[g] = dist

In [107]:
#unfit_group[g]
print "dist_metrics:",dist_metrics
print "fit_group:", fit_group 
print "fit_pvals:", fit_pvals
print "unfit_group:", unfit_group
print "buffer_group:", buffer_group

dist_metrics: {0: [0.12241912473716166, 0.40649748413799935, 0.75322811863122952, 0.52007726730978332], 1: [0.35019398025313719, 0.25008452621732069, 0.41236828452548335, 0.78927039183237524], 2: [0.26786174679221131, 0.58029325272907573, 0.70044393219142564, 0.37169895108888917]}
fit_group: {0: [1, 2, 4, 5], 1: [3, 6, 7], 2: [8, 9]}
fit_pvals: {0: [0.2, 0.12, 0.04, 0.21], 1: [0.31, 0.22, 0.17], 2: [0.02, 0.05]}
unfit_group: {}
buffer_group: []


In [108]:
# step 02: update the member composite with updated group distance metrics
# threshold is needed to be defined
fit_group_copy = fit_group.copy()
for g, uids in fit_group_copy.iteritems():
    target_dist = dist_metrics[g]
    for uid in uids:
        # calcualte the ks-pvalue with update distance metrics
        # target_dist
        pval = np.random.uniform(0, 1, 1)[0]
        if pval >= threshold:
            # remove the user and its information 
            # from relevant container
            idx = [i for i, u in enumerate(fit_group[g]) if u == uid][0]
            fit_group[g].pop(idx)
            fit_pvals[g].pop(idx)
            # add the user to the unfit_group
            if g in unfit_group:
                unfit_group[g].append(uid)
            else:
                unfit_group[g] = [uid]

In [109]:
#unfit_group[g]
print "dist_metrics:",dist_metrics
print "fit_group:", fit_group 
print "fit_pvals:", fit_pvals
print "unfit_group:", unfit_group
print "buffer_group:", buffer_group

dist_metrics: {0: [0.12241912473716166, 0.40649748413799935, 0.75322811863122952, 0.52007726730978332], 1: [0.35019398025313719, 0.25008452621732069, 0.41236828452548335, 0.78927039183237524], 2: [0.26786174679221131, 0.58029325272907573, 0.70044393219142564, 0.37169895108888917]}
fit_group: {0: [2, 4], 1: [6], 2: [8, 9]}
fit_pvals: {0: [0.12, 0.04], 1: [0.22], 2: [0.02, 0.05]}
unfit_group: {0: [1, 5], 1: [3, 7]}
buffer_group: []


In [110]:
# step 03: test members in unfit_group to see
# if it has a good fit with other distmetrics
# make a copy of the buffer group container
buffer_group_copy = [i for i in buffer_group]
if len(buffer_group_copy) > 0:
    for uid in buffer_group_copy:
        new_group, new_pval = find_fit_group(uid, dist_metrics, threshold)
        if not np.isnan(new_pval):
            buffer_group.remove(uid)
            if new_group in fit_group:
                fit_group[new_group].append(uid)
                fit_pvals[new_group].append(new_pval)
            else:
                fit_group[new_group] = [uid]
                fit_pvals[new_group] = [new_pval]
                

unfit_group_copy = unfit_group.copy()

for g, uids in unfit_group_copy.iteritems():
    for uid in uids:        
        new_group, new_pval = find_fit_group(uid, dist_metrics, threshold, g)
        if np.isnan(new_pval):
            buffer_group.append(uid)
        else:
            unfit_group[g].remove(uid)
            if new_group in fit_group:
                fit_group[new_group].append(uid)
                fit_pvals[new_group].append(new_pval)
            else:
                fit_group[new_group] = [uid]
                fit_pvals[new_group] = [new_pval]



In [112]:
#unfit_group[g]
print "dist_metrics:",dist_metrics
print "fit_group:", fit_group 
print "fit_pvals:", fit_pvals
print "unfit_group:", unfit_group
print "buffer_group:", buffer_group

dist_metrics: {0: [0.12241912473716166, 0.40649748413799935, 0.75322811863122952, 0.52007726730978332], 1: [0.35019398025313719, 0.25008452621732069, 0.41236828452548335, 0.78927039183237524], 2: [0.26786174679221131, 0.58029325272907573, 0.70044393219142564, 0.37169895108888917]}
fit_group: {0: [2, 4], 1: [6], 2: [8, 9, 1, 7]}
fit_pvals: {0: [0.12, 0.04], 1: [0.22], 2: [0.02, 0.05, 0.12602826225658526, 0.075053088613232899]}
unfit_group: {0: [5], 1: [3]}
buffer_group: [3]


In [113]:
# step 04: calculate current fscore
fs = get_fit_score(fit_pvals, buffer_group, c)

In [114]:
print fs

0.467770337717
