#### Learning Structure
* k: # of groups, or # of containers
* one buffer
* a member in buffer will be examined against all of newly learned distrance, all quantified distance metrics will be ranked by p-value in descending order. The top distance metrics is the one chosen for the user.
* member of a container will be examined for the distance metrics learned for the population. If the member is not cosider enjoy a good fit of the distance metrics, the member will be checked with other alternative distance metris. Until at least one of qualified distance metrics found, the member is sent to the buffer.
* overall fit is measured by the following metrics
$$
fs_1 = \sum_{j = 1}^{k} (\sum_{i \in U_{j}} \text{P-value}(u_i,D_j)_{i} / |U_{j}|) + C \cdot |\text{buffer}|
$$

$$
fs_2 = \sum_{j = 1}^{k} (\sum_{i \in U_{j}} \text{P-value}(u_i,D_j)_{i} / |U_{j}|^2) + C \cdot |\text{buffer}|
$$

$$
fs_3 = \sum_{j = 1}^{k} (\sum_{i \in U_{j}} \text{P-value}(u_i,D_j)_{i} \cdot \frac{N}{|U_{j}|^2}) + C \cdot |\text{buffer}|
$$

* $u_i$: $i$th user
* $D_j$: the distance metrics learned for $j$th user group
* $U_j$: $j$th user group
* $|\cdot|$: size of users in the container
* $C$: the parameter determines the strength of penalty for more members in the buffer
* $N$: the total number of users in fit group, $\sum_{j = 1}^{k} |U_{j}|$

In [32]:
import numpy as np
import scipy as sp
import pandas as pd

from GWDLearner import *

In [33]:
""" wrap up the learning iteration

functions:
----------
a. assigning member to fitted group
b. test members against all distance metrics rather than its previous group's
   [(group_index, pval)]
c. sort all 


Parameters:
-----------
profile_df: {pandas.DataFrame}
networkx: {networkx.Graph}
k: {integer}, the target number of groups to learn
min_delta_f: {float}, the minimal decrease in f score to continue learning

Returns:
--------
"""
def init_embed_list(n):
    """
    """
    ls = []
    for i in range(n):
        ls.append([])
    return ls

def init_dict_list(k):
    """ create dictionary with k items, each
        item is a empty list
    """
    res_dict = {}
    for i in range(k):
        res_dict[i] = []
    return res_dict

k = 2
min_delta_f = 0.001

dist_metrics = init_dict_list(k)
fit_group = init_dict_list(k)
unfit_group = init_dict_list(k)
buffer_group = []
fit_pvals = init_dict_list(k)
unfit_pvals = init_dict_list(k)

# results value
fs_hist = []
knowledge_pkg = []

# *. distance has been learned
# *. group composite is inherited from the previous iteration

dist_metrics = {0: [0.1, 0.3, 0.6, 0], 1: [0.5, 0.1, 0.4, 0], 2: [0.25, 0.25, 0.25, 0.25]}
fit_group = {0:[1, 2, 4, 5], 1:[3, 6, 7], 2:[8, 9]}
fit_pvals = {0:[0.2, 0.12, 0.04, 0.21], 1: [0.31, 0.22, 0.17], 2: [0.02, 0.05]}
buffer_group = [10, 11]

In [34]:
# examine user aginst the group distance metrics
# retain members bearing fit
# reassign users to unfit_group
for i, g in fit_group.iteritems():
    for j, u in enumerate(g):
        # print "(%d, %d)" % (i, u)
        pval = np.random.uniform(0, 1, 1)[0]
        print pval
        if pval > 0.5:
            fit_group[i].remove(u)
            unfit_group[i].append(u)

0.192895674001
0.891109151516
0.538987111551
0.719288232637
0.244418785844
0.666233350896


KeyError: 2

In [9]:
for i, g in fit_group.iteritems():
    print g

[1, 2, 4, 5]
[3, 6, 7]
[8, 9]


In [27]:
# fit_group
# unfit_group
for i, g in enumerate(fit_group):
    comp_dist = dist_metrics[:i] + dist_metrics[(i + 1):]
    for j, u in enumerate(g):
        print comp_dist

[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.5, 0.1, 0.4, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.1, 0.3, 0.6, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.1, 0.3, 0.6, 0], [0.1, 0.1, 0.1, 0.7]]
[[0.1, 0.3, 0.6, 0], [0.1, 0.1, 0.1, 0.7]]


In [44]:
def find_fit_group(uid, dist_metrics, threshold,  current_group = []):
    """ calculate user p-value for the distance metrics of
        each group
        
    Parameters:
    ----------
    uid: {integer}, user id
    current_group: {integer}, group index
    dist_metrics: {dictionary}, all {index: distance_metrics}
    threshold: {float}, threshold for qualifying pvalue of ks-test
    
    Resutls:
    --------
    res: {list}, [group_idx, pvalue]
    """
    if current_group == []:
        other_group = dist_metrics.keys()
        other_dist_metrics = dist_metrics.values()
    else:
        other_group = [i for i in dist_metrics.keys() if i != current_group]
        other_dist_metrics = [d for g, d in dist_metrics.iteritems() if g != current_group]
    
    pvals = []

    for d in other_dist_metrics:
        # loop through all distance metrics and calculate
        # p-value of ks-test by applying it to the user
        # relationships
        # sdist, ddist = user_grouped_dist(user_id = uid, weights=dist_metrics, *profile_df{DataFramw, ID}*,
        #                   *friends_networkx*)
        # pval = user_dist_kstest(sim_dist_vec=sdist, diff_dist_vec=ddist, fit_rayleigh=True, _n=1000)
        pval = np.random.uniform(0, 1, 1)
        pvals.append(pval)
        
    min_pval = min(pvals)[0]
    min_index = [i for i, p in enumerate(pvals) if p == min_pval][0]
    best_group = other_group[min_index]
    
    if min_pval >= threshold:
        # if min_pval >= threshold, user is not considered 
        # to have a good fit by any of distance metrics
        best_group = np.nan
        min_pval = np.nan
    
    return (best_group, min_pval)

def get_fit_score(fit_pvals, buffer_group, c, t=2):
    """ calculate the fit score given the member composite
        and its pvalues with its group distance metrics, with
        c determinng the strength of penalty for keeping a 
        larger number of users in buffer_group
        
    Parameters:
    -----------
    fit_pvals: {dict}, {index: [pvalues]}
    buffer_group: {list}, [userid, ...]
    c: {float}, 
    t: {integer} 1, 2 or 3
    
    Returns:
    --------
    fit_score: {float}, fit score, a smaller value indidcate
                a overall better fit
    
    Examples:
    ---------
    fit_group = fit_group
    fit_pvals = fit_pvals
    buffer_group = buffer_group
    c = 0.1
    fscore = get_fit_score(fit_group, fit_pvals, buffer_group, c)
    """
    
    # weighted sum of pvalues 
    if t not in [1, 2, 3]:
        raise NameError('Error: type (t) is not legal value (1 or 2)!')
    
    wsum_pval = 0
    if t == 1:
        for g, v in fit_pvals.iteritems():
            wsum_pval += sum(np.array(v) * 1.0 / len(v))
    if t == 2:
        for g, v in fit_pvals.iteritems():
            wsum_pval += sum(np.array(v)) * 1.0 / (len(v) * len(v))
    if t == 3:
        num_users = 0
        for g, v in fit_pvals.iteritems():
            wsum_pval += sum(np.array(v)) * 1.0 / (len(v) * len(v))
            num_users += len(v)
        wsum_pval = num_users * 1.0 * wsum_pval

    penalty = c * len(buffer_group)
    fit_score = wsum_pval + penalty # smaller value indicates a better overall fit
    
    return fit_score

In [36]:
dist_metrics = {0: [0.1, 0.3, 0.6, 0], 1: [0.5, 0.1, 0.4, 0], 2: [0.25, 0.25, 0.25, 0.25]}
fit_group = {0:[1, 2, 4, 5], 1:[3, 6, 7], 2:[8, 9]}
unfit_group = {}
fit_pvals = {0:[0.2, 0.12, 0.04, 0.21], 1: [0.31, 0.22, 0.17], 2: [0.02, 0.05]}
buffer_group = []

threshold = 0.5
c = 0.1

In [37]:
# step 01: learn distance metrics
for g, uids in fit_group.iteritems():
    # function learn
    dist = [np.random.uniform(0, 1, 1)[0] for i in range(4)]
    dist_metrics[g] = dist

In [38]:
#unfit_group[g]
print "dist_metrics:",dist_metrics
print "fit_group:", fit_group 
print "fit_pvals:", fit_pvals
print "unfit_group:", unfit_group
print "buffer_group:", buffer_group

dist_metrics: {0: [0.088655901698830308, 0.2259672307541073, 0.036497357427248578, 0.36885043025878117], 1: [0.76391743027718451, 0.011670212567600591, 0.85608215240779029, 0.310455275695802], 2: [0.77879957146338263, 0.3269606537780031, 0.31095039311610195, 0.32162244322658917]}
fit_group: {0: [1, 2, 4, 5], 1: [3, 6, 7], 2: [8, 9]}
fit_pvals: {0: [0.2, 0.12, 0.04, 0.21], 1: [0.31, 0.22, 0.17], 2: [0.02, 0.05]}
unfit_group: {}
buffer_group: []


In [39]:
# step 02: update the member composite with updated group distance metrics
# threshold is needed to be defined
fit_group_copy = fit_group.copy()
for g, uids in fit_group_copy.iteritems():
    target_dist = dist_metrics[g]
    for uid in uids:
        # calcualte the ks-pvalue with update distance metrics
        # target_dist
        pval = np.random.uniform(0, 1, 1)[0]
        if pval >= threshold:
            # remove the user and its information 
            # from relevant container
            idx = [i for i, u in enumerate(fit_group[g]) if u == uid][0]
            fit_group[g].pop(idx)
            fit_pvals[g].pop(idx)
            # add the user to the unfit_group
            if g in unfit_group:
                unfit_group[g].append(uid)
            else:
                unfit_group[g] = [uid]

In [40]:
#unfit_group[g]
print "dist_metrics:",dist_metrics
print "fit_group:", fit_group 
print "fit_pvals:", fit_pvals
print "unfit_group:", unfit_group
print "buffer_group:", buffer_group

dist_metrics: {0: [0.088655901698830308, 0.2259672307541073, 0.036497357427248578, 0.36885043025878117], 1: [0.76391743027718451, 0.011670212567600591, 0.85608215240779029, 0.310455275695802], 2: [0.77879957146338263, 0.3269606537780031, 0.31095039311610195, 0.32162244322658917]}
fit_group: {0: [2, 4], 1: [6, 7], 2: [8, 9]}
fit_pvals: {0: [0.12, 0.04], 1: [0.22, 0.17], 2: [0.02, 0.05]}
unfit_group: {0: [1, 5], 1: [3]}
buffer_group: []


In [41]:
# step 03: test members in unfit_group to see
# if it has a good fit with other distmetrics
# make a copy of the buffer group container
buffer_group_copy = [i for i in buffer_group]
if len(buffer_group_copy) > 0:
    for uid in buffer_group_copy:
        new_group, new_pval = find_fit_group(uid, dist_metrics, threshold)
        if not np.isnan(new_pval):
            buffer_group.remove(uid)
            if new_group in fit_group:
                fit_group[new_group].append(uid)
                fit_pvals[new_group].append(new_pval)
            else:
                fit_group[new_group] = [uid]
                fit_pvals[new_group] = [new_pval]
                

unfit_group_copy = unfit_group.copy()
for g, uids in unfit_group_copy.iteritems():
    for uid in uids:        
        new_group, new_pval = find_fit_group(uid, dist_metrics, threshold, g)
        if np.isnan(new_pval):
            buffer_group.append(uid)
        else:
            unfit_group[g].remove(uid)
            if new_group in fit_group:
                fit_group[new_group].append(uid)
                fit_pvals[new_group].append(new_pval)
            else:
                fit_group[new_group] = [uid]
                fit_pvals[new_group] = [new_pval]

In [42]:
#unfit_group[g]
print "dist_metrics:",dist_metrics
print "fit_group:", fit_group 
print "fit_pvals:", fit_pvals
print "unfit_group:", unfit_group
print "buffer_group:", buffer_group

dist_metrics: {0: [0.088655901698830308, 0.2259672307541073, 0.036497357427248578, 0.36885043025878117], 1: [0.76391743027718451, 0.011670212567600591, 0.85608215240779029, 0.310455275695802], 2: [0.77879957146338263, 0.3269606537780031, 0.31095039311610195, 0.32162244322658917]}
fit_group: {0: [2, 4, 3], 1: [6, 7, 1], 2: [8, 9]}
fit_pvals: {0: [0.12, 0.04, 0.46019120599190588], 1: [0.22, 0.17, 0.21619468440433143], 2: [0.02, 0.05]}
unfit_group: {0: [5], 1: []}
buffer_group: []


In [45]:
# step 04: calculate current fscore
print get_fit_score(fit_pvals, buffer_group, c, t=1)
print get_fit_score(fit_pvals, buffer_group, c, t=2)
print get_fit_score(fit_pvals, buffer_group, c, t=3)

0.443795296799
0.153765098933
1.23012079146


In [114]:
package = {"dist_metrics": dist_metrics, 
           "fit_group": fit_group, 
           "buffer_group": buffer_group}

#fs_hist = []
#knowledge_pkg = []
fs_hist.append(fs)
knowledge_pkg.append(package)
best_fs = min(fs_hist)

if best_fs - fs <= min_delta_f:
    # _no_imp_counter defined prior to while()
    # while(_no_imp_counter < max_iter):
    _no_imp_counter += _no_imp_counter 
else:
    _no_imp_counter = 0

0.467770337717


#### Develop the iteractive learning framework

In [50]:
"""
input info.:
----------
profile_df
friend_networkx

control parameters:
-------------------
t: fit score type

tuning parameter:
-----------------
threshold: cutoff value for kstest
c: regularization strength
min_delta_f: threshold for significant improvement
max_iter: maxmium number of trivial trial learning in a row 
"""
# input info
# user_profile

# tuing parameters
t = 2
c = 0.1
threshold = 0.5
min_delta_f = 0.02
max_iter = 10

# initiate the containers:
dist_metrics = init_dict_list(k) # distance metrics containers
fit_group = init_dict_list(k)    # members composition in fit groups
fit_pvals = init_dict_list(k)    # members' pvalue of KStest with their group distance metrics
unfit_group = init_dict_list(k)  # members is not considerd fit by its group distance metrics
unfit_pvals = init_dict_list(k)  # pvalues for members in unfit_group (maybe can be deleted)
buffer_group = []                # members are not considered having fit

# results value
fs_hist = []       # list of fit scores in sequence (lastest one is the last)
knowledge_pkg = [] # {index: {"dist_metrics", "fit_group", "buffer_group"}} 

# calculate the the init distance metrics
# samping is subset of users to calculate
# the distance metrics is good method

# dist_metrics: ldm() with subset of users
# fit_group: subsets of users
# buffer_group: useres are not sampled 

# provide initial composition of fit_group
# and buffer_group for iterative learning
# procedure
# the even sampling strategy is implemeted
# here, however, 
samp_size = len(all_uids) / k 
samp_sizes = [samp_size] * k
all_uids_copy = [i for i in all_uids]

for g, samp_size in zip(range(k), samp_sizes):
    # draw samples and assign them to fit_group
    samples = choice(all_uids_copy, samp_size, replace=False)
    fit_group[g] = list(samples)
    # remove samples from population pool
    for uid in samples:
        all_uids_copy.remove(uid)

if len(all_uids_copy) > 0:
    buffer_group = all_uids_copy
else:
    buffer_group = []

_no_imp_counter = 0
_loop_counter = 0
while _no_imp_counter < max_iter:
    
    _loop_counter += 1
    print "%d iteration is in processing ..." % _loop_counter
    # step 01: learn distance metrics
    for g, uids in fit_group.iteritems():
        # learn distance metrics
        # here to update the computational mechanism
        dist = [np.random.uniform(0, 1, 1)[0] for i in range(4)]
        dist_metrics[g] = dist
        
    # step 02: update the member composite with updated group distance metrics
    # threshold is needed to be defined
    fit_group_copy = fit_group.copy()
    for g, uids in fit_group_copy.iteritems():
        target_dist = dist_metrics[g]
        for uid in uids:
            # calcualte the ks-pvalue with update distance metrics
            # target_dist
            pval = np.random.uniform(0, 1, 1)[0]
            if pval >= threshold:
                # remove the user and its information 
                # from relevant container
                idx = [i for i, u in enumerate(fit_group[g]) if u == uid][0]
                fit_group[g].pop(idx)
                fit_pvals[g].pop(idx)
                # add the user to the unfit_group
                if g in unfit_group:
                    unfit_group[g].append(uid)
                else:
                    unfit_group[g] = [uid]
                    
    # step 03: test members in unfit_group to see
    # if it has a good fit with other distmetrics
    # make a copy of the buffer group container
    buffer_group_copy = [i for i in buffer_group]
    if len(buffer_group_copy) > 0:
        for uid in buffer_group_copy:
            new_group, new_pval = find_fit_group(uid, dist_metrics, threshold)
            if not np.isnan(new_pval):
                buffer_group.remove(uid)
                if new_group in fit_group:
                    fit_group[new_group].append(uid)
                    fit_pvals[new_group].append(new_pval)
                else:
                    fit_group[new_group] = [uid]
                    fit_pvals[new_group] = [new_pval]
                    
    unfit_group_copy = unfit_group.copy()
    for g, uids in unfit_group_copy.iteritems():
        for uid in uids:        
            new_group, new_pval = find_fit_group(uid, dist_metrics, threshold, g)
            if np.isnan(new_pval):
                buffer_group.append(uid)
            else:
                unfit_group[g].remove(uid)
                if new_group in fit_group:
                    fit_group[new_group].append(uid)
                    fit_pvals[new_group].append(new_pval)
                else:
                    fit_group[new_group] = [uid]
                    fit_pvals[new_group] = [new_pval]
                    
    # step 04: calculate fit score
    fs = get_fit_score(fit_pvals, buffer_group, c=c, t=1)
    fs_hist.append(fs)
    
    # step 05: evaluate stop criteria
    package = {"dist_metrics": dist_metrics, 
               "fit_group": fit_group, 
               "buffer_group": buffer_group}

    knowledge_pkg.append(package)
    best_fs = min(fs_hist)

    if best_fs - fs <= min_delta_f:
        _no_imp_counter += _no_imp_counter 
    else:
        _no_imp_counter = 0
        
    print "fit score (type-%d): %.3f" % (t, fs)
    print "best fit score: %.3f" % best_fs

1 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
2 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
3 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
4 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
5 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
6 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
7 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
8 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
9 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
10 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
11 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
12 iteration is in processing ...
fit score (type-2): 0.000
best fit score: 0.000
13 iteration is in proces

KeyboardInterrupt: 

In [80]:
all_uids = [i + 1 for i in range(100)]

k = 3
samp_size = len(all_uids) / k 
samp_sizes = [samp_size] * k
all_uids_copy = [i for i in all_uids]

for g, samp_size in zip(range(k), samp_sizes):
    # draw samples and assign them to fit_group
    samples = choice(all_uids_copy, samp_size, replace=False)
    fit_group[g] = list(samples)
    # remove samples from population pool
    for uid in samples:
        all_uids_copy.remove(uid)

if len(all_uids_copy) > 0:
    buffer_group = all_uids_copy
else:
    buffer_group = []

In [81]:
print fit_group
print buffer_group

{0: [36, 35, 24, 19, 43, 34, 40, 20, 42, 15, 83, 12, 74, 25, 48, 71, 17, 95, 88, 49, 23, 99, 67, 55, 65, 10, 33, 89, 53, 27, 90, 37, 91], 1: [63, 76, 78, 13, 79, 14, 97, 3, 26, 16, 41, 64, 82, 62, 32, 86, 39, 69, 28, 29, 60, 9, 68, 92, 51, 54, 47, 8, 45, 21, 6, 85, 66], 2: [18, 77, 22, 5, 96, 81, 7, 31, 80, 58, 4, 2, 44, 38, 70, 75, 30, 59, 98, 87, 72, 84, 57, 100, 61, 46, 52, 94, 11, 73, 1, 93, 50]}
[56]
