Develop learning_wrapper algorithm

Utilize the combinations of UserBatch (class) and learning_dist_metrics

[pseudocode active weighting vector learning algorithm]
*step 1*: compute the aggregate distance information with argument of weighting vector [w]:
        a. sum of weighted distances of friends
        b. sum of weighted distances of non-friends
                               
*step 2*: [w] = argmin over [w] of: sum(distances of same class) / sum(distances of different classes)
 
*step 3*: per user, test whether the location of the friend-distance distribution differs
        from the location of the non-friend-distance distribution
                               
*step 4*: group users based on the previous test (parameters: distribution type, compared non-friend sample size):
        a. a user is assigned to group 1, if its two-type distance distributions location
           are significantly different;
        b. a user is assigned to group 2, if its two-type distance distributions location
           are not significantly different.
               
*step 5*: apply steps 1-4 on the users of group 1, and split the users of this group into: group 1 (with significant
        difference) and group 3 (with insignificant difference);
        Similarly, apply steps 1-4 on the users of group 2, and split the users of this group into: group 2 (with significant
        difference) and group 3 (with insignificant difference).
 
*step 6*: repeat the 5th step until the stopping threshold is met:
        a. fixed iteration, or;
        b. insubstantial change in either weighting vector, or;
        c. insubstantial change in members in group 1-3.

In [11]:
import os
import sys
import glob
import numpy as np
import scipy as sp
import pandas as pd
from matplotlib import pyplot
from scipy.stats import rayleigh
from scipy.stats import ks_2samp
from numpy import linspace
from numpy.random import choice

from learning_dist_metrics.ldm import LDM
from learning_dist_metrics.dist_metrics import weighted_euclidean

import matplotlib.pyplot as plt
%matplotlib inline

Load the simulated data sets; the friendships were generated based on a "hand-shaking" protocol.

In [2]:
DATA_PATH = "./data/sim_data_yi/"

# every input is a comma-separated file whose first row holds the column names
users_df = pd.read_csv(os.path.join(DATA_PATH, "users_profile.csv"),
                       sep=",", header=0)
friends_df = pd.read_csv(os.path.join(DATA_PATH, "friendships.csv"),
                         sep=",", header=0)
dist_df = pd.read_csv(os.path.join(DATA_PATH, "dist_mat.csv"),
                      sep=",", header=0)

In [3]:
## friends_df is processed
## a. exclude user-pair of non-friends
## b. create a new column to denote the user pair as a tuple (uid_a, uid_b)
## c. drop the 'isFriend' column and put "pair" first
# .copy() detaches the filtered rows from the frame that was read from disk,
# so the column assignment below is not applied to a slice of another frame
friends_df = friends_df[friends_df.isFriend == 1].copy()
# build the tuples straight from the two ID columns; int() keeps the IDs as
# plain Python integers, as before
friends_df["pair"] = [(int(a), int(b))
                      for a, b in zip(friends_df.uid_a, friends_df.uid_b)]
# selecting the three wanted columns also leaves 'isFriend' out
friends_df = friends_df[["pair", "uid_a", "uid_b"]]
friends_df.head(3)

Unnamed: 0,pair,uid_a,uid_b
1,"(0, 2)",0,2
3,"(0, 4)",0,4
4,"(0, 5)",0,5


In [4]:
cols = ["x0", "x1", "x2", "x3", "x4", "x5"]

## subset users data to retain profile only
profile_df = users_df[["ID"] + cols]

## learn the feature weights from all profiles and all friend pairs
ldm = LDM()
# .values yields the same array of pairs as the removed Series.as_matrix()
ldm.fit(users_df[cols], friends_df.pair.values)

--- 28.0377209187 seconds ---


<learning_dist_metrics.ldm.LDM at 0x7fbe38fe0a50>

In [7]:
## unique user IDs found in the profile table
all_user_ids = list(set(users_df.ID))
## feature weights learned by the LDM instance fitted above
the_weights = ldm.get_transform_matrix()
# single-argument print(...) behaves the same on Python 2 and Python 3
print(the_weights)

[0.39, 0.61, 0.0, 0.0, 0.0, 0.0]


#### learning wrapper functions
* user_grouped_dist()
* user_dist_kstest()
* users_filter_by_weights()
* ldm_train_with_list

In [17]:
def user_grouped_dist(user_id, weights, profile_df, friends_df):
    """ Calculate the weighted distances between a user and the user's
        friends, and between the user and the user's non-friends.

        Parameters:
        ----------
        * user_id: {integer}, the target user's ID
        * weights: {vector-like, float}, the vector of feature weights which
            is extracted by LDM().fit(x, y).get_transform_matrix()
        * profile_df: {matrix-like, pandas.DataFrame}, user profile dataframe
            with columns: ["ID", "x0" - "xn"]
        * friends_df: {matrix-like, pandas.DataFrame}, pandas.DataFrame store
            pair of user ID(s) to represent connections with columns:
            ["uid_a", "uid_b"]

        Returns:
        -------
        res: {list of two lists}, [sim_dist_vec, diff_dist_vec]:
            res[0] holds the distances between the user and each friend,
            res[1] holds the distances between the user and every other user
            in profile_df who is neither a friend nor the user itself.

        Examples:
        ---------
        weights = ldm().fit(df, friends_list).get_transform_matrix()
        profile_df = users_df[ ["ID"] + cols ]
        friend_dists, nonfriend_dists = user_grouped_dist(
            0, weights, profile_df, friends_df)
    """
    # compare by value: "is not" against a string literal tests identity
    cols = [col for col in profile_df.columns if col != "ID"]
    # get the user profile information of the target user
    user_profile = profile_df.loc[profile_df.ID == user_id, cols].values

    # the target user may sit on either side of a pair; the set union keeps
    # each friend once even when a pair is stored in both directions
    friends_ls_a = friends_df[friends_df.uid_a == user_id].uid_b.values
    friends_ls_b = friends_df[friends_df.uid_b == user_id].uid_a.values
    friend_ids = set(friends_ls_a) | set(friends_ls_b)

    def dist_to(other_id):
        # weighted distance between the target user and another user
        other_profile = profile_df.loc[profile_df.ID == other_id, cols].values
        return weighted_euclidean(user_profile, other_profile, weights)

    # calculate the weighted distances of friends
    sim_dist_vec = [dist_to(f_id) for f_id in friend_ids]

    # calculate the weighted distances of non-friends; the exclusion set is
    # built once so every membership test is a constant-time lookup
    excluded = friend_ids.union([user_id])
    diff_dist_vec = [dist_to(u) for u in profile_df.ID if u not in excluded]

    return [sim_dist_vec, diff_dist_vec]


def user_dist_kstest(sim_dist_vec, diff_dist_vec):
    """ Compare a user's friend distances with the user's non-friend
        distances through a two-sample KS test.

        A Rayleigh distribution is fitted to each distance vector, 100 random
        values are drawn from each fitted distribution, and the KS test is
        run on the two sets of draws. The draws are random, so the p-value
        can vary from call to call.

        Parameters:
        ----------
        * sim_dist_vec: {vector-like (list), float}: distances between friends
            and the user
        * diff_dist_vec: {vector-like (list), float}: distances between
            non-friends and the user

        Returns:
        -------
        * res: {float}: p-value of the two-sample ks-test on the draws from
            the two fitted Rayleigh distributions.

        Examples:
        ---------
        pval = user_dist_kstest(sim_dist_vec, diff_dist_vec)
    """
    sample_size = 100

    # fitted parameters, passed positionally to rvs in the same order
    f_loc, f_scale = rayleigh.fit(sim_dist_vec)[:2]
    nf_loc, nf_scale = rayleigh.fit(diff_dist_vec)[:2]

    # friends are sampled first, then non-friends
    friend_draws = rayleigh.rvs(f_loc, f_scale, sample_size)
    nonfriend_draws = rayleigh.rvs(nf_loc, nf_scale, sample_size)

    # ks_2samp returns (statistic, p-value); only the p-value is reported
    return ks_2samp(friend_draws, nonfriend_draws)[1]


def users_filter_by_weights(weights, profile_df, friends_df,
                            pval_threshold=0.20, min_friend_cnt=10):
    """ Split all users found in profile_df into a "good fit" group and a
        "bad fit" group according to a per-user ks-test.

        For every user, the weighted distances to friends and to non-friends
        are computed with user_grouped_dist() and compared with
        user_dist_kstest(). The null hypothesis is that the friends' distance
        distribution does not differ from the non-friends' one; a user whose
        p-value is at or below pval_threshold rejects it and is a good fit.

        Parameters:
        ----------
        * weights: {vector-like, float}, the vector of feature weights which
            is extracted by LDM().fit(x, y).get_transform_matrix()
        * profile_df: {matrix-like, pandas.DataFrame}, user profile dataframe
            with columns: ["ID", "x0" - "xn"]
        * friends_df: {matrix-like, pandas.DataFrame}, pandas.DataFrame store
            pair of user ID(s) to represent connections with columns:
            ["uid_a", "uid_b"]
        * pval_threshold: {float}, the threshold for p-value to reject the
            null hypothesis
        * min_friend_cnt: {integer}, intended minimum number of friends a
            user needs to be kept; accepted but currently ignored

        Returns:
        -------
        res: {list} grouped list of user ids
           res[0] stores the users whose p-value <= pval_threshold
           res[1] stores the users whose p-value > pval_threshold

        Examples:
        --------
        weights = ldm().fit(df, friends_list).get_transform_matrix()
        profile_df = users_df[["ID"] + cols]
        grouped_users = users_filter_by_weights(weights,
                            profile_df, friends_df,
                            pval_threshold = 0.10, min_friend_cnt = 10)

        Notes:
        -----
        min_friend_cnt is not implemented
    """
    # containers for users on either side of the threshold
    good_fits, bad_fits = [], []

    # every distinct user ID is tested exactly once
    for uid in set(profile_df.ID):
        friend_dists, nonfriend_dists = user_grouped_dist(
            uid, weights, profile_df, friends_df)
        pval = user_dist_kstest(friend_dists, nonfriend_dists)
        target = good_fits if pval <= pval_threshold else bad_fits
        target.append(uid)

    return [good_fits, bad_fits]

def user_grouped_dist(user_id, weights, profile_df, friends_df):
    """ Calculate the weighted distances between a user and the user's
        friends, and between the user and the user's non-friends.

        Parameters:
        ----------
        * user_id: {integer}, the target user's ID
        * weights: {vector-like, float}, the vector of feature weights which
            is extracted by LDM().fit(x, y).get_transform_matrix()
        * profile_df: {matrix-like, pandas.DataFrame}, user profile dataframe
            with columns: ["ID", "x0" - "xn"]
        * friends_df: {matrix-like, pandas.DataFrame}, pandas.DataFrame store
            pair of user ID(s) to represent connections with columns:
            ["uid_a", "uid_b"]

        Returns:
        -------
        res: {list of two lists}, [sim_dist_vec, diff_dist_vec]:
            res[0] holds the distances between the user and each friend,
            res[1] holds the distances between the user and every other user
            in profile_df who is neither a friend nor the user itself.

        Examples:
        ---------
        weights = ldm().fit(df, friends_list).get_transform_matrix()
        profile_df = users_df[ ["ID"] + cols ]
        friend_dists, nonfriend_dists = user_grouped_dist(
            0, weights, profile_df, friends_df)
    """
    # compare by value: "is not" against a string literal tests identity
    cols = [col for col in profile_df.columns if col != "ID"]
    # get the user profile information of the target user
    user_profile = profile_df.loc[profile_df.ID == user_id, cols].values

    # the target user may sit on either side of a pair; the set union keeps
    # each friend once even when a pair is stored in both directions
    friends_ls_a = friends_df[friends_df.uid_a == user_id].uid_b.values
    friends_ls_b = friends_df[friends_df.uid_b == user_id].uid_a.values
    friend_ids = set(friends_ls_a) | set(friends_ls_b)

    def dist_to(other_id):
        # weighted distance between the target user and another user
        other_profile = profile_df.loc[profile_df.ID == other_id, cols].values
        return weighted_euclidean(user_profile, other_profile, weights)

    # calculate the weighted distances of friends
    sim_dist_vec = [dist_to(f_id) for f_id in friend_ids]

    # calculate the weighted distances of non-friends; the exclusion set is
    # built once so every membership test is a constant-time lookup
    excluded = friend_ids.union([user_id])
    diff_dist_vec = [dist_to(u) for u in profile_df.ID if u not in excluded]

    return [sim_dist_vec, diff_dist_vec]


def user_dist_kstest(sim_dist_vec, diff_dist_vec):
    """ Compare a user's friend distances with the user's non-friend
        distances through a two-sample KS test.

        A Rayleigh distribution is fitted to each distance vector, 100 random
        values are drawn from each fitted distribution, and the KS test is
        run on the two sets of draws. The draws are random, so the p-value
        can vary from call to call.

        Parameters:
        ----------
        * sim_dist_vec: {vector-like (list), float}: distances between friends
            and the user
        * diff_dist_vec: {vector-like (list), float}: distances between
            non-friends and the user

        Returns:
        -------
        * res: {float}: p-value of the two-sample ks-test on the draws from
            the two fitted Rayleigh distributions.

        Examples:
        ---------
        pval = user_dist_kstest(sim_dist_vec, diff_dist_vec)
    """
    sample_size = 100

    # fitted parameters, passed positionally to rvs in the same order
    f_loc, f_scale = rayleigh.fit(sim_dist_vec)[:2]
    nf_loc, nf_scale = rayleigh.fit(diff_dist_vec)[:2]

    # friends are sampled first, then non-friends
    friend_draws = rayleigh.rvs(f_loc, f_scale, sample_size)
    nonfriend_draws = rayleigh.rvs(nf_loc, nf_scale, sample_size)

    # ks_2samp returns (statistic, p-value); only the p-value is reported
    return ks_2samp(friend_draws, nonfriend_draws)[1]


def users_filter_by_weights(weights, users_list,
                            profile_df, friends_df,
                            pval_threshold=0.20, min_friend_cnt=10):
    """ Split the given users into a "good fit" group and a "bad fit" group
        according to a per-user ks-test.

        For every user in users_list, the weighted distances to friends and
        to non-friends are computed with user_grouped_dist() and compared
        with user_dist_kstest(). The null hypothesis is that the friends'
        distance distribution does not differ from the non-friends' one; a
        user whose p-value is at or below pval_threshold rejects it and is a
        good fit.

        Parameters:
        ----------
        * weights: {vector-like, float}, the vector of feature weights which
            is extracted by LDM().fit(x, y).get_transform_matrix()
        * users_list: {vector-like, integer}, the list of user id to test
        * profile_df: {matrix-like, pandas.DataFrame}, user profile dataframe
            with columns: ["ID", "x0" - "xn"]
        * friends_df: {matrix-like, pandas.DataFrame}, pandas.DataFrame store
            pair of user ID(s) to represent connections with columns:
            ["uid_a", "uid_b"]
        * pval_threshold: {float}, the threshold for p-value to reject the
            null hypothesis
        * min_friend_cnt: {integer}, intended minimum number of friends a
            user needs to be kept; accepted but currently ignored

        Returns:
        -------
        res: {list} grouped list of user ids
           res[0] stores the users whose p-value <= pval_threshold
           res[1] stores the users whose p-value > pval_threshold

        Examples:
        --------
        weights = ldm().fit(df, friends_list).get_transform_matrix()
        profile_df = users_df[["ID"] + cols]
        users_list = list(set(profile_df.ID))
        grouped_users = users_filter_by_weights(weights, users_list,
                            profile_df, friends_df,
                            pval_threshold = 0.10, min_friend_cnt = 10)

        Notes:
        -----
        min_friend_cnt is not implemented
    """
    # containers for users on either side of the threshold
    good_fits, bad_fits = [], []

    # users are tested in the order given by the caller
    for uid in users_list:
        friend_dists, nonfriend_dists = user_grouped_dist(
            uid, weights, profile_df, friends_df)
        pval = user_dist_kstest(friend_dists, nonfriend_dists)
        target = good_fits if pval <= pval_threshold else bad_fits
        target.append(uid)

    return [good_fits, bad_fits]

def ldm_train_with_list(users_list,
                        profile_df, friends_df,
                        retain_type=0):
    """ Learn a distance metric with an LDM() instance, using only the
        friendships that involve the selected users.

        Parameters:
        -----------
        * users_list: {vector-like, integer}, the list of user id
        * profile_df: {matrix-like, pandas.DataFrame}, user profile dataframe
            with columns: ["ID", "x0" - "xn"]
        * friends_df: {matrix-like, pandas.DataFrame}, pandas.DataFrame store
            pair of user ID(s) to represent connections with columns:
            ["pair", "uid_a", "uid_b"]; the "pair" column is what is handed
            to LDM().fit()
        * retain_type: {integer}, 0, adopting 'or' logic by keeping relation
            -ship in friends_df if either of entities is in user_list;
            any other value, adopting 'and' logic (both entities must be in
            user_list)

        Returns:
        -------
        res: {vector-like, float}, output of ldm.get_transform_matrix()

        Examples:
        ---------
        new_dist_metrics = ldm_train_with_list(user_list,
                                               profile_df,
                                               friends_df)
    """
    in_a = friends_df.uid_a.isin(users_list)
    in_b = friends_df.uid_b.isin(users_list)
    # 'or' keeps a pair when either member is selected, 'and' needs both
    keep = (in_a | in_b) if retain_type == 0 else (in_a & in_b)
    # .loc with a boolean mask selects the same rows as the removed .ix
    friends_df = friends_df.loc[keep]

    # all profiles are used for fitting; only the friendships are filtered
    cols = profile_df.columns.drop("ID")
    ldm = LDM()
    # .values yields the same array of pairs as the removed as_matrix()
    ldm.fit(profile_df[cols], friends_df.pair.values)
    return ldm.get_transform_matrix()

In this section, I will simulate the iterative learning process by running the steps manually, one at a time.

In [13]:
## split all users into two groups with the weights learned from all users
all_users_ids = list(set(profile_df.ID))
# every argument is passed by keyword: a positional argument may not follow
# a keyword argument
g0, g1 = users_filter_by_weights(the_weights, users_list=all_users_ids,
                                 profile_df=profile_df, friends_df=friends_df,
                                 pval_threshold=0.20, min_friend_cnt=10)

### 1st round learning

In [19]:
## define the data utilization type
RETAIN_TYPE = 0

g0_dist = ldm_train_with_list(g0, profile_df, friends_df, retain_type=RETAIN_TYPE)
g1_dist = ldm_train_with_list(g1, profile_df, friends_df, retain_type=RETAIN_TYPE)

print "Group 0's learned distance metrics:"
print g0_dist

print "Group 1's learned distance metrics:"
print g1_dist

--- 28.708357811 seconds ---
--- 39.5549061298 seconds ---
Group 0's learned distance metrics:
[0.45, 0.54, 0.0, 0.0, 0.01, 0.0]
Group 1's learned distance metrics:
[0.41, 0.59, 0.0, 0.0, 0.0, 0.0]


In [20]:
## re-test the users against the weights learned from group 0
# users_filter_by_weights takes the users to test as its second argument;
# every user in profile_df is passed here.
# NOTE(review): to split only the members of group 0 (step 5 of the
# pseudocode), pass g0 as the users list instead -- confirm the intent.
g0_good, g0_bad = users_filter_by_weights(g0_dist, list(set(profile_df.ID)),
                                          profile_df, friends_df, 0.20, 10)
print("# of good fits: %d, # of bad fits: %d" % (len(g0_good), len(g0_bad)))

# of good fits: 42, # of bad fits: 58
