### UserBatch Class Development to Provide Easy Access API

* load_users

* cal_distances

Update pairwise distance based on new distance function

* list_dist_distr(taste_group=[None, 0, 1])

Return the pairwise distance values for a particular taste group.
  
* plot_dist_distr(taste_group=[None, 0, 1])

Visualize the distance distribution colored by taste group

##### 2. Access User-wise Information
* get_user_profile(user_id)

* get_user_friends(user_id)

* list_friends_dist(user_id)

In [2]:
import numpy as np
import scipy as sp
import pandas as pd

from itertools import combinations
from scipy.spatial.distance import euclidean

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
from itertools import combinations
from scipy.spatial.distance import euclidean
from progressbar import ProgressBar

class UserBatch(object):
    """Hold a table of users, their confirmed friendships and pairwise distances.

    Attributes
    ----------
    users: DataFrame with columns "ID", "decision_style", then feature columns
    all_user_id: sorted list of the distinct values found in users["ID"]
    friends: DataFrame with one row per confirmed friendship
    dist_df: DataFrame with columns
             "pair_index", "id_a", "id_b", "dist_a2b", "dist_b2a"
    """

    def __init__(self):
        self.users = pd.DataFrame([])
        self.all_user_id = []
        self.friends = pd.DataFrame([])
        self.dist_df = pd.DataFrame(columns = ["pair_index", "id_a", "id_b", "dist_a2b", "dist_b2a"])
        # NOTE: cal_distances() currently relies on the module-level
        # decision_rule() function, not on these three attributes.
        self.dist_func = euclidean
        self.dist_func_type01 = euclidean
        self.dist_func_type02 = euclidean

    def __repr__(self):
        return "{0} unique users included with {1} friendship(s) confirmed.".format(
            len(self.users), self.friends.shape[0])

    def load_users(self, _users):
        """ Store _users and refresh the list of distinct user ids

            Parameters:
            -----------
            _users: {DataFrame(Pandas)}, columns: "ID", "decision_style", "x1" - "xn"
        """
        self.users = _users
        # Read the ids from the argument itself, not from a global variable.
        self.all_user_id = sorted(set(_users.loc[:, "ID"]))
        # Report the new state; a bare self.__repr__() call shows nothing.
        print(self)

    def load_friends(self, _friends):
        """ Store _friends as the table of confirmed friendships

            Parameters:
            -----------
            _friends: {DataFrame(Pandas)}, one row per confirmed friendship
        """
        self.friends = _friends
        print(self)

    def cal_distances(self):
        """ (Re)calculate the pair-wise distances among users

            The result replaces .dist_df and has the columns

            pair_index | id_a | id_b | dist_a2b | dist_b2a

            where id_a < id_b, dist_a2b is the distance judged with the
            decision style of user a and dist_b2a the distance judged with
            the decision style of user b.
        """
        columns = ["pair_index", "id_a", "id_b", "dist_a2b", "dist_b2a"]
        all_pairs = list(combinations(self.all_user_id, 2))
        if not all_pairs:
            # Fewer than two users: nothing to compare.
            self.dist_df = pd.DataFrame(columns = columns)
            return

        # Index group and feature vector by user id once instead of running
        # a boolean-mask lookup for every pair. Column 1 is the decision
        # style, columns 2.. are the features. For a repeated id the first
        # row wins.
        lookup = {}
        groups = self.users.iloc[:, 1].values
        profiles = self.users.iloc[:, 2:].values
        for uid, group, profile in zip(self.users["ID"].values, groups, profiles):
            lookup.setdefault(uid, (group, profile))

        pbar = ProgressBar(maxval = len(all_pairs)).start()
        rows = []
        for i, (a, b) in enumerate(all_pairs):
            if b < a:
                # Keep the smaller id in the "a" slot.
                a, b = b, a
            a_group, a_profile = lookup[a]
            b_group, b_profile = lookup[b]

            dist_a2b = decision_rule(a_profile, b_profile, a_group)
            if a_group != b_group:
                dist_b2a = decision_rule(b_profile, a_profile, b_group)
            else:
                # Same rule on both sides, so the distance is symmetric.
                dist_b2a = dist_a2b

            rows.append({"pair_index": (a, b), "id_a": a, "id_b": b,
                         "dist_a2b": dist_a2b, "dist_b2a": dist_b2a})
            pbar.update(i + 1)
        pbar.finish()

        # Build the frame in one go; DataFrame.append returned a copy that
        # was being discarded, so .dist_df never received any row.
        self.dist_df = pd.DataFrame(rows, columns = columns)

    def list_dist_distr(self, taste_group=None):
        """ List pair-wise distance of the group specified
            by taste_group (not implemented yet)
        """
        pass

    def plot_dist_distr(self, taste_group=None):
        """ Plot the histogram of distances of the group specified
            by taste_group, bars colored by connection status [0, 1]
            (not implemented yet)
        """
        pass

ImportError: No module named progressbar

In [4]:
from numpy.random import binomial

## ######################### ##
## GENERATE EXPERIMENT DATA  ##
## ######################### ##
# Fixed seed so the simulated population is reproducible.
np.random.seed(20150408)
n_samples = 100

# The draws are made in a fixed order (decision style first, then x0..x5)
# because they all consume the same global random stream.
style_draws = np.random.choice([0, 1], size = n_samples, replace=True, p = (0.7, 0.3))
feature_draws = [np.random.uniform(0, 1, n_samples) for _ in range(6)]

user_columns = {"ID": range(n_samples), "decision_style": style_draws}
for feature_no, draw in enumerate(feature_draws):
    user_columns["x%d" % feature_no] = draw

users = pd.DataFrame(user_columns)

## ########################## ##
## CUSTOM FUNCTION            ##
## -------------------------- ##
## DATA SIMULATION LOGIC      ##
## ########################## ##
def dist_func(a, b, weights = None):
    """Return the weighted Euclidean distance between two profiles.

    Both profiles are multiplied element-wise by ``weights`` before the
    Euclidean distance is taken.

    Parameters:
    -----------
    a, b: array-like feature vectors; a single-row 2-D array (shape (1, n))
          is flattened to 1-D
    weights: array-like of per-feature weights, or None for all ones

    Returns:
    --------
    float, the distance between the weighted vectors
    """
    # Flatten so row vectors sliced from a DataFrame work with euclidean().
    a = np.asarray(a, dtype = float).ravel()
    b = np.asarray(b, dtype = float).ravel()

    # "is None", not "== None": comparing an array to None is element-wise.
    if weights is None:
        weights = np.ones(len(a))
    else:
        weights = np.asarray(weights, dtype = float).ravel()

    return euclidean(a * weights, b * weights)
 
def is_friends(dist, threshold = 0.3, p_near = 0.9, p_far = 0.1):
    """Return 1 if the user decides to become a friend, otherwise 0.

       The decision is a single Bernoulli draw whose success
       probability depends on the distance between the two users.

       Decision logic:
       ===============
       if dist <= threshold, P(friend) = p_near (default 0.9)
       else,                 P(friend) = p_far  (default 0.1)

       Parameters:
       -----------
       dist: distance between the two users
       threshold: largest distance still counted as "near"
       p_near: probability of accepting a near user
       p_far: probability of accepting a far user
    """
    if dist <= threshold:
        res = binomial(1, p_near)
    else:
        res = binomial(1, p_far)
    return res
 
def decision_rule_01(a, b):
    """Distance between a and b with weight placed only on the first
       three features (0.5, 0.5, 0.1); the remaining three get weight 0.
    """
    return dist_func(a, b, [0.5, 0.5, 0.1, 0., 0., 0.])
 
def decision_rule_02(a, b):
    """Distance between a and b with weight placed only on the last
       three features (0.5, 0.3, 0.2); the first three get weight 0.
    """
    return dist_func(a, b, [0., 0., 0., 0.5, 0.3, 0.2])
 
def decision_rule(a, b, type_idx):
    """ Single entry point for the distance rules.

        type_idx 0 selects decision_rule_01, type_idx 1 selects
        decision_rule_02, and any other value falls back to the
        unweighted dist_func.
    """
    if type_idx == 0:
        return decision_rule_01(a, b)
    if type_idx == 1:
        return decision_rule_02(a, b)
    return dist_func(a, b)
 
def make_connection(decision_a, decision_b):
    """Return 1 only when both decisions equal 1, meaning the two
       parties agree to be connected; return 0 in every other case.
    """
    return 1 if decision_a == 1 and decision_b == 1 else 0
 
def friendship_agreement(a_id, b_id,
                         a_profile, b_profile,
                         a_dec_rule, b_dec_rule):
    """ Simulate whether users a and b end up connected.

        Each side measures the distance to the other with its own
        decision rule, draws its own accept/reject decision, and the
        pair is connected only if both accept.

        Returns [smaller_id, larger_id, is_connected].
    """
    a_to_b = decision_rule(a_profile, b_profile, a_dec_rule)
    b_to_a = decision_rule(b_profile, a_profile, b_dec_rule)
    # a's decision is drawn before b's.
    is_connected = make_connection(is_friends(a_to_b), is_friends(b_to_a))
    first_id, second_id = (b_id, a_id) if a_id > b_id else (a_id, b_id)
    return [first_id, second_id, is_connected]

## ########################## ##
## Simulate User Relationship ##
## ########################## ##
all_users = set(users.loc[:, "ID"])
n_slots = len(all_users) * len(all_users)
# Pre-fill with NaN so that any row not given real values is removed by
# dropna() below; np.empty would leave arbitrary numbers in those rows.
rel_df = np.full([n_slots, 3], np.nan)
dist_df = np.full([n_slots, 4], np.nan)

# Look up each user's decision style and 1-D feature vector once, instead
# of searching the whole table for both users inside the double loop.
# Columns 2.. hold the features. For a repeated id the first row wins.
user_info = {}
for uid, style, profile in zip(users["ID"].values,
                               users["decision_style"].values,
                               users.iloc[:, 2:].values):
    user_info.setdefault(uid, (style, profile))

# NOTE(review): every unordered pair is visited twice, as (a, b) and (b, a),
# and friendship_agreement draws fresh random decisions each time, so rel_df
# can hold two rows for the same pair with different outcomes.
row_counter = 0
for a_id in all_users:
    a_dec_rule, a_profile = user_info[a_id]
    for b_id in all_users:
        if a_id != b_id:
            b_dec_rule, b_profile = user_info[b_id]

            dist_df[row_counter, :] = [a_id, b_id,
                                       decision_rule(a_profile, b_profile, a_dec_rule),
                                       decision_rule(b_profile, a_profile, b_dec_rule)]
            rel_df[row_counter, :] = \
                friendship_agreement(a_id, b_id, a_profile, b_profile,
                                     a_dec_rule, b_dec_rule)
        else:
            # Self-pair: mark both tables with NaN so dropna() removes it.
            rel_df[row_counter, :] = [a_id, b_id, np.nan]
            dist_df[row_counter, :] = [a_id, b_id, np.nan, np.nan]
        row_counter += 1

## Convert np.array() to pd.DataFrame()
dist_df = pd.DataFrame(dist_df, columns = ["uid_a", "uid_b", "distance_a2b", "distance_b2a"])
rel_df = pd.DataFrame(rel_df, columns = ["uid_a", "uid_b", "isFriend"])
## Drop A-A pairs
dist_df = dist_df.dropna()
rel_df = rel_df.dropna()

In [5]:
# Keep only confirmed friendships and the two id columns. The explicit
# copy makes the column assignment below act on an independent frame.
rel_df = rel_df.loc[rel_df["isFriend"] == 1, ["uid_a", "uid_b"]].copy()
# One (uid_a, uid_b) tuple of ints per row, aligned on the frame's index.
pair_idx = pd.Series([(int(id_a), int(id_b))
                      for id_a, id_b in zip(rel_df["uid_a"], rel_df["uid_b"])],
                     index = rel_df.index, dtype = object)
rel_df["pair_idx"] = pair_idx

In [6]:
# Create the batch container, then hand it the simulated user table and
# the friendship table (rel_df was filtered to isFriend == 1 above).
UBatch = UserBatch()
UBatch.load_users(users)
UBatch.load_friends(rel_df)

NameError: name 'UserBatch' is not defined

In [97]:
# Compute the pair-wise distances for every pair of loaded user ids.
UBatch.cal_distances()



In [87]:
# Show the distance table held by the batch.
UBatch.dist_df

Unnamed: 0,pair_index,id_a,id_b,dist_a2b,dist_b2a
