<h2> We try another NLM where nest1 = 1,2 and nest2 = 3,5. We remove 4 </h2>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions_NLM import estimate_nested_logit, find_clusters
from functions_IB import information_bottleneck, information_bottleneck_convergence, IB_curve
from functions_geom_DIB import geom_DIB, DIB_curve

In [2]:
# Define log-likelihood function for telephone data
# beta will be beta = ["ASC_1", "ASC_3", "ASC_5", "BETA_COST", "lambda_1", "lambda_2"]

def log_likelihood_telephone2(beta, data):
    # Define utility functions
    data['U_1'] = beta[0] + beta[3] * data['logcost1'] 
    data['U_2'] = beta[3] * data['logcost2'] 
    data['U_3'] = beta[1] + beta[3] * data['logcost3']
    data['U_5'] = beta[2] + beta[3] * data['logcost5']
    
    # Calculate logsum for nests with > 1 alt
    data['logsum_1'] = np.log(data['avail1'] * np.exp(data['U_1'] / beta[4])
                                        + data['avail2'] * np.exp(data['U_2'] / beta[4])
                                        + (1 - data['avail1']) * (1 - data['avail2']))
    data['logsum_2'] = np.log(data['avail3'] * np.exp(data['U_3'] / beta[5])
                                    + data['avail5'] * np.exp(data['U_5'] / beta[5])
                                    + (1 - data['avail3']) * (1 - data['avail5']))

    # Nest probabilities
    data['P_nest_1'] = np.exp(beta[4] * data['logsum_1']) / \
                                 (np.exp(beta[4] * data['logsum_1']) 
                                  + np.exp(beta[5] * data['logsum_2']))
    data['P_nest_2'] = 1 - data['P_nest_1']
    
    # Within nest probabilities for nests with > 1 alt
    data['P_1_in_nest1'] = data['avail1'] * np.exp(data['U_1'] / beta[4]) / \
                                (data['avail1'] * np.exp(data['U_1'] / beta[4]) 
                                 + data['avail2'] * np.exp(data['U_2'] / beta[4]))
    data['P_2_in_nest1'] = 1 - data['P_1_in_nest1']
    
    data['P_3_in_nest2'] = data['avail3'] * np.exp(data['U_3'] / beta[5]) / \
                            (data['avail3'] * np.exp(data['U_3'] / beta[5])
                                + data['avail5'] * np.exp(data['U_5'] / beta[5]))
    data['P_5_in_nest2'] = 1 - data['P_3_in_nest2']
    
    # Full probabilities
    data['P_1'] = data['P_nest_1'] * data['P_1_in_nest1']
    data['P_2'] = data['P_nest_1'] * data['P_2_in_nest1']
    data['P_3'] = data['P_nest_2'] * data['P_3_in_nest2']
    data['P_5'] = data['P_nest_2'] * data['P_5_in_nest2']
    
    # Calculate probability for chosen alternative for each row
    data['P'] = (data['choice'] == 1) * data['P_1'] + \
                (data['choice'] == 2) * data['P_2'] + \
                (data['choice'] == 3) * data['P_3'] + \
                (data['choice'] == 5) * data['P_5']
    
    # Replace zero probabilities with small value to avoid LL = -inf
    epsilon = 1e-20
    data.loc[data['P'] == 0, 'P'] = epsilon
    
    # Calculate log-likelihood 
    LL = data['P'].apply(np.log).sum()
    
    return -LL  # We minimize negative log-likelihood

In [3]:
# Load data
subdata = pd.read_csv('./data/telephone.dat', sep='\t')
subdata = subdata[subdata['choice'] != 4]

subdata = subdata[['cost1', 'cost2', 'cost3', 'cost5', 'avail1', 'avail2', 'avail3', 'avail5', 'choice']]

subdata['logcost1'] = np.log(subdata['cost1'])
subdata['logcost2'] = np.log(subdata['cost2'])
subdata['logcost3'] = np.log(subdata['cost3'])
subdata['logcost5'] = np.log(subdata['cost5'])

#subdata = subdata.iloc[:50]

In [5]:
# Estimate parameters
beta = np.array([0, 0, 0, 0, 1, 1])
beta_names = ["ASC_1", "ASC_3", "ASC_5", "BETA_COST", "lambda_measured", "lambda_flat"]
result_sub, se_sub, t_stat_sub, p_value_sub, aic, bic = estimate_nested_logit(subdata, beta, beta_names, log_likelihood_telephone2)

Optimization Results:
         Parameter  Estimate  Robust Asymptotic SE  t-statistic       p-value
0            ASC_1 -0.375414              0.098887    -3.796376  1.681822e-04
1            ASC_3  0.883034              0.183936     4.800774  2.193147e-06
2            ASC_5  1.391894              0.275366     5.054701  6.418004e-07
3        BETA_COST -1.484781              0.213483    -6.955018  1.337574e-11
4  lambda_measured  0.482857              0.140620     3.433771  6.536757e-04
5      lambda_flat  0.433651              0.101131     4.288012  2.233178e-05
AIC: 946.817198916706
BIC: 971.2138474573285


---
---
---
---
---
---

In [6]:
data_logcost = subdata[['logcost1', 'logcost2', 'logcost3', 'logcost5']]

# Function to compare rows with a reference row
def count_same_rows(df):
    row_counts = {}

    for index, row in df.iterrows():
        # Convert the row to a tuple to make it hashable
        row_tuple = tuple(row)
        
        # Count the occurrences of the row in the dataframe
        if row_tuple in row_counts:
            row_counts[row_tuple] += 1
        else:
            row_counts[row_tuple] = 1
            
    return row_counts

# Count occurrences of each row
row_counts = count_same_rows(data_logcost)

# Add a new column with probabilities
total_rows = len(data_logcost)
data_logcost['probability'] = data_logcost.apply(lambda row: row_counts[tuple(row)] / total_rows, axis=1)
data_logcost.head(15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_logcost['probability'] = data_logcost.apply(lambda row: row_counts[tuple(row)] / total_rows, axis=1)


Unnamed: 0,logcost1,logcost2,logcost3,logcost5,probability
0,1.7613,1.754404,2.545531,3.147595,0.00232
1,1.258461,1.754404,2.507972,3.147595,0.00232
2,1.627278,1.754404,2.439735,3.342155,0.00232
3,1.558145,1.754404,2.347558,3.342155,0.00232
4,2.145931,1.953028,2.662355,3.342155,0.00232
5,2.778819,2.681022,2.573375,3.342155,0.00232
6,1.627278,1.754404,2.478218,3.342155,0.00232
7,1.211941,1.754404,2.328253,3.342155,0.00464
8,3.5522,3.508256,3.642836,3.342155,0.00232
9,3.128075,3.060115,3.234355,3.342155,0.00232


In [7]:
# Computation of p(x,y)

p_x_sub = data_logcost['probability'].values
p_y_given_x_sub = subdata[['P_1', 'P_2', 'P_3', 'P_5']].values
p_xy_sub = p_x_sub[:, np.newaxis] * p_y_given_x_sub

# Normalize p_xy 
p_xy_sub /= p_xy_sub.sum()

# Define epsilon value
epsilon = 1e-100

# Add epsilon to elements equal to 0 to avoid numerical issues in the computation of the DIB
p_xy_sub[p_xy_sub == 0] += epsilon

In [8]:
pd.set_option('display.max_columns', None) 
subdata.head()

Unnamed: 0,cost1,cost2,cost3,cost5,avail1,avail2,avail3,avail5,choice,logcost1,logcost2,logcost3,logcost5,U_1,U_2,U_3,U_5,logsum_1,logsum_2,P_nest_1,P_nest_2,P_1_in_nest1,P_2_in_nest1,P_3_in_nest2,P_5_in_nest2,P_1,P_2,P_3,P_5,P
0,5.82,5.78,12.75,23.28,1,1,1,1,2,1.7613,1.754404,2.545531,3.147595,-2.990559,-2.604905,-2.896523,-3.281594,-5.023268,-6.334746,0.579701,0.420299,0.310305,0.689695,0.708472,0.291528,0.179884,0.399817,0.29777,0.122529,0.399817
1,3.52,5.78,12.28,23.28,1,1,1,1,3,1.258461,1.754404,2.507972,3.147595,-2.243953,-2.604905,-2.840755,-3.281594,-4.259577,-6.24196,0.657025,0.342975,0.678641,0.321359,0.734305,0.265695,0.445884,0.211141,0.251848,0.091127,0.251848
2,5.09,5.78,11.47,28.28,1,1,1,1,1,1.627278,1.754404,2.439735,3.342155,-2.791565,-2.604905,-2.739438,-3.570473,-4.876349,-6.179881,0.580623,0.419377,0.404542,0.595458,0.871733,0.128267,0.234886,0.345736,0.365585,0.053792,0.234886
3,4.75,5.78,10.46,28.28,1,1,1,1,3,1.558145,1.754404,2.347558,3.342155,-2.688918,-2.604905,-2.602576,-3.570473,-4.784843,-5.89961,0.561678,0.438322,0.456612,0.543388,0.903084,0.096916,0.256469,0.305209,0.395842,0.04248,0.395842
4,8.55,7.05,14.33,28.28,1,1,1,1,3,2.145931,1.953028,2.662355,3.342155,-3.561652,-2.899818,-3.069981,-3.570473,-5.779251,-6.805298,0.540056,0.459944,0.202513,0.797487,0.760266,0.239734,0.109368,0.430688,0.349679,0.110264,0.349679


In [10]:
p_x = data_logcost['probability'].values
p_y_given_x = subdata[['P_1', 'P_2', 'P_3', 'P_5']].values
p_xy = p_x[:, np.newaxis] * p_y_given_x

# Normalize p_xy 
p_xy /= p_xy.sum()

# Define epsilon value
epsilon = 1e-20

# Add epsilon to elements equal to 0
p_xy[p_xy == 0] += epsilon

In [11]:
from functions_IB import kl_divergence, entropy, mutual_information

In [12]:
def geom_DIB_on_alternatives(p_xy, max_iter=100, beta=0.5):
    """
    Performs the geometric deterministic information bottleneck algorithm for clustering.

    Parameters:
    - p_xy (numpy.ndarray): The joint probability distribution of data points and clusters.
    - max_iter (int): The maximum number of iterations for the algorithm. Default is 100.
    - beta (float): The beta parameter for the algorithm. Default is 0.5.

    Returns:
    - q_t_given_x (numpy.ndarray): The conditional probability distribution of clusters given data points.
    - q_t (numpy.ndarray): The marginal probability distribution of clusters.
    - q_y_given_t (numpy.ndarray): The conditional probability distribution of data points given clusters.
    """
    num_data_points = p_xy.shape[0]
    num_clusters = p_xy.shape[1]
    
    # Initialize f(x) as if each index i is assigned to its own cluster
    #f_x = np.arange(num_data_points)
    f_x = np.random.randint(1, 6, size=num_data_points)

    # Initialization 
    d_xt = np.zeros((num_data_points, num_clusters))
    p_y_given_x = p_xy / np.sum(p_xy, axis=1, keepdims=True)
    p_x = np.sum(p_xy, axis=1)
    
    # Initialize q(t) and q(y|t)
    q_t = np.zeros(num_clusters)
    q_y_given_t = np.zeros((num_clusters, num_clusters))
    for t in range(num_clusters):
        relevant_indices = np.where(f_x == t)[0]
        if len(relevant_indices) > 0:
            q_t[t] = np.sum(p_x[relevant_indices])
            q_y_given_t[t] = np.sum(p_xy[relevant_indices], axis=0) / np.sum(p_x[relevant_indices])
    
    q_t /= np.sum(q_t)  # Normalize q(t)

    # Iterative algorithm
    for _ in range(max_iter):
        
        # Compute d, l_beta, and f
        for i in range(num_data_points):
            for j in range(num_clusters):
                d_xt[i, j] = kl_divergence(p_y_given_x[i], q_y_given_t[j])

        l_beta_xt = np.log(q_t) - beta * d_xt
        f_x = np.argmax(l_beta_xt, axis=1)
        
        # Update q_t_given_x
        q_t_given_x = np.eye(num_clusters)[f_x]
        
        # Update q_t and q_y_given_t
        for t in range(num_clusters):
            relevant_indices = np.where(f_x == t)[0]
            if len(relevant_indices) > 0:
                q_t[t] = np.sum(p_x[relevant_indices])
                q_y_given_t[t] = np.sum(p_xy[relevant_indices], axis=0) / np.sum(p_x[relevant_indices])
        
        # Normalize q(t)
        q_t /= np.sum(q_t)
        
        # print at which iteration we are 
        print("Iteration:", _, "out of", max_iter)
        

########################################################################################################

        # Merge step to verify if we are stuck in a local minimum
        best_merge = None

        # Compute the objective function with the current clusters
        H_T = entropy(q_t)
        I_TY = mutual_information(q_t.reshape(-1, 1) * q_y_given_t)
        objective = H_T - beta * I_TY

        # Compute the objective function for each pair of consecutive clusters
        for i in range(num_clusters - 1):
            # Merge clusters i and i + 1
            merged_f_x = f_x.copy()
            merged_f_x[np.where(merged_f_x == i + 1)] = i

            # Initialize merge_q_t and merge_q_y_given_t
            merged_q_t = np.zeros(num_clusters)
            merged_q_y_given_t = np.zeros((num_clusters, num_clusters))

            # Update merge_q_t and merge_q_y_given_t
            for t in range(num_clusters):
                relevant_indices = np.where(merged_f_x == t)[0]
                if len(relevant_indices) > 0:
                    merged_q_t[t] = np.sum(p_x[relevant_indices])
                    merged_q_y_given_t[t] = np.sum(p_xy[relevant_indices], axis=0) / np.sum(p_x[relevant_indices])
            
            # Compute the objective function with the merged clusters
            merged_H_T = entropy(merged_q_t)
            merged_I_TY = mutual_information(merged_q_t.reshape(-1, 1) * merged_q_y_given_t)
            merged_objective = merged_H_T - beta * merged_I_TY

            if merged_objective < objective:
                objective = merged_objective
                best_merge = i

        if best_merge is not None:
            # If the objective function can be improved, merge the clusters
            f_x[np.where(f_x == best_merge + 1)] = best_merge
            q_t_given_x = np.eye(num_clusters)[f_x]
        
            # Update q_t and q_y_given_t after the merge
            for t in range(num_clusters):
                relevant_indices = np.where(f_x == t)[0]
                if len(relevant_indices) > 0:
                    q_t[t] = np.sum(p_x[relevant_indices])
                    q_y_given_t[t] = np.sum(p_xy[relevant_indices], axis=0) / np.sum(p_x[relevant_indices])
            # Normalize q(t) 
            q_t /= np.sum(q_t)   

    return q_t_given_x, q_t, q_y_given_t

In [13]:
q_t_given_x, q_t, q_y_given_t = geom_DIB_on_alternatives(p_xy, beta=50, max_iter=200)

  q = q / np.sum(q)
  l_beta_xt = np.log(q_t) - beta * d_xt


Iteration: 0 out of 200
Iteration: 1 out of 200
Iteration: 2 out of 200
Iteration: 3 out of 200
Iteration: 4 out of 200
Iteration: 5 out of 200
Iteration: 6 out of 200
Iteration: 7 out of 200
Iteration: 8 out of 200
Iteration: 9 out of 200
Iteration: 10 out of 200
Iteration: 11 out of 200
Iteration: 12 out of 200
Iteration: 13 out of 200
Iteration: 14 out of 200
Iteration: 15 out of 200
Iteration: 16 out of 200
Iteration: 17 out of 200
Iteration: 18 out of 200
Iteration: 19 out of 200
Iteration: 20 out of 200
Iteration: 21 out of 200
Iteration: 22 out of 200
Iteration: 23 out of 200
Iteration: 24 out of 200
Iteration: 25 out of 200
Iteration: 26 out of 200
Iteration: 27 out of 200
Iteration: 28 out of 200
Iteration: 29 out of 200
Iteration: 30 out of 200
Iteration: 31 out of 200
Iteration: 32 out of 200
Iteration: 33 out of 200
Iteration: 34 out of 200
Iteration: 35 out of 200
Iteration: 36 out of 200
Iteration: 37 out of 200
Iteration: 38 out of 200
Iteration: 39 out of 200
Iteration:

In [14]:
# Calculate the number of clusters
column_sum = np.sum(q_t_given_x, axis=0)
num_clusters = np.count_nonzero(column_sum)
num_clusters

4

In [15]:
# Create new column choice_nest which is 1 if choice= 1 or 2, and 2 otherwise
subdata['choice_nest'] = np.where(subdata['choice'].isin([1, 2]), 1, 2)
subdata['cluster'] = np.argmax(q_t_given_x, axis=1)
subdata['cluster'].value_counts()

cluster
0    148
2    138
3    107
1     38
Name: count, dtype: int64

In [17]:
subdata['choice_nest'].value_counts()

choice_nest
2    235
1    196
Name: count, dtype: int64

In [22]:
data_results = subdata[['P_nest_1','P_nest_2', 'P_1','P_2', 'P_3', 'P_5', 'choice', 'choice_nest', 'cluster']]

data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_5']].idxmax(axis=1).str[-1].astype(int)

#create a dataframe from mismatched_rows
data_results = pd.DataFrame(data_results)

# Print the mismatched rows
data_results.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_5']].idxmax(axis=1).str[-1].astype(int)


Unnamed: 0,P_nest_1,P_nest_2,P_1,P_2,P_3,P_5,choice,choice_nest,cluster,max_proba
0,0.579701,0.420299,0.179884,0.399817,0.29777,0.122529,2,1,0,2
1,0.657025,0.342975,0.445884,0.211141,0.251848,0.091127,3,2,3,1
2,0.580623,0.419377,0.234886,0.345736,0.365585,0.053792,1,1,3,3
3,0.561678,0.438322,0.256469,0.305209,0.395842,0.04248,3,2,3,3
4,0.540056,0.459944,0.109368,0.430688,0.349679,0.110264,3,2,0,2


In [23]:
# number of each alternative 1, 2, 3, 4, 5 in each cluster 
cluster_counts = data_results.groupby(['cluster', 'choice']).size().unstack(fill_value=0)
cluster_counts

choice,1,2,3,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,16,41,70,21
1,1,1,4,32
2,22,61,54,1
3,34,20,50,3


In [24]:
cluster_counts2 = data_results.groupby(['cluster', 'max_proba']).size().unstack(fill_value=0)
cluster_counts2

max_proba,1,2,3,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,82,60,6
1,0,0,1,37
2,0,0,138,0
3,48,3,56,0


In [25]:
nest_counts = data_results.groupby(['choice', 'max_proba']).size().unstack(fill_value=0)
nest_counts

max_proba,1,2,3,5
choice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16,13,43,1
2,12,27,82,2
3,19,32,122,5
5,1,13,8,35
