<h2> We try another NLM where nest1 = 1,2,3 and nest2 = 4,5 </h2>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions_NLM import estimate_nested_logit, find_clusters
from functions_IB import information_bottleneck, information_bottleneck_convergence, IB_curve
from functions_geom_DIB import geom_DIB, DIB_curve

In [2]:
# Negative log-likelihood of the nested logit model for the telephone data
# beta = ["ASC_1", "ASC_3", "ASC_4", "ASC_5", "BETA_COST", "lambda_1", "lambda_2"]

def log_likelihood_telephone2(beta, data):
    """Return the negative log-likelihood of a two-nest nested logit model.

    Nest 1 holds alternatives 1, 2 and 3 (nest parameter ``beta[5]``);
    nest 2 holds alternatives 4 and 5 (nest parameter ``beta[6]``).

    Parameters
    ----------
    beta : sequence of 7 floats
        ``[ASC_1, ASC_3, ASC_4, ASC_5, BETA_COST, lambda_1, lambda_2]``.
        Alternative 2 has no constant (its utility is the cost term only).
    data : pandas.DataFrame
        Must contain ``logcost1..logcost5``, ``avail1..avail5`` and ``choice``
        (values 1..5). The availability columns are assumed to be 0/1
        indicators -- TODO confirm against the data file.

    Returns
    -------
    float
        Minus the sum over rows of the log-probability of the chosen
        alternative (suitable for a minimiser).

    Side effects
    ------------
    The intermediate columns ``U_*``, ``logsum_*``, ``P_nest_*``,
    ``P_*_in_nest*``, ``P_1..P_5`` and ``P`` are written into ``data`` in
    place. Other cells of this notebook read ``P_1..P_5`` from the same frame,
    so this mutation is part of the function's contract.

    Notes
    -----
    * A nest without any available alternative gets nest probability 0, and
      its within-nest probabilities are prevented from becoming NaN. Before
      this fix such rows produced NaN (nest 1) or gave probability mass to
      unavailable alternatives (nest 2).
    * Rows where neither nest has an available alternative still yield NaN;
      ``Series.sum`` skips NaN, so such rows do not contribute to the result.
    * ``exp(U / lambda)`` can overflow when lambda is close to 0; that case is
      not stabilised here.
    """
    beta_cost = beta[4]
    lambda_1, lambda_2 = beta[5], beta[6]

    # Define utility functions
    data['U_1'] = beta[0] + beta_cost * data['logcost1']
    data['U_2'] = beta_cost * data['logcost2']
    data['U_3'] = beta[1] + beta_cost * data['logcost3']
    data['U_4'] = beta[2] + beta_cost * data['logcost4']
    data['U_5'] = beta[3] + beta_cost * data['logcost5']

    # Availability-weighted exp(U / lambda) terms, computed once and reused
    w_1 = data['avail1'] * np.exp(data['U_1'] / lambda_1)
    w_2 = data['avail2'] * np.exp(data['U_2'] / lambda_1)
    w_3 = data['avail3'] * np.exp(data['U_3'] / lambda_1)
    w_4 = data['avail4'] * np.exp(data['U_4'] / lambda_2)
    w_5 = data['avail5'] * np.exp(data['U_5'] / lambda_2)
    nest1_sum = w_1 + w_2 + w_3
    nest2_sum = w_4 + w_5

    # Indicators (1/0) that a nest has no available alternative at all
    nest1_empty = (1 - data['avail1']) * (1 - data['avail2']) * (1 - data['avail3'])
    nest2_empty = (1 - data['avail4']) * (1 - data['avail5'])

    # Logsums; the "empty" indicator keeps the argument of log away from 0
    data['logsum_1'] = np.log(nest1_sum + nest1_empty)
    data['logsum_2'] = np.log(nest2_sum + nest2_empty)

    # Nest probabilities; an empty nest gets weight 0 so it cannot be chosen
    nest1_weight = (1 - nest1_empty) * np.exp(lambda_1 * data['logsum_1'])
    nest2_weight = (1 - nest2_empty) * np.exp(lambda_2 * data['logsum_2'])
    data['P_nest_1'] = nest1_weight / (nest1_weight + nest2_weight)
    data['P_nest_2'] = 1 - data['P_nest_1']

    # Within-nest probabilities. For an empty nest 1 the ratio is 0/0; it is
    # replaced by 0 on those rows only, so any other NaN stays visible.
    nest1_has_alt = nest1_empty == 0
    data['P_1_in_nest1'] = (w_1 / nest1_sum).where(nest1_has_alt, 0.0)
    data['P_2_in_nest1'] = (w_2 / nest1_sum).where(nest1_has_alt, 0.0)
    data['P_3_in_nest1'] = 1 - data['P_1_in_nest1'] - data['P_2_in_nest1']

    data['P_4_in_nest2'] = (w_4 / nest2_sum).fillna(0)
    data['P_5_in_nest2'] = 1 - data['P_4_in_nest2']

    # Full probabilities
    data['P_1'] = data['P_nest_1'] * data['P_1_in_nest1']
    data['P_2'] = data['P_nest_1'] * data['P_2_in_nest1']
    data['P_3'] = data['P_nest_1'] * data['P_3_in_nest1']
    data['P_4'] = data['P_nest_2'] * data['P_4_in_nest2']
    data['P_5'] = data['P_nest_2'] * data['P_5_in_nest2']

    # Calculate probability for chosen alternative for each row
    data['P'] = (data['choice'] == 1) * data['P_1'] + \
                (data['choice'] == 2) * data['P_2'] + \
                (data['choice'] == 3) * data['P_3'] + \
                (data['choice'] == 4) * data['P_4'] + \
                (data['choice'] == 5) * data['P_5']

    # Replace zero probabilities with small value to avoid LL = -inf
    epsilon = 1e-20
    data.loc[data['P'] == 0, 'P'] = epsilon

    # Calculate log-likelihood (vectorised log over the whole column)
    LL = np.log(data['P']).sum()

    return -LL  # We minimize negative log-likelihood

In [3]:
# Read the tab-separated telephone data and keep costs, availabilities and the observed choice
model_columns = ['cost1', 'cost2', 'cost3', 'cost4', 'cost5',
                 'avail1', 'avail2', 'avail3', 'avail4', 'avail5',
                 'choice']
subdata = pd.read_csv('./data/telephone.dat', sep='\t')
subdata = subdata[model_columns]

# Append the natural logarithm of every cost column
subdata = subdata.assign(
    logcost1=np.log(subdata['cost1']),
    logcost2=np.log(subdata['cost2']),
    logcost3=np.log(subdata['cost3']),
    logcost4=np.log(subdata['cost4']),
    logcost5=np.log(subdata['cost5']),
)

# Work on the first 100 observations only
subdata = subdata.iloc[:100]

In [4]:
# Fit the nested logit model.
# Starting values: 0 for the four ASCs and the cost coefficient, 1 for both lambda parameters.
beta_names = ["ASC_1", "ASC_3", "ASC_4", "ASC_5", "BETA_COST", "lambda_measured", "lambda_flat"]
beta = np.array([0, 0, 0, 0, 0, 1, 1])

# The six returned values are unpacked by position; their exact types are
# defined in functions_NLM (not visible here).
(result_sub, se_sub, t_stat_sub,
 p_value_sub, aic, bic) = estimate_nested_logit(
    subdata,
    beta,
    beta_names,
    log_likelihood_telephone2,
)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

Optimization Results:
         Parameter   Estimate  Robust Asymptotic SE  t-statistic       p-value
0            ASC_1  -0.742247              2.072511    -0.358139  7.210510e-01
1            ASC_3   7.516452              1.383944     5.431183  4.454547e-07
2            ASC_4  10.281888              2.363042     4.351124  3.469720e-05
3            ASC_5  10.885813              2.173637     5.008110  2.602729e-06
4        BETA_COST  -2.810649              0.641256    -4.383036  3.073978e-05
5  lambda_measured   6.370845              1.334309     4.774641  6.682370e-06
6      lambda_flat   0.029201              0.890412     0.032795  9.739087e-01
AIC: 224.83047023192483
BIC: 243.06666153384148


---
---
---
---
---
---

In [5]:
# Computation of p(x) with Monte Carlo

# Explicit copy: columns are added below, and writing into a bare column
# selection of subdata raises pandas' SettingWithCopyWarning.
data_logcost = subdata[['logcost1', 'logcost2', 'logcost3', 'logcost4', 'logcost5']].copy()

# Row-wise sum of the log costs, counting only the available alternatives
avail_logcost1 = subdata['avail1'] * subdata['logcost1']
avail_logcost2 = subdata['avail2'] * subdata['logcost2']
avail_logcost3 = subdata['avail3'] * subdata['logcost3']
avail_logcost4 = subdata['avail4'] * subdata['logcost4']
avail_logcost5 = subdata['avail5'] * subdata['logcost5']
data_logcost['sum_logcost'] = avail_logcost1 + avail_logcost2 + avail_logcost3 + avail_logcost4 + avail_logcost5

# Number of Monte Carlo simulations (adjust as needed)
num_simulations = 10000
# Seeded generator so that the estimated distribution is identical on every run
rng = np.random.default_rng(42)
# Perform Monte Carlo simulation: draw row sums uniformly with replacement
samples = rng.choice(data_logcost['sum_logcost'].to_numpy(), size=(num_simulations,))
# Count occurrences of each value in the samples
unique_values, counts = np.unique(samples, return_counts=True)
# Normalize counts to obtain probability distribution
probability_distribution = counts / num_simulations
# Create a dictionary to store value-probability pairs
value_probability_dict = dict(zip(unique_values, probability_distribution))

# Add a new column with the probability of each value.
# A value that was never drawn is absent from the dictionary and maps to NaN.
data_logcost['probability_MC'] = data_logcost['sum_logcost'].map(value_probability_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_logcost['sum_logcost'] = avail_logcost1 + avail_logcost2 + avail_logcost3 + avail_logcost4 + avail_logcost5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_logcost['probability_MC'] = data_logcost['sum_logcost'].map(value_probability_dict)


In [6]:
# Build the joint distribution p(x, y) = p(x) * p(y | x)

# NOTE(review): P_1..P_5 are written into subdata by log_likelihood_telephone2,
# so they hold the values of its most recent call. Presumably that call used
# the final estimates -- confirm, or re-evaluate the function at the estimates.
p_y_given_x_sub = subdata[['P_1', 'P_2', 'P_3', 'P_4', 'P_5']].to_numpy()
p_x_sub = data_logcost['probability_MC'].to_numpy()

# Broadcast p(x) as a column against the rows of p(y | x), then rescale to total mass 1
p_xy_sub = p_x_sub.reshape(-1, 1) * p_y_given_x_sub
p_xy_sub = p_xy_sub / p_xy_sub.sum()

# Tiny positive constant that replaces exact zeros, to avoid numerical issues in the DIB
epsilon = 1e-100
p_xy_sub[p_xy_sub == 0] = epsilon

In [7]:
# Run geom_DIB on the joint distribution; the three results are unpacked by position
(q_t_given_x_sub,
 q_t_sub,
 q_y_given_t_sub) = geom_DIB(
    p_xy_sub,
    beta=5,
    max_iter=50,
)

Iteration: 0 out of 50
Iteration: 1 out of 50
Iteration: 2 out of 50
Iteration: 3 out of 50
Iteration: 4 out of 50
Iteration: 5 out of 50
Iteration: 6 out of 50
Iteration: 7 out of 50
Iteration: 8 out of 50
Iteration: 9 out of 50
Iteration: 10 out of 50
Iteration: 11 out of 50
Iteration: 12 out of 50
Iteration: 13 out of 50
Iteration: 14 out of 50
Iteration: 15 out of 50
Iteration: 16 out of 50
Iteration: 17 out of 50
Iteration: 18 out of 50
Iteration: 19 out of 50
Iteration: 20 out of 50
Iteration: 21 out of 50
Iteration: 22 out of 50
Iteration: 23 out of 50
Iteration: 24 out of 50
Iteration: 25 out of 50
Iteration: 26 out of 50
Iteration: 27 out of 50
Iteration: 28 out of 50
Iteration: 29 out of 50
Iteration: 30 out of 50
Iteration: 31 out of 50
Iteration: 32 out of 50
Iteration: 33 out of 50
Iteration: 34 out of 50
Iteration: 35 out of 50
Iteration: 36 out of 50
Iteration: 37 out of 50
Iteration: 38 out of 50
Iteration: 39 out of 50
Iteration: 40 out of 50
Iteration: 41 out of 50
It

In [8]:
# Cluster assignment as a dict (presumably row index -> cluster label; see functions_NLM)
cluster_dict = find_clusters(q_t_given_x_sub)

# One-column frame of cluster labels, index named 'Row Index' and
# re-aligned to the positions 0 .. n-1
df = (
    pd.DataFrame.from_dict(cluster_dict, orient='index', columns=['Cluster'])
    .rename_axis('Row Index')
    .reindex(range(len(q_t_given_x_sub)))
)

# Report how many distinct cluster labels were assigned
num_clusters = len(set(cluster_dict.values()))
print("Number of clusters:", num_clusters)


Number of clusters: 5


In [9]:
# choice_nest is 1 when the chosen alternative is 1, 2 or 3 (nest 1) and 2 otherwise (nest 2)
in_first_nest = subdata['choice'].isin([1, 2, 3])
subdata['choice_nest'] = np.where(in_first_nest, 1, 2)

# Attach the cluster label of every row (aligned on the row index) and show the cluster sizes
subdata['cluster'] = df['Cluster']
subdata['cluster'].value_counts()

cluster
1    81
2    10
3     5
5     3
4     1
Name: count, dtype: int64

In [10]:
# Number of observations whose chosen alternative falls in each nest
subdata.loc[:, 'choice_nest'].value_counts()

choice_nest
1    81
2    19
Name: count, dtype: int64

In [11]:
# Gather model probabilities, observed choices and cluster labels in one frame.
# .copy() makes the frame independent of subdata, so adding a column below
# no longer raises SettingWithCopyWarning.
data_results = subdata[['P_nest_1','P_nest_2', 'P_1','P_2', 'P_3', 'P_4', 'P_5', 'choice', 'choice_nest', 'cluster']].copy()

# Alternative with the highest predicted probability: idxmax returns the
# column label ('P_1' .. 'P_5'), whose last character is the alternative number.
data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_4', 'P_5']].idxmax(axis=1).str[-1].astype(int)

# Preview the first rows
data_results.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_4', 'P_5']].idxmax(axis=1).str[-1].astype(int)


Unnamed: 0,P_nest_1,P_nest_2,P_1,P_2,P_3,P_4,P_5,choice,choice_nest,cluster,max_proba
0,0.895441,0.104559,0.191144,0.212733,0.491564,0.0,0.104559,2,1,1,3
1,0.926341,0.073659,0.23245,0.207171,0.48672,0.0,0.073659,3,1,1,3
2,0.949751,0.050249,0.206972,0.217106,0.525673,0.0,0.050249,1,1,1,3
3,0.957941,0.042059,0.20901,0.21265,0.536282,0.0,0.042059,3,1,1,3
4,0.896367,0.103633,0.175656,0.21224,0.508471,0.0,0.103633,3,1,1,3


In [12]:
# Cross-tabulation: rows are clusters, columns are the chosen alternatives (1..5),
# cells count the observations; combinations that never occur are filled with 0
cluster_counts = (
    data_results
    .groupby(['cluster', 'choice'])
    .size()
    .unstack(level='choice', fill_value=0)
)
cluster_counts

choice,1,2,3,4,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,17,18,41,0,5
2,0,0,3,0,7
3,0,0,0,0,5
4,0,0,0,1,0
5,1,0,1,1,0


In [13]:
# Cross-tabulation: rows are clusters, columns are the alternatives with the
# highest predicted probability; combinations that never occur are filled with 0
cluster_counts2 = (
    data_results
    .groupby(['cluster', 'max_proba'])
    .size()
    .unstack(level='max_proba', fill_value=0)
)
cluster_counts2

max_proba,3,4,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,81,0,0
2,0,0,10
3,0,0,5
4,0,1,0
5,2,1,0


In [14]:
# Cross-tabulation: rows are the chosen alternatives, columns are the alternatives
# with the highest predicted probability (despite its name, this table is per
# alternative, not per nest)
nest_counts = (
    data_results
    .groupby(['choice', 'max_proba'])
    .size()
    .unstack(level='max_proba', fill_value=0)
)
nest_counts

max_proba,3,4,5
choice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,18,0,0
2,18,0,0
3,41,1,3
4,1,1,0
5,5,0,12
