<h2> We try another NLM where nest1 = 1,2,3 and nest2 = 4,5 </h2>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions_NLM import estimate_nested_logit, find_clusters
from functions_IB import information_bottleneck, information_bottleneck_convergence, IB_curve
from functions_geom_DIB import geom_DIB, DIB_curve

In [2]:
# Define log-likelihood function for telephone data
# beta will be beta = ["ASC_1", "ASC_3", "ASC_4", "ASC_5", "BETA_COST", "lambda_1", "lambda_2"]

def log_likelihood_telephone2(beta, data):
    # Define utility functions
    data['U_1'] = beta[0] + beta[4] * data['logcost1'] 
    data['U_2'] = beta[4] * data['logcost2'] 
    data['U_3'] = beta[1] + beta[4] * data['logcost3']
    data['U_4'] = beta[2] + beta[4] * data['logcost4']
    data['U_5'] = beta[3] + beta[4] * data['logcost5']
    
    # Calculate logsum for nests with > 1 alt
    data['logsum_1'] = np.log(data['avail1'] * np.exp(data['U_1'] / beta[5])
                                        + data['avail2'] * np.exp(data['U_2'] / beta[5])
                                        + data['avail3'] * np.exp(data['U_3'] / beta[5])
                                        + (1 - data['avail1']) * (1 - data['avail2']) * (1 - data['avail3']))
    data['logsum_2'] = np.log(data['avail4'] * np.exp(data['U_4'] / beta[6])
                                    + data['avail5'] * np.exp(data['U_5'] / beta[6])
                                    + (1 - data['avail4']) * (1 - data['avail5']))
    
    # Nest probabilities
    data['P_nest_1'] = np.exp(beta[5] * data['logsum_1']) / \
                                 (np.exp(beta[5] * data['logsum_1']) 
                                  + np.exp(beta[6] * data['logsum_2']))
    data['P_nest_2'] = 1 - data['P_nest_1']
    
    # Within nest probabilities for nests with > 1 alt
    data['P_1_in_nest1'] = data['avail1'] * np.exp(data['U_1'] / beta[5]) / \
                                (data['avail1'] * np.exp(data['U_1'] / beta[5]) 
                                 + data['avail2'] * np.exp(data['U_2'] / beta[5])
                                 + data['avail3'] * np.exp(data['U_3'] / beta[5]))
    data['P_2_in_nest1'] = data['avail2'] * np.exp(data['U_2'] / beta[5]) / \
                                (data['avail1'] * np.exp(data['U_1'] / beta[5]) 
                                 + data['avail2'] * np.exp(data['U_2'] / beta[5])
                                 + data['avail3'] * np.exp(data['U_3'] / beta[5]))
    data['P_3_in_nest1'] = 1 - data['P_1_in_nest1'] - data['P_2_in_nest1']
    
    data['P_4_in_nest2'] = data['avail4'] * np.exp(data['U_4'] / beta[6]) / \
                            (data['avail4'] * np.exp(data['U_4'] / beta[6])
                                + data['avail5'] * np.exp(data['U_5'] / beta[6]))
    data['P_4_in_nest2'] = data['P_4_in_nest2'].fillna(0)
    data['P_5_in_nest2'] = 1 - data['P_4_in_nest2']
    
    # Full probabilities
    data['P_1'] = data['P_nest_1'] * data['P_1_in_nest1']
    data['P_2'] = data['P_nest_1'] * data['P_2_in_nest1']
    data['P_3'] = data['P_nest_1'] * data['P_3_in_nest1']
    data['P_4'] = data['P_nest_2'] * data['P_4_in_nest2']
    data['P_5'] = data['P_nest_2'] * data['P_5_in_nest2']
    
    # Calculate probability for chosen alternative for each row
    data['P'] = (data['choice'] == 1) * data['P_1'] + \
                (data['choice'] == 2) * data['P_2'] + \
                (data['choice'] == 3) * data['P_3'] + \
                (data['choice'] == 4) * data['P_4'] + \
                (data['choice'] == 5) * data['P_5']
    
    # Replace zero probabilities with small value to avoid LL = -inf
    epsilon = 1e-20
    data.loc[data['P'] == 0, 'P'] = epsilon
    
    # Calculate log-likelihood 
    LL = data['P'].apply(np.log).sum()
    
    return -LL  # We minimize negative log-likelihood

In [5]:
# Load data
subdata = pd.read_csv('./data/telephone.dat', sep='\t')
subdata = subdata[['cost1', 'cost2', 'cost3', 'cost4', 'cost5', 'avail1', 'avail2', 'avail3', 'avail4', 'avail5', 'choice']]

subdata['logcost1'] = np.log(subdata['cost1'])
subdata['logcost2'] = np.log(subdata['cost2'])
subdata['logcost3'] = np.log(subdata['cost3'])
subdata['logcost4'] = np.log(subdata['cost4'])
subdata['logcost5'] = np.log(subdata['cost5'])

#subdata = subdata.iloc[:100]

In [6]:
# Estimate parameters
beta = np.array([0, 0, 0, 0, 0, 1, 1])
beta_names = ["ASC_1", "ASC_3", "ASC_4", "ASC_5", "BETA_COST", "lambda_measured", "lambda_flat"]
result_sub, se_sub, t_stat_sub, p_value_sub, aic, bic = estimate_nested_logit(subdata, beta, beta_names, log_likelihood_telephone2)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

Optimization Results:
         Parameter   Estimate  Robust Asymptotic SE  t-statistic       p-value
0            ASC_1  -4.980771              1.312214    -3.795701  1.685207e-04
1            ASC_3   4.938172              0.784479     6.294844  7.644372e-10
2            ASC_4  11.057639              1.254723     8.812815  0.000000e+00
3            ASC_5  12.390216              1.181103    10.490381  0.000000e+00
4        BETA_COST  -2.642071              0.229041   -11.535377  0.000000e+00
5  lambda_measured   9.990623              1.415785     7.056597  6.937118e-12
6      lambda_flat   0.012176              0.891198     0.013663  9.891053e-01
AIC: 988.5357921959762
BIC: 1017.0471039346791


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


---
---
---
---
---
---

In [17]:
data_logcost['probability_MC']

0     0.0102
1     0.0102
2     0.0098
3     0.0103
4     0.0087
5     0.0122
6     0.0116
7     0.0108
8     0.0081
9     0.0098
10    0.0310
11    0.0086
12    0.0077
13    0.0097
14    0.0098
15    0.0100
16    0.0071
17    0.0086
18    0.0084
19    0.0086
20    0.0104
21    0.0095
22    0.0094
23    0.0095
24    0.0310
25    0.0092
26    0.0098
27    0.0107
28    0.0095
29    0.0081
30    0.0112
31    0.0104
32    0.0096
33    0.0119
34    0.0100
35    0.0086
36    0.0310
37    0.0094
38    0.0112
39    0.0104
40    0.0116
41    0.0106
42    0.0111
43    0.0112
44    0.0091
45    0.0097
46    0.0095
47    0.0093
48    0.0098
49    0.0087
50    0.0104
51    0.0104
52    0.0075
53    0.0100
54    0.0095
55    0.0097
56    0.0109
57    0.0102
58    0.0101
59    0.0101
60    0.0092
61    0.0082
62    0.0088
63    0.0119
64    0.0129
65    0.0103
66    0.0103
67    0.0091
68    0.0122
69    0.0114
70    0.0090
71    0.0117
72    0.0105
73    0.0106
74    0.0101
75    0.0089
76    0.0096

In [9]:
# Computation of p(x,y)

p_x_sub = data_logcost['probability'].values
p_y_given_x_sub = subdata[['P_1', 'P_2', 'P_3', 'P_4', 'P_5']].values
p_xy_sub = p_x_sub[:, np.newaxis] * p_y_given_x_sub

# Normalize p_xy 
p_xy_sub /= p_xy_sub.sum()

# Define epsilon value
epsilon = 1e-100

# Add epsilon to elements equal to 0 to avoid numerical issues in the computation of the DIB
p_xy_sub[p_xy_sub == 0] += epsilon

In [8]:
q_t_given_x_sub, q_t_sub, q_y_given_t_sub = geom_DIB(p_xy_sub, beta=5, max_iter=50)

Iteration: 0 out of 50
Iteration: 1 out of 50
Iteration: 2 out of 50
Iteration: 3 out of 50
Iteration: 4 out of 50
Iteration: 5 out of 50
Iteration: 6 out of 50
Iteration: 7 out of 50
Iteration: 8 out of 50
Iteration: 9 out of 50
Iteration: 10 out of 50
Iteration: 11 out of 50
Iteration: 12 out of 50
Iteration: 13 out of 50
Iteration: 14 out of 50
Iteration: 15 out of 50
Iteration: 16 out of 50
Iteration: 17 out of 50
Iteration: 18 out of 50
Iteration: 19 out of 50
Iteration: 20 out of 50
Iteration: 21 out of 50
Iteration: 22 out of 50
Iteration: 23 out of 50
Iteration: 24 out of 50
Iteration: 25 out of 50
Iteration: 26 out of 50
Iteration: 27 out of 50
Iteration: 28 out of 50
Iteration: 29 out of 50
Iteration: 30 out of 50
Iteration: 31 out of 50
Iteration: 32 out of 50
Iteration: 33 out of 50
Iteration: 34 out of 50
Iteration: 35 out of 50
Iteration: 36 out of 50
Iteration: 37 out of 50
Iteration: 38 out of 50
Iteration: 39 out of 50
Iteration: 40 out of 50
Iteration: 41 out of 50
It

In [9]:
# Find clusters
cluster_dict = find_clusters(q_t_given_x_sub)

# Create DataFrame with row index and cluster number
df = pd.DataFrame.from_dict(cluster_dict, orient='index', columns=['Cluster'])

# Set index name
df.index.name = 'Row Index'

# Reindex the DataFrame to have row index from 0 to n
df = df.reindex(range(len(q_t_given_x_sub)))

# Count the number of unique clusters
num_clusters = len(set(cluster_dict.values()))

# Print the number of clusters
print("Number of clusters:", num_clusters)


Number of clusters: 5


In [10]:
# Create new column choice_nest which is 1 if choice = 1 or 2, and 2 otherwise
subdata['choice_nest'] = np.where(subdata['choice'].isin([1, 2, 3]), 1, 2)
subdata['cluster'] = df['Cluster']
subdata['cluster'].value_counts()

cluster
1    81
2    12
3     5
4     1
5     1
Name: count, dtype: int64

In [11]:
subdata['choice_nest'].value_counts()

choice_nest
1    81
2    19
Name: count, dtype: int64

In [16]:
data_results = subdata[['P_nest_1','P_nest_2', 'P_1','P_2', 'P_3', 'P_4', 'P_5', 'choice', 'choice_nest', 'cluster']]

data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_4', 'P_5']].idxmax(axis=1).str[-1].astype(int)

#create a dataframe from mismatched_rows
data_results = pd.DataFrame(data_results)

pd.set_option('display.max_rows', None)  # Set the maximum number of rows to display to None
pd.set_option('display.max_columns', None) 
# Print the mismatched rows
data_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_4', 'P_5']].idxmax(axis=1).str[-1].astype(int)


Unnamed: 0,P_nest_1,P_nest_2,P_1,P_2,P_3,P_4,P_5,choice,choice_nest,cluster,max_proba
0,0.895441,0.104559,0.191144,0.212733,0.491564,0.0,0.104559,2,1,1,3
1,0.926341,0.073659,0.23245,0.207171,0.48672,0.0,0.073659,3,1,1,3
2,0.949751,0.050249,0.206972,0.217106,0.525673,0.0,0.050249,1,1,1,3
3,0.957941,0.042059,0.20901,0.21265,0.536282,0.0,0.042059,3,1,1,3
4,0.896367,0.103633,0.175656,0.21224,0.508471,0.0,0.103633,3,1,1,3
5,0.825854,0.174146,0.134491,0.155816,0.535547,0.0,0.174146,3,1,1,3
6,0.946829,0.053171,0.208278,0.218477,0.520074,0.0,0.053171,3,1,1,3
7,0.967085,0.032915,0.236226,0.206253,0.524606,0.0,0.032915,1,1,1,3
8,0.236174,0.763826,0.041986,0.047499,0.146689,0.0,0.763826,5,2,2,5
9,0.501046,0.498954,0.089264,0.102062,0.30972,0.0,0.498954,3,1,2,5


In [13]:
# number of each alternative 1, 2, 3, 4, 5 in each cluster 
cluster_counts = data_results.groupby(['cluster', 'choice']).size().unstack(fill_value=0)
cluster_counts

choice,1,2,3,4,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,17,18,41,0,5
2,0,0,4,1,7
3,0,0,0,0,5
4,0,0,0,1,0
5,1,0,0,0,0


In [14]:
cluster_counts2 = data_results.groupby(['cluster', 'max_proba']).size().unstack(fill_value=0)
cluster_counts2

max_proba,3,4,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,81,0,0
2,1,1,10
3,0,0,5
4,0,1,0
5,1,0,0


In [15]:
nest_counts = data_results.groupby(['choice', 'max_proba']).size().unstack(fill_value=0)
nest_counts

max_proba,3,4,5
choice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,18,0,0
2,18,0,0
3,41,1,3
4,1,1,0
5,5,0,12
