<h2> We try another NLM where nest1 = 1,2 and nest2 = 3,5. We remove 4 </h2>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions_NLM import estimate_nested_logit, find_clusters
from functions_IB import information_bottleneck, information_bottleneck_convergence, IB_curve
from functions_geom_DIB import geom_DIB, DIB_curve

In [2]:
# Define log-likelihood function for telephone data
# beta will be beta = ["ASC_1", "ASC_3", "ASC_5", "BETA_COST", "lambda_1", "lambda_2"]

def log_likelihood_telephone2(beta, data):
    # Define utility functions
    data['U_1'] = beta[0] + beta[3] * data['logcost1'] 
    data['U_2'] = beta[3] * data['logcost2'] 
    data['U_3'] = beta[1] + beta[3] * data['logcost3']
    data['U_5'] = beta[2] + beta[3] * data['logcost5']
    
    # Calculate logsum for nests with > 1 alt
    data['logsum_1'] = np.log(data['avail1'] * np.exp(data['U_1'] / beta[4])
                                        + data['avail2'] * np.exp(data['U_2'] / beta[4])
                                        + (1 - data['avail1']) * (1 - data['avail2']))
    data['logsum_2'] = np.log(data['avail3'] * np.exp(data['U_3'] / beta[5])
                                    + data['avail5'] * np.exp(data['U_5'] / beta[5])
                                    + (1 - data['avail3']) * (1 - data['avail5']))

    # Nest probabilities
    data['P_nest_1'] = np.exp(beta[4] * data['logsum_1']) / \
                                 (np.exp(beta[4] * data['logsum_1']) 
                                  + np.exp(beta[5] * data['logsum_2']))
    data['P_nest_2'] = 1 - data['P_nest_1']
    
    # Within nest probabilities for nests with > 1 alt
    data['P_1_in_nest1'] = data['avail1'] * np.exp(data['U_1'] / beta[4]) / \
                                (data['avail1'] * np.exp(data['U_1'] / beta[4]) 
                                 + data['avail2'] * np.exp(data['U_2'] / beta[4]))
    data['P_2_in_nest1'] = 1 - data['P_1_in_nest1']
    
    data['P_3_in_nest2'] = data['avail3'] * np.exp(data['U_3'] / beta[5]) / \
                            (data['avail3'] * np.exp(data['U_3'] / beta[5])
                                + data['avail5'] * np.exp(data['U_5'] / beta[5]))
    data['P_5_in_nest2'] = 1 - data['P_3_in_nest2']
    
    # Full probabilities
    data['P_1'] = data['P_nest_1'] * data['P_1_in_nest1']
    data['P_2'] = data['P_nest_1'] * data['P_2_in_nest1']
    data['P_3'] = data['P_nest_2'] * data['P_3_in_nest2']
    data['P_5'] = data['P_nest_2'] * data['P_5_in_nest2']
    
    # Calculate probability for chosen alternative for each row
    data['P'] = (data['choice'] == 1) * data['P_1'] + \
                (data['choice'] == 2) * data['P_2'] + \
                (data['choice'] == 3) * data['P_3'] + \
                (data['choice'] == 5) * data['P_5']
    
    # Replace zero probabilities with small value to avoid LL = -inf
    epsilon = 1e-20
    data.loc[data['P'] == 0, 'P'] = epsilon
    
    # Calculate log-likelihood 
    LL = data['P'].apply(np.log).sum()
    
    return -LL  # We minimize negative log-likelihood

In [3]:
# Load data
subdata = pd.read_csv('./data/telephone.dat', sep='\t')
subdata = subdata[subdata['choice'] != 4]

subdata = subdata[['cost1', 'cost2', 'cost3', 'cost5', 'avail1', 'avail2', 'avail3', 'avail5', 'choice']]

subdata['logcost1'] = np.log(subdata['cost1'])
subdata['logcost2'] = np.log(subdata['cost2'])
subdata['logcost3'] = np.log(subdata['cost3'])
subdata['logcost5'] = np.log(subdata['cost5'])

subdata = subdata.iloc[:50]

In [4]:
# Estimate parameters
beta = np.array([0, 0, 0, 0, 1, 1])
beta_names = ["ASC_1", "ASC_3", "ASC_5", "BETA_COST", "lambda_measured", "lambda_flat"]
result_sub, se_sub, t_stat_sub, p_value_sub, aic, bic = estimate_nested_logit(subdata, beta, beta_names, log_likelihood_telephone2)

Optimization Results:
         Parameter  Estimate  Robust Asymptotic SE  t-statistic   p-value
0            ASC_1 -1.597779              1.039309    -1.537348  0.131369
1            ASC_3  2.695801              1.591962     1.693383  0.097452
2            ASC_5 -1.335068              6.681584    -0.199813  0.842547
3        BETA_COST -5.543512              2.417588    -2.292992  0.026685
4  lambda_measured  0.887048              0.560797     1.581765  0.120866
5      lambda_flat  7.259750              5.707937     1.271869  0.210101
AIC: 105.13303076390868
BIC: 116.60516879647756


---
---
---
---
---
---

In [5]:
# Computation of p(x) with Monte Carlo 

data_logcost = subdata[['logcost1', 'logcost2', 'logcost3', 'logcost5']]

# Calculate the sum of the log costs for each row
avail_logcost1 = subdata['avail1'] * subdata['logcost1']
avail_logcost2 = subdata['avail2'] * subdata['logcost2']
avail_logcost3 = subdata['avail3'] * subdata['logcost3']
avail_logcost5 = subdata['avail5'] * subdata['logcost5']
data_logcost['sum_logcost'] = avail_logcost1 + avail_logcost2 + avail_logcost3 + avail_logcost5

# Number of Monte Carlo simulations
num_simulations = 10000
# Perform Monte Carlo simulation
samples = np.random.choice(data_logcost['sum_logcost'], size=(num_simulations,))
# Count occurrences of each value in the samples
unique_values, counts = np.unique(samples, return_counts=True)
# Normalize counts to obtain probability distribution
probability_distribution = counts / num_simulations
# Create a dictionary to store value-probability pairs
value_probability_dict = dict(zip(unique_values, probability_distribution))

# Add a new column with the probability of each value
data_logcost['probability_MC'] = data_logcost['sum_logcost'].map(value_probability_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_logcost['sum_logcost'] = avail_logcost1 + avail_logcost2 + avail_logcost3 + avail_logcost5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_logcost['probability_MC'] = data_logcost['sum_logcost'].map(value_probability_dict)


In [6]:
# Computation of p(x,y)

p_x_sub = data_logcost['probability_MC'].values
p_y_given_x_sub = subdata[['P_1', 'P_2', 'P_3', 'P_5']].values
p_xy_sub = p_x_sub[:, np.newaxis] * p_y_given_x_sub

# Normalize p_xy 
p_xy_sub /= p_xy_sub.sum()

# Define epsilon value
epsilon = 1e-100

# Add epsilon to elements equal to 0 to avoid numerical issues in the computation of the DIB
p_xy_sub[p_xy_sub == 0] += epsilon

In [7]:
pd.set_option('display.max_columns', None) 
subdata.head()

Unnamed: 0,cost1,cost2,cost3,cost5,avail1,avail2,avail3,avail5,choice,logcost1,logcost2,logcost3,logcost5,U_1,U_2,U_3,U_5,logsum_1,logsum_2,P_nest_1,P_nest_2,P_1_in_nest1,P_2_in_nest1,P_3_in_nest2,P_5_in_nest2,P_1,P_2,P_3,P_5,P
0,5.82,5.78,12.75,23.28,1,1,1,1,2,1.7613,1.754404,2.545531,3.147595,-11.361568,-9.725557,-11.415381,-18.783795,-10.817146,-1.263163,0.3953,0.6047,0.13654,0.86346,0.733991,0.266009,0.053974,0.341326,0.443845,0.160855,0.341326
1,3.52,5.78,12.28,23.28,1,1,1,1,3,1.258461,1.754404,2.507972,3.147595,-8.574072,-9.725557,-11.20717,-18.783795,-9.424431,-1.242032,0.658566,0.341434,0.785516,0.214484,0.739553,0.260447,0.517315,0.141252,0.252508,0.088925,0.252508
2,5.09,5.78,11.47,28.28,1,1,1,1,1,1.627278,1.754404,2.439735,3.342155,-10.618613,-9.725557,-10.828898,-19.862342,-10.652509,-1.238438,0.387332,0.612668,0.267612,0.732388,0.776315,0.223685,0.103655,0.283677,0.475623,0.137045,0.103655
3,4.75,5.78,10.46,28.28,1,1,1,1,3,1.558145,1.754404,2.347558,3.342155,-10.235372,-9.725557,-10.317916,-19.862342,-10.517439,-1.183372,0.323334,0.676666,0.360146,0.639854,0.788299,0.211701,0.116447,0.206887,0.533415,0.143251,0.533415
4,8.55,7.05,14.33,28.28,1,1,1,1,3,2.145931,1.953028,2.662355,3.342155,-13.493774,-10.826631,-12.062996,-19.862342,-12.156964,-1.367818,0.298637,0.701363,0.047121,0.952879,0.745419,0.254581,0.014072,0.284564,0.52281,0.178554,0.52281


In [8]:
q_t_given_x_sub, q_t_sub, q_y_given_t_sub = geom_DIB(p_xy_sub, beta=15, max_iter=50)

Iteration: 0 out of 50
Iteration: 1 out of 50
Iteration: 2 out of 50
Iteration: 3 out of 50
Iteration: 4 out of 50
Iteration: 5 out of 50
Iteration: 6 out of 50
Iteration: 7 out of 50
Iteration: 8 out of 50
Iteration: 9 out of 50
Iteration: 10 out of 50
Iteration: 11 out of 50
Iteration: 12 out of 50
Iteration: 13 out of 50
Iteration: 14 out of 50
Iteration: 15 out of 50
Iteration: 16 out of 50
Iteration: 17 out of 50
Iteration: 18 out of 50
Iteration: 19 out of 50
Iteration: 20 out of 50
Iteration: 21 out of 50
Iteration: 22 out of 50
Iteration: 23 out of 50
Iteration: 24 out of 50
Iteration: 25 out of 50
Iteration: 26 out of 50
Iteration: 27 out of 50
Iteration: 28 out of 50
Iteration: 29 out of 50
Iteration: 30 out of 50
Iteration: 31 out of 50
Iteration: 32 out of 50
Iteration: 33 out of 50
Iteration: 34 out of 50
Iteration: 35 out of 50
Iteration: 36 out of 50
Iteration: 37 out of 50
Iteration: 38 out of 50
Iteration: 39 out of 50
Iteration: 40 out of 50
Iteration: 41 out of 50
It

In [9]:
# Find clusters
cluster_dict = find_clusters(q_t_given_x_sub)

# Create DataFrame with row index and cluster number
df = pd.DataFrame.from_dict(cluster_dict, orient='index', columns=['Cluster'])

# Set index name
df.index.name = 'Row Index'

# Reindex the DataFrame to have row index from 0 to n
df = df.reindex(range(len(q_t_given_x_sub)))

# Count the number of unique clusters
num_clusters = len(set(cluster_dict.values()))

# Print the number of clusters
print("Number of clusters:", num_clusters)


Number of clusters: 11


In [10]:
# Create new column choice_nest which is 1 if choice = 1 or 2, and 2 otherwise
subdata['choice_nest'] = np.where(subdata['choice'].isin([1, 2]), 1, 2)
subdata['cluster'] = df['Cluster'].values
subdata['cluster'].value_counts()

cluster
4     16
1     10
5      6
3      5
11     4
2      2
6      2
9      2
7      1
8      1
10     1
Name: count, dtype: int64

In [11]:
subdata['choice_nest'].value_counts()

choice_nest
2    34
1    16
Name: count, dtype: int64

In [16]:
data_results = subdata[['P_nest_1','P_nest_2', 'P_1','P_2', 'P_3', 'P_5', 'choice', 'choice_nest', 'cluster']]

data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_5']].idxmax(axis=1).str[-1].astype(int)

#create a dataframe from mismatched_rows
data_results = pd.DataFrame(data_results)

pd.set_option('display.max_rows', None)  # Set the maximum number of rows to display to None
pd.set_option('display.max_columns', None) 
# Print the mismatched rows
data_results.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_results['max_proba'] = data_results[['P_1', 'P_2', 'P_3', 'P_5']].idxmax(axis=1).str[-1].astype(int)


Unnamed: 0,P_nest_1,P_nest_2,P_1,P_2,P_3,P_5,choice,choice_nest,cluster,max_proba
0,0.3953,0.6047,0.053974,0.341326,0.443845,0.160855,2,1,1,3
1,0.658566,0.341434,0.517315,0.141252,0.252508,0.088925,3,2,2,1
2,0.387332,0.612668,0.103655,0.283677,0.475623,0.137045,1,1,1,3
3,0.323334,0.676666,0.116447,0.206887,0.533415,0.143251,3,2,1,3
4,0.298637,0.701363,0.014072,0.284564,0.52281,0.178554,3,2,3,3


In [17]:
# number of each alternative 1, 2, 3, 4, 5 in each cluster 
cluster_counts = data_results.groupby(['cluster', 'choice']).size().unstack(fill_value=0)
cluster_counts

choice,1,2,3,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,2,6,0
2,1,0,1,0
3,0,1,4,0
4,2,1,7,6
5,0,1,4,1
6,0,0,2,0
7,0,1,0,0
8,1,0,0,0
9,0,1,1,0
10,0,1,0,0


In [18]:
cluster_counts2 = data_results.groupby(['cluster', 'max_proba']).size().unstack(fill_value=0)
cluster_counts2

max_proba,1,2,3,5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,1,9,0
2,2,0,0,0
3,0,0,5,0
4,6,0,8,2
5,0,0,6,0
6,0,0,2,0
7,0,1,0,0
8,1,0,0,0
9,0,1,1,0
10,0,1,0,0


In [19]:
nest_counts = data_results.groupby(['choice', 'max_proba']).size().unstack(fill_value=0)
nest_counts

max_proba,1,2,3,5
choice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4,0,3,0
2,1,3,5,0
3,3,1,22,0
5,1,0,5,2
