In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions_NLM import estimate_nested_logit, simulate_choice
from functions_geom_DIB import geom_DIB_on_alternatives

In [2]:
# Load data
data = pd.read_csv('../data/swissmetro.dat', sep='\t')

In [3]:
data_used = data[['ID', 'TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_CO', 'SM_CO', 'CAR_CO',
                   'TRAIN_AV', 'SM_AV', 'CAR_AV', 'GA', 'CHOICE']]

In [4]:
# Define log-likelihood function
def log_likelihood_SM_NLM(beta, data):
    # Define utility functions
    data['U_TRAIN'] = beta[2] * data['TRAIN_TT'] + beta[3] * (data['GA'] == 0) * data['TRAIN_CO']
    data['U_SM'] = beta[1] + beta[2] * data['SM_TT'] + beta[3] * (data['GA'] == 0) * data['SM_CO']
    data['U_CAR'] = beta[0] + beta[2] * data['CAR_TT'] + beta[3] * data['CAR_CO']
    
    # Calculate logsum for nests with > 1 alt
    data['logsum_CAR_SM'] = np.log(data['SM_AV'] * np.exp(data['U_SM'] / beta[4])
                                   + data['CAR_AV'] * np.exp(data['U_CAR'] / beta[4])
                                      + (1 - data['SM_AV']) * (1 - data['CAR_AV']))
    
    # Nest probabilities
    data['P_nest_CAR_SM'] = np.exp(beta[4] * data['logsum_CAR_SM']) / \
                               (np.exp(beta[4] * data['logsum_CAR_SM']) + data['TRAIN_AV'] * np.exp(data['U_TRAIN']))
    data['P_nest_TRAIN'] = 1 - data['P_nest_CAR_SM']
    
    # Within nest probabilities for nests with > 1 alt
    data['P_CAR_in_CAR_SM'] = (data['CAR_AV'] * np.exp(data['U_CAR'] / beta[4])) / \
                                  (data['SM_AV'] * np.exp(data['U_SM'] / beta[4]) +
                                   data['CAR_AV'] * np.exp(data['U_CAR'] / beta[4]))
    data['P_SM_in_CAR_SM'] = 1 - data['P_CAR_in_CAR_SM']
    
    # Full probabilities
    data['P_TRAIN_1'] = data['P_nest_TRAIN']
    data['P_SM_2'] = data['P_nest_CAR_SM'] * data['P_SM_in_CAR_SM']
    data['P_CAR_3'] = data['P_nest_CAR_SM'] * data['P_CAR_in_CAR_SM']
    
    # Calculate probability for chosen alternative for each row
    data['P'] = (data['CHOICE'] == 1) * data['P_TRAIN_1'] + \
                (data['CHOICE'] == 2) * data['P_SM_2'] + \
                (data['CHOICE'] == 3) * data['P_CAR_3']
    
    # Calculate log-likelihood
    LL = data.groupby('ID')['P'].prod().apply(np.log).sum()
    
    return -LL  # We minimize negative log-likelihood


In [5]:
# Define model parameters
beta = np.array([0, 0, 0, 0, 1])
beta_names = ["ASC_CAR", "ASC_SM", "BETA_TT", "BETA_TC", "lambda_CAR_SM"]

# Estimate parameters
result, se, t_stat, p_value, aic, bic  = estimate_nested_logit(data_used, beta, beta_names, log_likelihood_SM_NLM)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_TRAIN'] = beta[2] * data['TRAIN_TT'] + beta[3] * (data['GA'] == 0) * data['TRAIN_CO']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_SM'] = beta[1] + beta[2] * data['SM_TT'] + beta[3] * (data['GA'] == 0) * data['SM_CO']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_CAR'] = 

Optimization Results:
       Parameter  Estimate  Robust Asymptotic SE  t-statistic   p-value
0        ASC_CAR -0.423693              0.123328    -3.435497  0.000594
1         ASC_SM -0.152823              0.196117    -0.779244  0.435853
2        BETA_TT -0.020277              0.002313    -8.765210  0.000000
3        BETA_TC -0.013852              0.000846   -16.380949  0.000000
4  lambda_CAR_SM  2.184909              0.196259    11.132768  0.000000
AIC: 17163.234220999868
BIC: 17199.628886817078
Log-likelihood: -8576.617110499934


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_TRAIN'] = beta[2] * data['TRAIN_TT'] + beta[3] * (data['GA'] == 0) * data['TRAIN_CO']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_SM'] = beta[1] + beta[2] * data['SM_TT'] + beta[3] * (data['GA'] == 0) * data['SM_CO']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_CAR'] = 

In [6]:
df_input = data[['TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_CO', 'SM_CO', 'CAR_CO',
                  'TRAIN_AV', 'SM_AV', 'CAR_AV', 'GA']]
# Calculate frequencies and probabilities
vector_counts = df_input.value_counts().reset_index(name='Frequency')
vector_counts['Probability'] = vector_counts['Frequency'] / len(df_input)

# Creating a tuple of attributes to facilitate mapping
vector_counts['tuple'] = vector_counts[['TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_CO', 'SM_CO', 'CAR_CO',
                                        'TRAIN_AV', 'SM_AV', 'CAR_AV', 'GA']].apply(tuple, axis=1)
probability_map = vector_counts.set_index('tuple')['Probability'].to_dict()

# Assign the probability to each row based on its tuple of attributes
df_input['Probability'] = df_input.apply(lambda row: probability_map[tuple(row)], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input['Probability'] = df_input.apply(lambda row: probability_map[tuple(row)], axis=1)


In [7]:
# Computation of p(x,y)
p_x = df_input['Probability'].values

p_y_given_x = data_used[['P_TRAIN_1', 'P_SM_2', 'P_CAR_3']].values
p_xy = p_x[:, np.newaxis] * p_y_given_x

# Normalize p_xy 
p_xy /= p_xy.sum()

# Define epsilon value
epsilon = 1e-20

# Add epsilon to elements equal to 0
p_xy[p_xy == 0] += epsilon

In [8]:
def simulate_choice_SM(row):
    """
    Function to simulate a choice based on cumulative probabilities.

    Parameters:
    - row: pandas Series or DataFrame row containing probabilities for each choice

    Returns:
    - The simulated choice (an integer between 1 and the number of choices)
    """
    # Generate a random number between 0 and 1
    random_number = np.random.rand()
    
    # Calculate cumulative probabilities
    probabilities = row[['P_TRAIN_1', 'P_SM_2', 'P_CAR_3']]
    cumulative_probs = probabilities.cumsum()
    
    # Determine simulated choice based on random number
    for i, cumulative_prob in enumerate(cumulative_probs):
        if random_number <= cumulative_prob:
            return i+1

In [9]:
np.random.seed(123)
data_used['simulated_choice'] = data_used.apply(simulate_choice_SM, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_used['simulated_choice'] = data_used.apply(simulate_choice_SM, axis=1)


In [10]:
betas = np.linspace(0, 50, 51)
# Initialize an empty list to store the number of clusters
num_clusters_list = []

# Iterate over each beta value
for beta in betas:
    # Run iterative_algorithm to obtain q_t_given_x
    q_t_given_x, _, _ = geom_DIB_on_alternatives(p_xy, max_iter=5000, beta=beta, threshold=1e-3)
    
    # Calculate the number of clusters
    column_sum = np.sum(q_t_given_x, axis=0)
    num_clusters = np.count_nonzero(column_sum)
    
    # Append the number of clusters to the list
    num_clusters_list.append(num_clusters)

# Plot the number of clusters against beta values
plt.plot(betas, num_clusters_list)
plt.xlabel('Beta')
plt.ylabel('Number of Clusters')
plt.title('Number of Clusters vs. Beta')
plt.grid(True)
plt.show()

Iteration: 1 out of 5000
Objective function value: 1.0986119248215014
H(T) =  1.0986119248215014
I(T;Y) =  2.082419482851705e-05
Iteration: 2 out of 5000
Objective function value: 0.7962520783544137
H(T) =  0.7962520783544137
I(T;Y) =  8.295005773373276e-06
Iteration: 3 out of 5000
Objective function value: 0.6001331339118812
H(T) =  0.6001331339118812
I(T;Y) =  5.295277509365093e-06
Iteration: 4 out of 5000
Objective function value: 0.4850734045945699
H(T) =  0.4850734045945699
I(T;Y) =  3.888940857121881e-06
Iteration: 5 out of 5000
Objective function value: 0.4094443711830501
H(T) =  0.4094443711830501
I(T;Y) =  3.072865617292919e-06
Iteration: 6 out of 5000
Objective function value: 0.3557024077508031
H(T) =  0.3557024077508031
I(T;Y) =  2.539899662368228e-06
Iteration: 7 out of 5000
Objective function value: 0.3153882655174312
H(T) =  0.3153882655174312
I(T;Y) =  2.16449791023976e-06
Iteration: 8 out of 5000
Objective function value: 0.2839298985028171
H(T) =  0.2839298985028171
I

KeyboardInterrupt: 

In [11]:
q_t_given_x, q_t, q_y_given_t = geom_DIB_on_alternatives(p_xy, beta=8, max_iter=5000, threshold=1e-4)

Iteration: 1 out of 5000
Objective function value: 1.0984453312628732
H(T) =  1.0986119248215014
I(T;Y) =  2.082419482851705e-05
Iteration: 2 out of 5000
Objective function value: 0.5633635847660903
H(T) =  1.069000531475403
I(T;Y) =  0.06320461833866409
Iteration: 3 out of 5000
Objective function value: 0.20039708415349855
H(T) =  0.8619103086816806
I(T;Y) =  0.08268915306602276
Iteration: 4 out of 5000
Objective function value: 0.08735973684204734
H(T) =  0.7002322246087325
I(T;Y) =  0.07660906097083564
Iteration: 5 out of 5000
Objective function value: -0.05209985493488256
H(T) =  0.5312286113668256
I(T;Y) =  0.07291605828771353
Iteration: 6 out of 5000
Objective function value: -0.11318293051204231
H(T) =  0.4622952868484894
I(T;Y) =  0.07193477717006647
Iteration: 7 out of 5000
Objective function value: -0.11524276633381597
H(T) =  0.45261954760394857
I(T;Y) =  0.07098278924222057
Iteration: 8 out of 5000
Objective function value: -0.11425267547086737
H(T) =  0.45003005102906346
I

In [12]:
import re

# Define a function to extract the number from the column name
def extract_number(column_name):
    match = re.search(r'\d+', column_name)  # Search for one or more digits in the column name
    if match:
        return int(match.group())  # Return the first occurrence of digits as an integer
    else:
        return None  # Return None if no digits are found

In [13]:
# Calculate the number of clusters
column_sum = np.sum(q_t_given_x, axis=0)
num_clusters = np.count_nonzero(column_sum)
print('Number of clusters :', num_clusters)

# Count the number of observations in each cluster
data_used['cluster'] = np.argmax(q_t_given_x, axis=1)
print('\n', data_used['cluster'].value_counts())

# number of each alternative 1, 2, 3, 4, 5 in each cluster 
cluster_counts = data_used.groupby(['cluster', 'CHOICE']).size().unstack(fill_value=0)
print('\n', cluster_counts)

data_used['max_proba'] = data_used[['P_TRAIN_1', 'P_SM_2', 'P_CAR_3']].idxmax(axis=1).apply(extract_number)
cluster_counts2 = data_used.groupby(['cluster', 'max_proba']).size().unstack(fill_value=0)
print('\n', cluster_counts2)

cluster_counts3 = data_used.groupby(['cluster', 'simulated_choice']).size().unstack(fill_value=0)
print('\n', cluster_counts3)

Number of clusters : 2

 cluster
2    9012
0    1698
Name: count, dtype: int64

 CHOICE     1     2     3
cluster                 
0        651  1043     4
2        772  5170  3070

 max_proba    1     2     3
cluster                   
0          128  1570     0
2            0  7303  1709

 simulated_choice    1     2     3
cluster                          
0                 527  1171     0
2                 902  4996  3114


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_used['cluster'] = np.argmax(q_t_given_x, axis=1)
