In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions_NLM import estimate_nested_logit, simulate_choice
from functions_geom_DIB import geom_DIB_on_alternatives

In [12]:
# Load data
data = pd.read_csv('../data/swissmetro.dat', sep='\t')

In [13]:
data_used = data[['ID', 'TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_CO', 'SM_CO', 'CAR_CO',
                   'TRAIN_AV', 'SM_AV', 'CAR_AV', 'GA', 'CHOICE']]

In [14]:
# DEFINE MODEL STRUCTURE AND LIKELIHOOD FUNCTION
def log_likelihood_SM_CNLM(beta, data):
    alpha_SM_WITH_CAR = np.exp(beta[6]) / (1 + np.exp(beta[6]))
    alpha_SM_WITH_TRAIN = 1 - alpha_SM_WITH_CAR

    # define utility functions
    data['U_TRAIN'] = beta[2] * data['TRAIN_TT'] + beta[3] * (data['GA'] == 0) * data['TRAIN_CO']
    data['U_SM'] = beta[1] + beta[2] * data['SM_TT'] + beta[3] * (data['GA'] == 0) * data['SM_CO']
    data['U_CAR'] = beta[0] + beta[2] * data['CAR_TT'] + beta[3] * data['CAR_CO']

    # combined utility terms inside nests
    data['U_CAR_SM'] = (alpha_SM_WITH_CAR * data['SM_AV'] * np.exp(data['U_SM'])) ** (1 / beta[4]) + \
                          (1 * data['CAR_AV'] * np.exp(data['U_CAR'])) ** (1 / beta[4])
    data['U_SM_TRAIN'] = (alpha_SM_WITH_TRAIN * data['SM_AV'] * np.exp(data['U_SM'])) ** (1 / beta[5]) + \
                         (1 * data['TRAIN_AV'] * np.exp(data['U_TRAIN'])) ** (1 / beta[5])

    # Nest probabilities
    data['P_nest_CAR_SM'] = (data['U_CAR_SM'] ** beta[4]) / \
                                (data['U_CAR_SM'] ** beta[4] + data['U_SM_TRAIN'] ** beta[5])
    data['P_nest_SM_TRAIN'] = 1 - data['P_nest_CAR_SM']

    # Within nest probabilities for any nests with > 1 alt
    data['P_SM_in_CAR_SM'] = ((alpha_SM_WITH_CAR * data['SM_AV'] * np.exp(data['U_SM'])) ** (1 / beta[4])) / \
                                data['U_CAR_SM']
    data['P_CAR_in_CAR_SM'] = 1 - data['P_SM_in_CAR_SM']
    data['P_SM_in_SM_TRAIN'] = ((alpha_SM_WITH_TRAIN * data['SM_AV'] * np.exp(data['U_SM'])) ** (1 / beta[5])) / \
                                    data['U_SM_TRAIN']
    data['P_TRAIN_in_SM_TRAIN'] = 1 - data['P_SM_in_SM_TRAIN']

    # Full probabilities
    data['P_TRAIN_1'] = data['P_nest_SM_TRAIN'] * data['P_TRAIN_in_SM_TRAIN']
    data['P_SM_2'] = data['P_nest_SM_TRAIN'] * data['P_SM_in_SM_TRAIN'] + \
                        data['P_nest_CAR_SM'] * data['P_SM_in_CAR_SM']
    data['P_CAR_3'] = data['P_nest_CAR_SM'] * data['P_CAR_in_CAR_SM']

    # calculate probability for chosen alternative for each row in the data table
    data['P'] = ((data['CHOICE'] == 1) * data['P_TRAIN_1'] +
                 (data['CHOICE'] == 2) * data['P_SM_2'] +
                 (data['CHOICE'] == 3) * data['P_CAR_3'])

    # take product across choices for the same person (likelihood)
    # then take the log for log-likelihood
    LL = data.groupby('ID')['P'].prod().apply(np.log).sum()

    return -LL  # We minimize negative log-likelihood

In [15]:
# DEFINE MODEL PARAMETERS
beta = np.array([0, 0, 0, 0, 1, 1, 0])
beta_names = ["ASC_CAR", "ASC_SM", "BETA_TT", "BETA_TC", "lambda_CAR_TRAIN", "lambda_SM_TRAIN", "base_alpha_TRAIN_WITH_CAR"]


# Estimate parameters
result, se, t_stat, p_value, aic, bic  = estimate_nested_logit(data_used, beta, beta_names, log_likelihood_SM_CNLM)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_TRAIN'] = beta[2] * data['TRAIN_TT'] + beta[3] * (data['GA'] == 0) * data['TRAIN_CO']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_SM'] = beta[1] + beta[2] * data['SM_TT'] + beta[3] * (data['GA'] == 0) * data['SM_CO']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['U_CAR'] = 

Optimization Results:
                   Parameter  Estimate  Robust Asymptotic SE  t-statistic  \
0                    ASC_CAR -2.148966              0.219706    -9.781088   
1                     ASC_SM  0.206893              0.116850     1.770587   
2                    BETA_TT -0.029486              0.001467   -20.092575   
3                    BETA_TC -0.021405              0.000984   -21.752910   
4           lambda_CAR_TRAIN  8.781123              0.025130   349.425149   
5            lambda_SM_TRAIN  1.814824              0.126595    14.335714   
6  base_alpha_TRAIN_WITH_CAR -7.769435              0.506294   -15.345705   

    p-value  
0  0.000000  
1  0.076658  
2  0.000000  
3  0.000000  
4  0.000000  
5  0.000000  
6  0.000000  
AIC: 16923.274882700705
BIC: 16974.227414844798
Log-likelihood: -8454.637441350353


In [16]:
df_input = data[['TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_CO', 'SM_CO', 'CAR_CO',
                  'TRAIN_AV', 'SM_AV', 'CAR_AV', 'GA']]
# Calculate frequencies and probabilities
vector_counts = df_input.value_counts().reset_index(name='Frequency')
vector_counts['Probability'] = vector_counts['Frequency'] / len(df_input)

# Creating a tuple of attributes to facilitate mapping
vector_counts['tuple'] = vector_counts[['TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_CO', 'SM_CO', 'CAR_CO',
                                        'TRAIN_AV', 'SM_AV', 'CAR_AV', 'GA']].apply(tuple, axis=1)
probability_map = vector_counts.set_index('tuple')['Probability'].to_dict()

# Assign the probability to each row based on its tuple of attributes
df_input['Probability'] = df_input.apply(lambda row: probability_map[tuple(row)], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input['Probability'] = df_input.apply(lambda row: probability_map[tuple(row)], axis=1)


In [17]:
# Computation of p(x,y)
p_x = df_input['Probability'].values

p_y_given_x = data_used[['P_TRAIN_1', 'P_SM_2', 'P_CAR_3']].values
p_xy = p_x[:, np.newaxis] * p_y_given_x

# Normalize p_xy 
p_xy /= p_xy.sum()

# Define epsilon value
epsilon = 1e-20

# Add epsilon to elements equal to 0
p_xy[p_xy == 0] += epsilon

In [18]:
def simulate_choice_SM(row):
    """
    Function to simulate a choice based on cumulative probabilities.

    Parameters:
    - row: pandas Series or DataFrame row containing probabilities for each choice

    Returns:
    - The simulated choice (an integer between 1 and the number of choices)
    """
    # Generate a random number between 0 and 1
    random_number = np.random.rand()
    
    # Calculate cumulative probabilities
    probabilities = row[['P_TRAIN_1', 'P_SM_2', 'P_CAR_3']]
    cumulative_probs = probabilities.cumsum()
    
    # Determine simulated choice based on random number
    for i, cumulative_prob in enumerate(cumulative_probs):
        if random_number <= cumulative_prob:
            return i+1

In [19]:
np.random.seed(123)
data_used['simulated_choice'] = data_used.apply(simulate_choice_SM, axis=1)

In [20]:
betas = np.linspace(0, 50, 51)
# Initialize an empty list to store the number of clusters
num_clusters_list = []

# Iterate over each beta value
for beta in betas:
    # Run iterative_algorithm to obtain q_t_given_x
    q_t_given_x, _, _ = geom_DIB_on_alternatives(p_xy, max_iter=5000, beta=beta, threshold=1e-3)
    
    # Calculate the number of clusters
    column_sum = np.sum(q_t_given_x, axis=0)
    num_clusters = np.count_nonzero(column_sum)
    
    # Append the number of clusters to the list
    num_clusters_list.append(num_clusters)

# Plot the number of clusters against beta values
plt.plot(betas, num_clusters_list)
plt.xlabel('Beta')
plt.ylabel('Number of Clusters')
plt.title('Number of Clusters vs. Beta')
plt.grid(True)
plt.show()

Iteration: 1 out of 5000
Objective function value: 1.0986119248215014
H(T) =  1.0986119248215014
I(T;Y) =  2.380731977691397e-05
Iteration: 2 out of 5000
Objective function value: 0.7962520783544135
H(T) =  0.7962520783544135
I(T;Y) =  9.52332133330458e-06
Iteration: 3 out of 5000
Objective function value: 0.6001331339118811
H(T) =  0.6001331339118811
I(T;Y) =  6.073983753296375e-06
Iteration: 4 out of 5000
Objective function value: 0.4850734045945699
H(T) =  0.4850734045945699
I(T;Y) =  4.4589827761232215e-06
Iteration: 5 out of 5000
Objective function value: 0.40944437118304994
H(T) =  0.40944437118304994
I(T;Y) =  3.522437366898501e-06
Iteration: 6 out of 5000
Objective function value: 0.3557024077508029
H(T) =  0.3557024077508029
I(T;Y) =  2.911036464325889e-06
Iteration: 7 out of 5000
Objective function value: 0.31538826551743104
H(T) =  0.31538826551743104
I(T;Y) =  2.4805024825269584e-06
Iteration: 8 out of 5000
Objective function value: 0.2839298985028169
H(T) =  0.283929898502

KeyboardInterrupt: 

In [24]:
q_t_given_x, q_t, q_y_given_t = geom_DIB_on_alternatives(p_xy, beta=100, max_iter=5000, threshold=1e-4)

Iteration: 1 out of 5000
Objective function value: 1.09623119284381
H(T) =  1.0986119248215014
I(T;Y) =  2.380731977691397e-05
Iteration: 2 out of 5000
Objective function value: -7.3645595453648705
H(T) =  0.9331889327234133
I(T;Y) =  0.08297748478088285
Iteration: 3 out of 5000
Objective function value: -7.92121855932959
H(T) =  1.0065415250938958
I(T;Y) =  0.08927760084423486
Iteration: 4 out of 5000
Objective function value: -8.623983125605077
H(T) =  1.0482003437375436
I(T;Y) =  0.09672183469342621
Iteration: 5 out of 5000
Objective function value: -9.258150996710539
H(T) =  1.0646666474679445
I(T;Y) =  0.10322817644178484
Iteration: 6 out of 5000
Objective function value: -9.454269811431354
H(T) =  1.0616909833067347
I(T;Y) =  0.10515960794738088
Iteration: 7 out of 5000
Objective function value: -9.505284526983806
H(T) =  1.0556821806143564
I(T;Y) =  0.10560966707598163
Iteration: 8 out of 5000
Objective function value: -9.522708033753334
H(T) =  1.049971387800376
I(T;Y) =  0.105

In [25]:
import re

# Define a function to extract the number from the column name
def extract_number(column_name):
    match = re.search(r'\d+', column_name)  # Search for one or more digits in the column name
    if match:
        return int(match.group())  # Return the first occurrence of digits as an integer
    else:
        return None  # Return None if no digits are found

In [26]:
# Calculate the number of clusters
column_sum = np.sum(q_t_given_x, axis=0)
num_clusters = np.count_nonzero(column_sum)
print('Number of clusters :', num_clusters)

# Count the number of observations in each cluster
data_used['cluster'] = np.argmax(q_t_given_x, axis=1)
print('\n', data_used['cluster'].value_counts())

# number of each alternative 1, 2, 3, 4, 5 in each cluster 
cluster_counts = data_used.groupby(['cluster', 'CHOICE']).size().unstack(fill_value=0)
print('\n', cluster_counts)

data_used['max_proba'] = data_used[['P_TRAIN_1', 'P_SM_2', 'P_CAR_3']].idxmax(axis=1).apply(extract_number)
cluster_counts2 = data_used.groupby(['cluster', 'max_proba']).size().unstack(fill_value=0)
print('\n', cluster_counts2)

cluster_counts3 = data_used.groupby(['cluster', 'simulated_choice']).size().unstack(fill_value=0)
print('\n', cluster_counts3)

Number of clusters : 3

 cluster
0    4869
2    3784
1    2057
Name: count, dtype: int64

 CHOICE     1     2     3
cluster                 
0        527  3452   890
1        680  1350    27
2        216  1411  2157

 max_proba   1     2     3
cluster                  
0           0  4869     0
1          67  1990     0
2           0  1573  2211

 simulated_choice    1     2     3
cluster                          
0                 673  3113  1083
1                 542  1505    10
2                 241  1591  1952
