In [5]:
import pandas as pd
import requests
from io import StringIO
from skimpy import skim 
import numpy as np
import larch.numba as lx


### larch.numba is experimental, and not feature-complete ###
 the first time you import on a new system, this package will
 compile optimized binaries for your machine, which may take 
 a little while, please be patient 

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## Download or view input data files

All files are publicly shared at [here](https://unsw-my.sharepoint.com/:f:/g/personal/z5005182_ad_unsw_edu_au/ElRRKDgPUXVOrvhpTrSNfPkBql9AAmqobMmBPgLsx8EIYQ?e=B14TfJ). 

Note that, OneDrive doesn't provide shared links that can directly download shared files. The trick is to replace the last part after `?` with `download=1`. 

In [2]:
def read_csv_from_url(url):
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"}
    req = requests.get(url, headers=headers)
    data = StringIO(req.text)
    data_csv = pd.read_csv(data)
    return(data_csv)

households = read_csv_from_url("https://unsw-my.sharepoint.com/:x:/g/personal/z5005182_ad_unsw_edu_au/EbLQEsk7UelFlyUGMUc2CckBDi3KNC7kW3l6XaNOHmkdaw?download=1")
persons = read_csv_from_url("https://unsw-my.sharepoint.com/:x:/g/personal/z5005182_ad_unsw_edu_au/ESa25fnqOWxFmrwN0xNj6sMBMzFjvJM1fTccglqCc_8Dgg?download=1")
trips = read_csv_from_url("https://unsw-my.sharepoint.com/:x:/g/personal/z5005182_ad_unsw_edu_au/EWFMhJS4WtFMvvRMLLCTmDsBA3zYHeOzsGOrfIuePRw9Yg?download=1")

  data_csv = pd.read_csv(data)


In [3]:
households = households.rename(columns=str.lower)
persons = persons.rename(columns=str.lower)
trips = trips.rename(columns=str.lower)

In [31]:
bins = [0, 1, 2, 3, float('inf')]
names = ['0', '1', '2', '3+']
households['cars_cat'] = pd.cut(households.cars, bins, labels=names, include_lowest=True, right=False)
households.statistics()

Unnamed: 0,n,minimum,maximum,median,histogram,mean,stdev,zeros,positives,negatives,nonzero_minimum,nonzero_maximum,nonzero_mean,nonzero_stdev,nulls,mode
hhid,25140,,,,,,,,,,,,,,,
surveyperiod,25140,,,,,,,,,,,,,,,
travdow,25140,,,,,,,,,,,,,,,
travmonth,25140,,,,,,,,,,,,,,,
daytype,25140,,,,,,,,,,,,,,,
dwelltype,25140,,,,,,,,,,,,,,,
owndwell,25140,,,,,,,,,,,,,,,
hhsize,25140,1.0,11.0,2.0,"2022-11-21T21:37:11.447777  image/svg+xml  Matplotlib v3.6.2, https://matplotlib.org/  Histograms are green if the displayed range truncates some extreme outliers.",2.56627,1.32359,0.0,25140.0,0.0,1.0,11.0,2.56627,1.32359,,
hhinc,25140,,,,,,,,,,,,,,,
visitors,25140,0.0,12.0,0.0,,0.247772,0.804166,22057.0,3083.0,0.0,1.0,12.0,2.02043,1.30068,,


In [32]:
# prepare choice model data
hhauto = (
    households
        .filter(['hhid', 'cars_cat', 'dwelltype', 'hhsize', 'hhinc', 'owndwell'])
        .assign(
            # integerise household income (hhinc)
            hhinc = lambda x: x.hhinc.str.strip().str.replace(r'\$|,', '').str.replace('^$', '0').astype(int),
            owndwell = lambda x: x.owndwell.astype('category'),
            dwelltype = lambda x: x.dwelltype.astype('category')
        )
)

hhauto.statistics()

  hhinc = lambda x: x.hhinc.str.strip().str.replace(r'\$|,', '').str.replace('^$', '0').astype(int),


Unnamed: 0,n,histogram,nulls,mode,minimum,maximum,median,mean,stdev,zeros,positives,negatives,nonzero_minimum,nonzero_maximum,nonzero_mean,nonzero_stdev
hhid,25140,,,,,,,,,,,,,,,
cars_cat,25140,"2022-11-21T21:37:15.043336  image/svg+xml  Matplotlib v3.6.2, https://matplotlib.org/  Histograms are purple if the data is represented as discrete values.",0.0,1,,,,,,,,,,,,
dwelltype,25140,"2022-11-21T21:37:15.076857  image/svg+xml  Matplotlib v3.6.2, https://matplotlib.org/  Histograms are purple if the data is represented as discrete values.",0.0,Separate House,,,,,,,,,,,,
hhsize,25140,"2022-11-21T21:37:15.109021  image/svg+xml  Matplotlib v3.6.2, https://matplotlib.org/  Histograms are green if the displayed range truncates some extreme outliers.",,,1.0,11.0,2.0,2.56627,1.32359,0.0,25140.0,0.0,1.0,11.0,2.56627,1.32359
hhinc,25140,"2022-11-21T21:37:15.141563  image/svg+xml  Matplotlib v3.6.2, https://matplotlib.org/  Histograms are green if the displayed range truncates some extreme outliers.",,,0.0,12500.0,1725.0,1907.29,1331.08,468.0,24672.0,0.0,100.0,12500.0,1943.47,1317.22
owndwell,25140,"2022-11-21T21:37:15.180134  image/svg+xml  Matplotlib v3.6.2, https://matplotlib.org/  Histograms are purple if the data is represented as discrete values.",0.0,Fully Owned,,,,,,,,,,,,


## Prepare choice data

{larch} requires the choice data to be in a specific format.

In [192]:
df = (
    hhauto
     .filter(['hhid'])
     .merge(pd.DataFrame({'cars_choice': ['0', '1', '2', '3+']}), how='cross')
     .merge(hhauto, on = ['hhid'])
     .rename(columns={'cars_cat': 'cars'})
     .assign(cars_chosen = lambda x: np.where(x.cars == x.cars_choice, 1, 0))
     .drop(['cars'], axis = 1)
     .assign(hhid = lambda x: x.groupby(['hhid']).ngroup()+1)
)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 100560 entries, 0 to 100559
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   hhid         100560 non-null  int64   
 1   cars_choice  100560 non-null  object  
 2   dwelltype    100560 non-null  category
 3   hhsize       100560 non-null  int64   
 4   hhinc        100560 non-null  int64   
 5   owndwell     100560 non-null  category
 6   cars_chosen  100560 non-null  int64   
dtypes: category(2), int64(4), object(1)
memory usage: 4.8+ MB


Problems to be addressed
- `alt_codes` and `alt_names` cannot be specified.
- m.

In [200]:
d = lx.DataFrames(
    df.set_index(['hhid', 'cars_choice']),
    ch='cars_chosen',
    crack=True,
    # alt_codes=[0,1,2,3],
    # alt_names=['a','b','c'],
    av=True
    # alt_names=['a', 'b', 'c'],
    # alt_codes=[1, 2, 3]
)
d.info(verbose=True)
d.alternative_names()

converting data_ch to <class 'numpy.float64'>


larch.DataFrames:  (not computation-ready)
  n_cases: 25140
  n_alts: 4
  data_ca:
    - dwelltype   (100560 non-null category)
    - owndwell    (100560 non-null category)
    - cars_chosen (100560 non-null int64)
  data_co:
    - hhsize (25140 non-null int64)
    - hhinc  (25140 non-null int64)
  data_av: <populated>
  data_ch: cars_chosen


['0', '1', '2', '3+']

## Estimate a driver's licence status model

Using the persons table and larch, we estimate a MNL model for predicting the probabilities of driver's licence status.

In [208]:
m = lx.Model(dataservice=d)
m.utility_co[1] = P("ASC_1") + P("hhinc#2") * X("hhinc") + P("hhsize#2") * X("hhsize")
m.utility_co[2] = P("ASC_2") + P("hhinc#3") * X("hhinc") + P("hhsize#3") * X("hhsize")
m.utility_co[3] = P("ASC_3+") + P("hhinc#4") * X("hhinc") + P("hhsize#4") * X("hhsize")
# m.utility_co[4] = P("ASC_4+") + P("hhinc#5") * X("hhinc") + P("hhsize#5") * X("hhsize")
# m.utility_ca = PX("x2")
m.estimate()
m.parameter_summary()

req_data does not request {choice_ca,choice_co,choice_co_code} but choice is set and being provided
req_data does not request avail_ca or avail_co but it is set and being provided
converting data_co to <class 'numpy.float64'>


Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
ASC_1,3.98867,0.0,0.0,-inf,inf,0,,3.98867
ASC_2,4.786766,0.0,0.0,-inf,inf,0,,4.786766
ASC_3+,2.914152,0.0,0.0,-inf,inf,0,,2.914152
hhinc#2,-0.000454,0.0,0.0,-inf,inf,0,,-0.000454
hhinc#3,-0.000384,0.0,0.0,-inf,inf,0,,-0.000384
hhinc#4,-0.000202,0.0,0.0,-inf,inf,0,,-0.000202
hhsize#2,-0.759856,0.0,0.0,-inf,inf,0,,-0.759856
hhsize#3,-0.702653,0.0,0.0,-inf,inf,0,,-0.702653
hhsize#4,-0.351632,0.0,0.0,-inf,inf,0,,-0.351632


Unnamed: 0,Value,Std Err,t Stat,Signif,Null Value
ASC_1,3.99,0.083,48.06,***,0.0
ASC_2,4.79,0.0779,61.45,***,0.0
ASC_3+,2.91,0.0786,37.09,***,0.0
hhinc#2,-0.000454,2.26e-05,-20.06,***,0.0
hhinc#3,-0.000384,1.87e-05,-20.57,***,0.0
hhinc#4,-0.000202,1.81e-05,-11.12,***,0.0
hhsize#2,-0.76,0.0239,-31.78,***,0.0
hhsize#3,-0.703,0.0202,-34.72,***,0.0
hhsize#4,-0.352,0.0198,-17.78,***,0.0


## A minimal example of Larch with on a dummy dataset

In [145]:
test_df = (pd.DataFrame({
    'caseid': [1, 1, 1, 2, 2, 2],
    'altid': ['a', 'b', 'c', 'a', 'b', 'c'],
    'chose': [1, 0, 0, 0, 1, 0],
    'x1': [500, 500, 500, 2000, 2000, 2000],
    'x2': [123, 322, 435, 123, 435, 234],
    'x_av': [1, 1, 0, 1, 1, 1],
}).assign(altid = lambda x: x.altid.astype('category'))
      )

# test_df.reset_index().info()

In [147]:
# ds = lx.Dataset.construct.from_idca(test_df.set_index(['caseid', 'altid']))
# m = lx.Model(ds)
# m.choice_ca_code = 'cars_cat'
# ds

d = lx.DataFrames(
    test_df.set_index(['caseid', 'altid']),
    ch='chose',
    crack=True,
    # alt_codes=[1,2,3],
    # alt_names=['a','b','c'],
    av='x_av == 1'
    # alt_names=['a', 'b', 'c'],
    # alt_codes=[1, 2, 3]
)
d.info(verbose=True)
d.alternative_names()

converting data_ch to <class 'numpy.float64'>


larch.DataFrames:  (not computation-ready)
  n_cases: 2
  n_alts: 3
  data_ca:
    - chose (6 non-null int64)
    - x2    (6 non-null int64)
    - x_av  (6 non-null int64)
  data_co:
    - x1 (2 non-null int64)
  data_av: x_av == 1
  data_ch: chose


['a', 'b', 'c']

In [148]:
# d.data_co.statistics()
m = lx.Model(dataservice=d)

In [None]:
from larch import P, X, PX

In [149]:
m.utility_co[2] = P("ASC_b")  + P("x1#2") * X("x1")
m.utility_co[3] = P("ASC_c") + P("x1#3") * X("x1")
m.utility_ca = PX("x2")
m.estimate()
m.parameter_summary()

req_data does not request {choice_ca,choice_co,choice_co_code} but choice is set and being provided
req_data does not request avail_ca or avail_co but it is set and being provided
converting data_ca to <class 'numpy.float64'>
converting data_co to <class 'numpy.float64'>


Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
ASC_b,1.320741,0.0,0.0,-inf,inf,0,,1.320741
ASC_c,455870.090751,0.0,0.0,-inf,inf,0,,455870.090751
x1#2,0.152631,0.0,0.0,-inf,inf,0,,0.152631
x1#3,-519.650844,0.0,0.0,-inf,inf,0,,-519.650844
x2,-0.883112,0.0,0.0,-inf,inf,0,,-0.883112




Unnamed: 0,Value,Std Err,t Stat,Signif,Like Ratio,Null Value
ASC_b,1.32,0.0,,[],0.00,0.0
ASC_c,456000.0,0.0,,[],0.00,0.0
x1#2,0.153,0.0,,[***],274.21,0.0
x1#3,-520.0,0.0,,[***],BIG,0.0
x2,-0.883,0.0,,[***],77.64,0.0
