# Read data

This program reads in the raw data accompanying Brownstone and Train (1998), `xmat.txt`, and converts it to a CSV file, `car_data.csv`, for the students to use. 

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Short human-readable label for each column of the raw file (parallel to `varnames`)
labels = [
    'Person ID',
    'Row num',
    'Binary choice',
    'Price/ln(income)',
    'Range',
    'Acceleration',
    'Top speed',
    'Pollution',
    'Size',
    'Big enough',
    'Luggage space',
    'Operating cost',
    'Station availability',
    'Sports utility vehicle',
    'Sports car',
    'Station wagon',
    'Truck',
    'Van',
    'Constant for EV',
    'Commute*EV',
    'College*EV',
    'Constant for CNG',
    'Constant for methanol',
    'College*methanol',
    'Non-EV',
    'Non-CNG',
]

# Column names assigned to the raw file when it is read (same order as `labels`)
varnames = [
    'person_id',
    'rownum',
    'binary_choice',
    'price_to_inc',
    'range',
    'acceleration',
    'top_speed',
    'pollution',
    'size',
    'big_enough',
    'luggage_space',
    'operating_cost',
    'station_availability',
    'suv',
    'sports_car',
    'station_wagon',
    'truck',
    'van',
    'ev',
    'commute_x_ev',
    'college_x_ev',
    'cng',
    'methanol',
    'college_x_methanol',
    'not_ev',
    'not_cng',
]

# Explanatory variables: every column except the identifiers, the choice dummy,
# and not_ev / not_cng, which we drop (to avoid multicollinearity)
xvars = [
    name for name in varnames
    if name not in ('person_id', 'rownum', 'binary_choice', 'not_ev', 'not_cng')
]

In [3]:
# Load the tab-separated raw matrix. Column names are supplied explicitly via
# `varnames`, so no row of the file is consumed as a header.
tab = pd.read_csv(
    'raw_data/xmat.txt',
    sep='\t',
    header=None,
    names=varnames,
)

In [4]:
# j is the within-person index of each car, i.e. [0,1,2,3,4,5,0,1,...,5,0,...]:
# it restarts at 0 for every person_id and counts that person's rows in file order
# (six alternatives per person, 0..5).
tab['j'] = tab.groupby('person_id').cumcount()

# y is the discrete choice: the position (within the person's rows) of the car
# where binary_choice == 1, repeated on every row of that person.
# NOTE(review): argmax returns 0 when a person has no row with binary_choice == 1,
# which is indistinguishable from "chose car 0" -- confirm that every person has
# exactly one chosen row in the raw data.
tab['y'] = tab.groupby('person_id').binary_choice.transform(lambda x : x.argmax())

In [5]:
# Spot-check one respondent: exactly one row should have binary_choice == 1,
# and y should equal the j of that row.
tab.loc[tab['person_id'] == 10, ['person_id', 'binary_choice', 'j', 'y']]

Unnamed: 0,person_id,binary_choice,j,y
54,10,0,0,1
55,10,1,1,1
56,10,0,2,1
57,10,0,3,1
58,10,0,4,1
59,10,0,5,1


## Output

In [6]:
# Make person_id zero-based (the raw IDs appear to start at 1 -- TODO confirm).
# Subtracting the column minimum instead of a hard-coded 1 gives the same result
# on a fresh run, but is a no-op if this cell is executed again, whereas an
# in-place `-= 1` would shift the IDs one step further on every re-run.
tab['person_id'] = tab['person_id'] - tab['person_id'].min()

In [7]:
# Write the processed table for the students; the DataFrame index is not
# written, so the file contains only the named columns.
tab.to_csv(path_or_buf='car_data.csv', index=False)

# Labels

In [8]:
# Long-form description of every column written to car_data.csv, in the same
# order as `varnames` (26 raw columns followed by the derived `j` and `y`).
# The vehicle-size entry previously contained stray double quotes where "="
# belongs (0"mini, 0.1"subcompact, ...); it is corrected here because this text
# is written verbatim to car_labels.csv.
desc = [
    'Person identifier',
    'Row number in the dataset',
    'Dummy, =1 if this row is the car that was chosen',
    'Purchase price in thousands of dollars, divided by the natural log of household income in thousands',
    'Hundreds of miles that the vehicle can travel between refuelings/rechargings',
    'Seconds required to reach 30 mph from stop, in tens of seconds (e.g., 3 s is entered as 0.3)',
    'Highest speed that the vehicle can attain, in hundreds of miles/h (e.g., 80 mph is entered as 0.80)',
    'Tailpipe emissions as fraction of comparable new gas vehicle',
    '0=mini, 0.1=subcompact, 0.2=compact, 0.3=mid-size or large',
    '1 if household size is over 2 and vehicle size is 3; 0 otherwise',
    'Luggage space as fraction of comparable new gas vehicle',
    'Cost per mile of travel, in tens of cents per mile (e.g., 5 cents/mile is entered as 0.5.). For electric vehicles, cost is for home recharging. For other vehicles, cost is for station refueling',
    'Fraction of stations that have capability to refuel/recharge the vehicle',
    '1 for sports utility vehicle, zero otherwise',
    '1 for sports car, zero otherwise',
    '1 for station wagon, zero otherwise',
    '1 for truck, zero otherwise',
    '1 for van, zero otherwise',
    '1 for electric vehicle, zero otherwise',
    '1 if respondent commutes less than five miles each day and vehicle is electric; zero otherwise',
    '1 if respondent had some college education and vehicle is electric; zero otherwise',
    '1 for compressed natural gas vehicle, zero otherwise',
    '1 for methanol vehicle, zero otherwise',
    '1 if respondent had some college education and vehicle is methanol; zero otherwise',
    'Dummy, =1 if car is not EV',
    'Dummy, =1 if car is not CNG',
    'Index for the car number (0,1,...,5)',
    'Index for the chosen car (0,1,...,5)',
]

In [9]:
# add the extra variables we have created (j and y) to the name/label lists.
# Each entry is appended only if its name is not already present, so re-running
# this cell does not add duplicates and leave `varnames`/`labels` longer than `desc`.
for extra_name, extra_label in (('j', 'Car number'), ('y', 'Chosen car')):
    if extra_name not in varnames:
        varnames.append(extra_name)
        labels.append(extra_label)

In [10]:
# Assemble the codebook: one row per variable, holding its column name,
# short label and long description (the three lists must have equal length).
lab = pd.DataFrame(dict(variable=varnames, label=labels, description=desc))
lab.head(3)

Unnamed: 0,variable,label,description
0,person_id,Person ID,Person identifier
1,rownum,Row num,Row number in the dataset
2,binary_choice,Binary choice,"Dummy, =1 if this row is the car that was chosen"


In [11]:
# Save the codebook. `variable` is already the first column, so writing the
# frame without its integer index yields a file keyed by variable name.
lab.to_csv('car_labels.csv', index=False)