Set up the notebook

In [1]:
import os
import csv
import json

# Absolute path of this notebook, resolved against the current working
# directory. Later cells use its folder to locate the input CSV.
notebook_path = os.path.abspath("generate_metadata.ipynb")

foo


Add the path to the CSV file you're working with, relative to the folder that contains this Python notebook:

In [2]:
# Name of the input CSV, given relative to the folder holding this notebook.
infilename = 'cleaned-final/auto-mpg.csv'  # "{your file here}.csv"
# Anchor the relative name at the notebook's folder to get the path to open.
notebook_dir = os.path.dirname(notebook_path)
infilepath = os.path.join(notebook_dir, infilename)

Fill out the fields for the data card, except for "fields" and "defaultLabelColumn" - we'll get to those later. A template is [here](https://hackmd.io/62se7jj-Qoycs__e6NjS2w).

In [3]:
# Background on where the dataset came from and how it should (not) be used.
card_context = {
    # who it was created by
    'createdBy': "Carnegie Mellon's StatLib",
    # has it been cleaned/prepared for use
    'preparation': 'Yes',
    # does it contain potential identifying/confidential information
    'confidentiality': 'No',
    # does it contain information that can identify a subgroup of people (age, race, gender)
    'subgroupIdentifiers': 'No',
    # what are potential uses (e.g. what are some successful combinations of features)?
    'potentialUses': "Determine a car's MPG based on various features like weight and horsepower.",
    # what should it not be used for?
    'potentialMisuses': "Since it's 30 years old, it may not reflect advancements in technology - don't buy your car based on this!",
}

# The basic card information.
card_info = {
    # a short description of the dataset
    'description': "A small dataset of different car statistics, originally from 1983, used to predict their MPG (miles per gallon).",
    # a link to the original source
    'source': 'UCI (http://archive.ics.uci.edu/ml/datasets/Auto+MPG)',
    # date last updated (if possible)
    'lastUpdated': '1983',
    'context': card_context,
}

# Top-level metadata record. 'fields' and 'defaultLabelColumn' are
# placeholders here; later cells fill them in.
data = {
    'fields': [],
    'defaultLabelColumn': '',
    'card': card_info,
}


Now that we've done our basic setup, let's get to the columns. Run this code, which should display the available columns in the CSV.

In [8]:
# Read every row of the CSV into memory as tuples and take the first row as
# the column names. newline='' leaves newline handling to the csv module, as
# its documentation recommends for file objects passed to csv.reader.
with open(infilepath, 'r', newline='') as infile:
    reader = csv.reader(infile, delimiter=",")
    csv_list = list(map(tuple, reader))

# Fail with a clear message instead of a bare IndexError when there are no rows.
if not csv_list:
    raise ValueError("%s is empty: there is no header row to read columns from" % infilepath)

columns = csv_list[0]
print(columns)

('mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'car name')


First, if you have a recommended investigation, enter the relevant features here: 

In [9]:
# Record the recommended investigation in one step: the feature column(s)
# to start from, and the column to use as the default label.
data.update({
    'recommended_features': ['horsepower'],
    'defaultLabelColumn': 'weight',
})

Assign each column to a list of either continuous or categorical data, e.g.

```python
continuous = ['temperature', 'score']
categorical = ['state', 'color']
```

In [10]:

    # Declare how each CSV column should be typed in the metadata.
    continuous = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']
    categorical = ['model year', 'car name']
    # reset fields in case you're making changes, so re-running this cell
    # never appends duplicate entries
    data['fields'] = []
    for column in columns:
        # A column listed in both lists is treated as continuous (checked first).
        if column in continuous:
            field_type = 'continuous'
        elif column in categorical:
            field_type = 'categorical'
        else:
            # Format the column name into the message. Passing it as a second
            # Exception argument alongside a %f placeholder never interpolated it.
            raise ValueError("You forgot to set a type for column %r" % (column,))
        data['fields'].append({'type': field_type, 'id': column})

    print('columns: ', columns)
    print('column count: ', len(columns))
    

columns:  ('mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'car name')
column count:  8


We've set our columns for the interface, now let's just add descriptions as a list - e.g. if our columns are `['Year', 'Temperature']` our list might be `['The year the measurement was taken', 'The temperature in Celsius']`. Do this in the same order the columns are printed above.

In [12]:
    # One description per column, in the same order as the columns printed above.
    desc = ['Miles per gallon.', 'The number of cylinders in the engine.', 'A measure of cylinder volume.', 'The power of the car.', "The car's weight in pounds.", '', 'The year the car was released.', 'The name of the car: all unique values']
    # set this to True to allow fewer descriptions than columns; columns past
    # the end of the list are then left without a 'description' entry
    ignore_empty = False
    if len(desc) < len(data['fields']) and not ignore_empty:
        print('number of descriptions: ', len(desc))
        raise Exception("You don't have a description for each column!")
    for idx, field in enumerate(data['fields']):
        # Compare by value, not identity: `is not` on two strings depends on
        # interning. A description that merely repeats the column id is skipped.
        if idx < len(desc) and desc[idx] != field['id']:
            field['description'] = desc[idx]
    print(data['fields'])

[{'type': 'continuous', 'id': 'mpg', 'description': 'Miles per gallon.'}, {'type': 'continuous', 'id': 'cylinders', 'description': 'The number of cylinders in the engine.'}, {'type': 'continuous', 'id': 'displacement', 'description': 'A measure of cylinder volume.'}, {'type': 'continuous', 'id': 'horsepower', 'description': 'The power of the car.'}, {'type': 'continuous', 'id': 'weight', 'description': "The car's weight in pounds."}, {'type': 'continuous', 'id': 'acceleration', 'description': ''}, {'type': 'categorical', 'id': 'model year', 'description': 'The year the car was released.'}, {'type': 'categorical', 'id': 'car name', 'description': 'The name of the car: all unique values'}]


Take a look and make sure everything is right, and if you're confident, we can write our data to a JSON file.

In [13]:
    print(infilename)
    # Swap only the final extension for .json. Splitting on the first '.'
    # would truncate names such as './data/cars.csv' or 'cars.v2.csv'.
    # The path stays relative, so the file is written relative to the
    # current working directory, as before.
    outfilename = os.path.splitext(infilename)[0] + '.json'
    with open(outfilename, 'w') as outfile:
        json.dump(data, outfile)

cleaned-final/auto-mpg.csv
