In [13]:
import pandas as pd
from pandas.api.types import CategoricalDtype

# Load the bike-sharing table from the gzipped CSV.
df = pd.read_csv('../bike-sharing/data.csv.gz')

# NOTE: this is a plain assignment, not a copy -- `dfc` and `df` refer to the
# same DataFrame object, so the encoding applied below is visible through
# both names.
dfc = df

# Columns whose values are replaced by their ordered-category integer codes.
categorical_columns = ["holiday", "season", "mnth", "dteday"]

for column in categorical_columns:
    # Cast to an ordered categorical (categories inferred from the data) and
    # keep only the integer code of each value.
    dfc[column] = dfc[column].astype(CategoricalDtype(ordered=True)).cat.codes

In [15]:
# Show the column labels of the loaded frame. The recorded output lists
# several 'Unnamed: 0...' columns ahead of 'instant'.
# NOTE(review): presumably these come from the CSV being re-saved with its
# index several times -- confirm and consider dropping them at load time.
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'Unnamed: 0.1.1.1.1', 'Unnamed: 0.1.1.1.1.1', 'Unnamed: 0.1.1.1.1.1.1',
       'Unnamed: 0.1.1.1.1.1.1.1', 'instant', 'dteday', 'season', 'yr', 'mnth',
       'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
       'hum', 'windspeed', 'casual', 'registered', 'cnt'],
      dtype='object')

In [5]:
from algoneer.dataset import DataSet
from algoneer.dataschema import DataSchema

In [31]:
# Column-by-column schema for the bike-rental frame.
# NOTE(review): the output recorded for this cell is
# "TypeError: DataSchema() takes no arguments" -- confirm the constructor
# signature of the installed algoneer version before relying on this call.
ds = DataSchema({
    'name' : 'Bike Rental Data',
    'url' : 'https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset',
    'description' : 'This dataset contains the hourly and daily count of rental'
                    ' bikes between years 2011 and 2012 in Capital bikeshare system'
                    ' with the corresponding weather and seasonal information.',
    'columns' : {
        # 'type' is given exactly once: a dict literal keeps only the last
        # value for a repeated key, so a second 'type' entry would silently
        # override the first. 'numerical' is the value that was effective.
        'instant' : {
            'type' : 'numerical',
        },
        'dteday' : {
            'type' : 'integer',
        },
        'season' : {
            'type' : 'categorical',
            'category_type' : 'integer',
            'categories' : [0, 1, 2, 3],
        },
        'yr' : {
            'type' : 'integer',
            'format' : 'year',
        },
        'mnth' : {
            'type' : 'integer',
            'format' : 'month',
        },
        # The UCI dataset description marks a holiday with 1 and a regular
        # day with 0, so 1 is the "true" value.
        # NOTE(review): confirm how DataSchema interprets 'true'/'false'.
        'holiday' : {
            'type' : 'boolean',
            'true' : 1,
            'false' : 0,
        },
        'weekday' : {
            'type' : 'integer',
            'format' : 'weekday',
        },
        # Per the UCI dataset description, workingday is 1 when the day is
        # neither a weekend nor a holiday, otherwise 0.
        'workingday' : {
            'type' : 'boolean',
            'true' : 1,
            'false' : 0,
        },
        'weathersit' : {
            'type' : 'categorical',
            'category_type' : 'integer',
            'categories' : [2, 1, 3]
        },
        'temp' : {
            'type' : 'numerical',
        },
        'atemp' : {
            'type' : 'numerical'
        },
        'hum' : {
            'type' : 'numerical',
        },
        'windspeed' : {
            'type' : 'numerical',
        },
        'casual' : {
            'type' : 'integer',
        },
        'registered' : {
            'type' : 'integer',
        },
        'cnt' : {
            'type' : 'integer',
        },
    }
})

TypeError: DataSchema() takes no arguments

In [32]:
# Distinct values of the 'cnt' column, in order of first appearance.
df["cnt"].unique()

array([ 985,  801, 1349, 1562, 1600, 1606, 1510,  959,  822, 1321, 1263,
       1162, 1406, 1421, 1248, 1204, 1000,  683, 1650, 1927, 1543,  981,
        986, 1416, 1985,  506,  431, 1167, 1098, 1096, 1501, 1360, 1526,
       1550, 1708, 1005, 1623, 1712, 1530, 1605, 1538, 1746, 1472, 1589,
       1913, 1815, 2115, 2475, 2927, 1635, 1812, 1107, 1450, 1917, 1807,
       1461, 1969, 2402, 1446, 1851, 2134, 1685, 1944, 2077,  605, 1872,
       2133, 1891,  623, 1977, 2132, 2417, 2046, 2056, 2192, 2744, 3239,
       3117, 2471, 2703, 2121, 1865, 2210, 2496, 1693, 2028, 2425, 1536,
       2227, 2252, 3249, 3115, 1795, 2808, 3141, 1471, 2455, 2895, 3348,
       2034, 2162, 3267, 3126,  795, 3744, 3429, 3204, 3944, 4189, 1683,
       4036, 4191, 4073, 4400, 3872, 4058, 4595, 5312, 3351, 4401, 4451,
       2633, 4433, 4608, 4714, 4333, 4362, 4803, 4182, 4864, 4105, 3409,
       4553, 3958, 4123, 3855, 4575, 4917, 5805, 4660, 4274, 4492, 4978,
       4677, 4679, 4758, 4788, 4098, 3982, 3974, 49