### categorical data 


In [2]:
import pandas as pd

# Zipped CSV of vehicle records hosted in the effective-pandas repository.
url = 'https://github.com/arunadas/effective-pandas/raw/main/data/vehicles.csv.zip'
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the warning a bare read_csv(url)
# raises on this file (presumably the mixed-type DtypeWarning).
df = pd.read_csv(url, low_memory=False)
# Only the `make` column is used in the rest of the notebook.
make = df.make
make

  df = pd.read_csv(url)


0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

### Frequency count

In [3]:
make.value_counts()

make
Chevrolet                      4003
Ford                           3371
Dodge                          2583
GMC                            2494
Toyota                         2071
                               ... 
Volga Associated Automobile       1
Panos                             1
Mahindra                          1
Excalibur Autos                   1
London Coach Co Inc               1
Name: count, Length: 136, dtype: int64

In [5]:
make.shape, make.nunique()

((41144,), 136)

### Benefits of categorical data (low cardinality)
- uses less memory
- computation is faster
- binning functions (`pd.cut`, `pd.qcut`) create categorical results

In [7]:
# Categorical copy of `make`; the later cells compare it against the
# original object-dtype Series.
cat_make = make.astype('category')
# Memory footprint of the original object-dtype Series, for comparison with
# the categorical's footprint in the next cell.
make.memory_usage(deep=True)

2606395

In [8]:
cat_make.memory_usage(deep=True)

95888

In [9]:
%%timeit
cat_make.str.upper()

194 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%%timeit
make.str.upper()

2.46 ms ± 18.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### conversion to ordinal

In [11]:
# Build an ordered categorical dtype whose category order is the plain
# (case-sensitive) sorted order of the distinct makes, then cast `make` to it.
make_type = pd.CategoricalDtype(
    categories=sorted(make.unique()), ordered=True)
ordered_make = make.astype(make_type)
ordered_make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

In [12]:
ordered_make.max()

'smart'

In [13]:
# max() is undefined for an unordered categorical, so pandas raises TypeError.
# The exception is the point of this cell: catch it and print it so the
# notebook still survives "Restart Kernel -> Run All".
try:
    cat_make.max()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Categorical is not ordered for operation max
you can use .as_ordered() to change the Categorical to an ordered one


In [15]:
ordered_make.sort_values()

20288    AM General
20289    AM General
369      AM General
358      AM General
19314    AM General
            ...    
31289         smart
31290         smart
29605         smart
22974         smart
26882         smart
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

### .cat accessor

In [16]:
cat_make.cat.rename_categories(
    [c.lower() for c in cat_make.cat.categories])

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general', 'asc incorporated', 'acura', 'alfa romeo', ..., 'volvo', 'wallace environmental', 'yugo', 'smart']

In [17]:
ordered_make.cat.rename_categories(
    [c.lower() for c in ordered_make.cat.categories])

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general' < 'asc incorporated' < 'acura' < 'alfa romeo' ... 'volvo' < 'wallace environmental' < 'yugo' < 'smart']

In [19]:
# Reorder the categories case-insensitively. The new ordering is derived from
# ordered_make's own categories: reorder_categories requires exactly the same
# set of categories, so borrowing them from another Series only works while
# the two happen to agree.
ordered_make.cat.reorder_categories(
    sorted(ordered_make.cat.categories, key=str.lower))

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['Acura' < 'Alfa Romeo' < 'AM General' < 'American Motors Corporation' ... 'Volvo' < 'VPG' < 'Wallace Environmental' < 'Yugo']

### Category gotchas

In [20]:
# Gotcha: value_counts() on a categorical reports every category, including
# the ones that do not occur in the slice (they show up with a count of 0).
ordered_make.iloc[:100].value_counts()

make
Dodge                        17
Oldsmobile                    8
Ford                          8
Buick                         7
Chevrolet                     5
                             ..
Grumman Allied Industries     0
Goldacre                      0
Geo                           0
Genesis                       0
smart                         0
Name: count, Length: 136, dtype: int64

In [21]:
# Gotcha: grouping by a categorical yields one group per category, even for
# categories absent from the slice (their result is NaN).
# observed=False is passed explicitly: relying on the default triggers a
# FutureWarning and the default is slated to change, which would silently
# turn this cell into the observed-only result.
cat_make.iloc[:100].groupby(cat_make.iloc[:100], observed=False).first()

  cat_make.iloc[:100].groupby(cat_make.iloc[:100]).first()


make
AM General                            NaN
ASC Incorporated                      NaN
Acura                                 NaN
Alfa Romeo                     Alfa Romeo
American Motors Corporation           NaN
                                  ...    
Volkswagen                     Volkswagen
Volvo                               Volvo
Wallace Environmental                 NaN
Yugo                                  NaN
smart                                 NaN
Name: make, Length: 136, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

In [22]:
make.iloc[:100].groupby(make.iloc[:100]).first()

make
Alfa Romeo          Alfa Romeo
Audi                      Audi
BMW                        BMW
Buick                    Buick
CX Automotive    CX Automotive
Cadillac              Cadillac
Chevrolet            Chevrolet
Chrysler              Chrysler
Dodge                    Dodge
Ferrari                Ferrari
Ford                      Ford
Hyundai                Hyundai
Infiniti              Infiniti
Lexus                    Lexus
Mazda                    Mazda
Mercury                Mercury
Nissan                  Nissan
Oldsmobile          Oldsmobile
Plymouth              Plymouth
Pontiac                Pontiac
Rolls-Royce        Rolls-Royce
Subaru                  Subaru
Toyota                  Toyota
Volkswagen          Volkswagen
Volvo                    Volvo
Name: make, dtype: object

In [25]:
# observed=True limits the groups to the categories that actually occur in
# the slice, giving the same 25 groups as the object-dtype groupby.
cat_make.iloc[:100].groupby(cat_make.iloc[:100], observed=True).first()

make
Alfa Romeo          Alfa Romeo
Audi                      Audi
BMW                        BMW
Buick                    Buick
CX Automotive    CX Automotive
Cadillac              Cadillac
Chevrolet            Chevrolet
Chrysler              Chrysler
Dodge                    Dodge
Ferrari                Ferrari
Ford                      Ford
Hyundai                Hyundai
Infiniti              Infiniti
Lexus                    Lexus
Mazda                    Mazda
Mercury                Mercury
Nissan                  Nissan
Oldsmobile          Oldsmobile
Plymouth              Plymouth
Pontiac                Pontiac
Rolls-Royce        Rolls-Royce
Subaru                  Subaru
Toyota                  Toyota
Volkswagen          Volkswagen
Volvo                    Volvo
Name: make, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

In [26]:
# Pulling a single value with .iloc returns a plain scalar, not a categorical.
ordered_make.iloc[0]

'Alfa Romeo'

In [27]:
# Selecting with a list, even a one-element list, returns a categorical Series
# that still carries all 136 categories.
ordered_make.iloc[[0]]

0    Alfa Romeo
Name: make, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

### Generalization

In [28]:
def generalize_topn(ser, n=5, other='Other'):
    """Keep the ``n`` most frequent values of ``ser``; replace the rest with ``other``.

    Parameters
    ----------
    ser : pd.Series
        Series to generalize; may be categorical or any other dtype.
    n : int, default 5
        Number of most frequent values (by ``value_counts`` order) to keep.
    other : scalar, default 'Other'
        Replacement label for every value outside the top ``n``.

    Returns
    -------
    pd.Series
        Same index as ``ser``. For categorical input the result is categorical
        with the kept values (plus ``other``) as its categories.
    """
    topn = ser.value_counts().index[:n]
    if isinstance(ser.dtype, pd.CategoricalDtype):
        # A categorical can only hold declared categories, so `other` must be
        # a category before .where() can write it. Add it only when it is not
        # already one of the kept values: duplicate categories raise ValueError.
        categories = list(topn)
        if other not in categories:
            categories.append(other)
        ser = ser.cat.set_categories(categories)
    return ser.where(ser.isin(topn), other)

cat_make.pipe(generalize_topn, n=20, other='NA')

0            NA
1            NA
2         Dodge
3         Dodge
4        Subaru
          ...  
41139    Subaru
41140    Subaru
41141    Subaru
41142    Subaru
41143    Subaru
Name: make, Length: 41144, dtype: category
Categories (21, object): ['Chevrolet', 'Ford', 'Dodge', 'GMC', ..., 'Volvo', 'Hyundai', 'Chrysler', 'NA']

In [29]:
def generalize_mapping(ser, mapping, default):
    """Collapse the values of ``ser`` into coarser labels via substring patterns.

    Parameters
    ----------
    ser : pd.Series
        Series of strings (object or categorical); it must support ``.str``.
    mapping : dict
        ``{pattern: new_label}``. Each pattern is passed to ``str.contains``
        (so it is treated as a regular expression); rows that match get
        ``new_label``. When several patterns match a row, the last one in the
        mapping wins.
    default : scalar
        Label for rows that match no pattern, including missing values.

    Returns
    -------
    pd.Series
        Categorical Series with the same index as ``ser``.
    """
    res = ser.astype(str)
    # Start from an all-False mask so an empty mapping maps every row to
    # `default` instead of handing None to .where().
    seen = pd.Series(False, index=ser.index, dtype=bool)
    for old, new in mapping.items():
        # na=False keeps the mask strictly boolean when `ser` has missing
        # values; otherwise `~mask` fails on the resulting object mask.
        mask = ser.str.contains(old, na=False)
        seen = seen | mask
        res = res.where(~mask, new)
    res = res.where(seen, default)
    return res.astype('category')

# Label a handful of US and German makes by origin; every other make
# becomes 'other'.
generalize_mapping(
    cat_make,
    {
        'Ford': 'US',
        'Tesla': 'US',
        'Chevrolet': 'US',
        'Dodge': 'US',
        'Oldsmobile': 'US',
        'Plymouth': 'US',
        'BMW': 'German',
    },
    'other',
)


0        other
1        other
2           US
3           US
4        other
         ...  
41139    other
41140    other
41141    other
41142    other
41143    other
Name: make, Length: 41144, dtype: category
Categories (3, object): ['German', 'US', 'other']

### Exercise 15.9

In [34]:
# Exercise: convert a text column into a categorical column.
# How much memory did you save?
car_make = make[:100]
car_make2 = car_make.astype('category')
# Footprint of the object-dtype sample; the next cell shows the categorical's.
car_make.memory_usage(deep=True)

6489

In [35]:
car_make2.memory_usage(deep=True)

2386

In [38]:
# Exercise: convert a numeric column into a categorical column by binning it
# (pd.cut). How much memory did you save?
# The per-make counts of the 100-row sample are cut into 4 equal-width bins.
# NOTE(review): this cell only displays the binned result; the memory
# comparison the exercise asks for is not computed here.
pd.cut(car_make.value_counts(), 4)

make
Dodge            (13.0, 17.0]
Ford               (5.0, 9.0]
Oldsmobile         (5.0, 9.0]
Buick              (5.0, 9.0]
Mazda            (0.984, 5.0]
Plymouth         (0.984, 5.0]
Chevrolet        (0.984, 5.0]
Mercury          (0.984, 5.0]
Pontiac          (0.984, 5.0]
Cadillac         (0.984, 5.0]
Volkswagen       (0.984, 5.0]
Toyota           (0.984, 5.0]
BMW              (0.984, 5.0]
Nissan           (0.984, 5.0]
Hyundai          (0.984, 5.0]
Subaru           (0.984, 5.0]
Audi             (0.984, 5.0]
Volvo            (0.984, 5.0]
CX Automotive    (0.984, 5.0]
Infiniti         (0.984, 5.0]
Alfa Romeo       (0.984, 5.0]
Chrysler         (0.984, 5.0]
Lexus            (0.984, 5.0]
Ferrari          (0.984, 5.0]
Rolls-Royce      (0.984, 5.0]
Name: count, dtype: category
Categories (4, interval[float64, right]): [(0.984, 5.0] < (5.0, 9.0] < (9.0, 13.0] < (13.0, 17.0]]