# Working with Categorical Data in Python

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

What does it mean to be "categorical"?
- Finite number of groups (or categories)
- These categories are usually fixed or known (eye color, hair color)
- Known as qualitative data

Categorical data can be broken down into two main types:
- **ordinal** (having a natural rank order - Strongly Disagree / Disagree / Neutral / Agree / Strongly Agree)
- **nominal** (cannot be placed into a natural order - Blue / Green / Brown / Red)

Notice that income brackets such as `$0 - $10 000`, `$10 000 - $20 000`, `$20 000 - $30 000` are also an example of **ordinal categorical data**: each person falls into exactly one bracket, and the brackets have a natural order.

In [2]:
# Load the Adult dataset from CSV and print a per-column summary
# (non-null counts and dtypes).
adults = pd.read_csv("databases/adult.csv")
adults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              32561 non-null  int64 
 1   Workclass        32561 non-null  object
 2   fnlgwt           32561 non-null  int64 
 3   Education        32561 non-null  object
 4   Education Num    32561 non-null  int64 
 5   Marital Status   32561 non-null  object
 6   Occupation       32561 non-null  object
 7   Relationship     32561 non-null  object
 8   Race             32561 non-null  object
 9   Sex              32561 non-null  object
 10  Capital Gain     32561 non-null  int64 
 11  Capital Loss     32561 non-null  int64 
 12  Hours/Week       32561 non-null  int64 
 13  Country          32561 non-null  object
 14  Above/Below 50k  32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Exploring "Marital Status" column

In [3]:
# Summary of the column: row count, number of unique values,
# the most frequent value and how often it occurs.
adults["Marital Status"].describe()

count                   32561
unique                      7
top        Married-civ-spouse
freq                    14976
Name: Marital Status, dtype: object

In [4]:
# Absolute frequency of each marital status.
adults["Marital Status"].value_counts()

Marital Status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64

In [5]:
# Relative frequency: normalize=True returns proportions instead of counts.
adults["Marital Status"].value_counts(normalize=True)

Marital Status
Married-civ-spouse       0.459937
Never-married            0.328092
Divorced                 0.136452
Separated                0.031479
Widowed                  0.030497
Married-spouse-absent    0.012837
Married-AF-spouse        0.000706
Name: proportion, dtype: float64

## Categorical data in pandas

In [6]:
# dtype of every column in the frame.
adults.dtypes

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object

In [7]:
# dtype of a single column.
adults["Marital Status"].dtype

dtype('O')

pandas uses the capital O to represent the object dtype

To convert it into categorical dtype:

In [8]:
# Cast the column to the categorical dtype through a column -> dtype mapping,
# then inspect the resulting dtype.
adults = adults.astype({"Marital Status": "category"})
adults["Marital Status"].dtype

CategoricalDtype(categories=[' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
                  ' Married-spouse-absent', ' Never-married', ' Separated',
                  ' Widowed'],
, ordered=False, categories_dtype=object)

**Creating a categorical Series**

In [9]:
# Build a plain Series first, then cast it to the categorical dtype.
my_data = ["A", "A", "C", "B", "C", "A"]

my_series1 = pd.Series(my_data).astype("category")
print(my_series1)

0    A
1    A
2    C
3    B
4    C
5    A
dtype: category
Categories (3, object): ['A', 'B', 'C']


Another way which allows us to set order

In [10]:
my_data = ["A", "A", "C", "B", "C", "A"]
# Describe the categories and their rank (C < B < A) in a CategoricalDtype
# and hand that to pd.Categorical.
my_series2 = pd.Categorical(
    my_data,
    dtype=pd.CategoricalDtype(categories=["C", "B", "A"], ordered=True),
)
my_series2

['A', 'A', 'C', 'B', 'C', 'A']
Categories (3, object): ['C' < 'B' < 'A']

**Why do we use categorical: memory**

In [11]:
# Compare the memory footprint of the same column stored as object vs. category.
# The column is cast back to object first so both measurements start from the
# same data, and it ends up as category again for the cells that follow.
#
# The f-strings use single quotes for the column key: reusing the outer double
# quotes inside the braces is only valid from Python 3.12 on and is a
# SyntaxError on earlier interpreters.
#
# Note: for an object column `.nbytes` counts only the 8-byte references
# (32561 rows * 8 = 260488), not the strings they point to.
adults["Marital Status"] = adults["Marital Status"].astype("object")
print(f"Object: {adults['Marital Status'].nbytes}")
adults["Marital Status"] = adults["Marital Status"].astype("category")
print(f"Category: {adults['Marital Status'].nbytes}")

Object: 260488
Category: 32617


**Specify dtypes when reading data**

In [12]:
# Declare the categorical column up front so read_csv parses it as category
# directly, instead of loading it as object and converting afterwards.
adults_dtypes = {"Marital Status": "category"}
adults = pd.read_csv(filepath_or_buffer="databases/adult.csv", dtype=adults_dtypes)
adults["Marital Status"].dtype

CategoricalDtype(categories=[' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
                  ' Married-spouse-absent', ' Never-married', ' Separated',
                  ' Widowed'],
, ordered=False, categories_dtype=object)

In [13]:
# Confirm that only "Marital Status" was read as category.
adults.dtypes

Age                   int64
Workclass            object
fnlgwt                int64
Education            object
Education Num         int64
Marital Status     category
Occupation           object
Relationship         object
Race                 object
Sex                  object
Capital Gain          int64
Capital Loss          int64
Hours/Week            int64
Country              object
Above/Below 50k      object
dtype: object

### Grouping data by category in pandas

In [47]:
# Group the rows by income bracket and average every numeric column.
# pandas is already imported in the notebook's first cell, so it is not
# re-imported here. numeric_only=True skips the string/categorical columns,
# for which a mean is undefined.
groupby_object = adults.groupby(by=["Above/Below 50k"], observed=False)
groupby_object.mean(numeric_only=True)

Unnamed: 0_level_0,Age,fnlgwt,Education Num,Capital Gain,Capital Loss,Hours/Week
Above/Below 50k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
<=50K,36.783738,190340.86517,9.595065,148.752468,53.142921,38.84021
>50K,44.249841,188005.0,11.611657,4006.142456,195.00153,45.473026


In [48]:
# Total age and total years of education within each income bracket.
(
    adults
    .groupby(by=["Above/Below 50k"], observed=False)[["Age", "Education Num"]]
    .sum()
)

Unnamed: 0_level_0,Age,Education Num
Above/Below 50k,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,909294,237190
>50K,346963,91047


In [49]:
# Number of rows for every (income bracket, marital status) combination.
adults.groupby(by=["Above/Below 50k", "Marital Status"], observed=False).size()

Above/Below 50k  Marital Status       
<=50K            Divorced                  3980
                 Married-AF-spouse           13
                 Married-civ-spouse        8284
                 Married-spouse-absent      384
                 Never-married            10192
                 Separated                  959
                 Widowed                    908
>50K             Divorced                   463
                 Married-AF-spouse           10
                 Married-civ-spouse        6692
                 Married-spouse-absent       34
                 Never-married              491
                 Separated                   66
                 Widowed                     85
dtype: int64

## Setting category variables

In [4]:
# Load the shelter-dogs dataset from CSV and print a per-column summary
# (non-null counts and dtypes).
dogs = pd.read_csv("databases/ShelterDogs.csv")
dogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2937 entries, 0 to 2936
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 2937 non-null   int64  
 1   name               2845 non-null   object 
 2   age                2937 non-null   float64
 3   sex                2937 non-null   object 
 4   breed              2937 non-null   object 
 5   date_found         2937 non-null   object 
 6   adoptable_from     2937 non-null   object 
 7   posted             2937 non-null   object 
 8   color              2937 non-null   object 
 9   coat               2937 non-null   object 
 10  size               2937 non-null   object 
 11  neutered           1852 non-null   object 
 12  housebroken        460 non-null    object 
 13  likes_people       1999 non-null   object 
 14  likes_children     1219 non-null   object 
 15  get_along_males    1633 non-null   object 
 16  get_along_females  1673 

In [52]:
# Cast "coat" to category via a column -> dtype mapping, then list how often
# each coat type occurs (missing values would be counted too).
dogs = dogs.astype({"coat": "category"})
dogs["coat"].value_counts(dropna=False)

coat
short         1972
medium         565
wirehaired     220
long           180
Name: count, dtype: int64

**The `.cat` accessor object**

`Series.cat.method_name`

Common parameters:
- `new_categories`: a list of categories
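- `inplace`: Boolean - whether or not the update should overwrite the Series (deprecated in pandas 1.3 and removed in pandas 2.0; assign the result back to the column instead)
- `ordered`: Boolean - whether or not the categorical is treated as an ordered categorical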

Using `new_categories`

In [54]:
# set_categories keeps only the listed categories. Rows whose value is not in
# the list (here "wirehaired") are not removed; their value becomes NaN.
dogs["coat"] = dogs["coat"].cat.set_categories(new_categories=["short", "medium", "long"])

In [56]:
# dropna=False keeps the NaN bucket visible, i.e. the rows whose former
# category is no longer in the category list.
dogs["coat"].value_counts(dropna=False)

coat
short     1972
medium     565
NaN        220
long       180
Name: count, dtype: int64

In [57]:
# Same three categories, now flagged as ordered: short < medium < long.
dogs["coat"] = dogs["coat"].cat.set_categories(
    new_categories=["short", "medium", "long"], ordered=True
)
dogs["coat"].iloc[:3]

0    short
1    short
2    short
Name: coat, dtype: category
Categories (3, object): ['short' < 'medium' < 'long']

**Adding categories** 

In [64]:
# Frequency table for "likes_people", including missing values.
dogs["likes_people"].value_counts(dropna=False)

Index(['no', 'yes', 'did not check', 'could not tell'], dtype='object')

In [None]:
# Do not run: the DataFrame used here already has these categories, and
# add_categories raises a ValueError for a category that already exists.
dogs["likes_people"] = (
    dogs["likes_people"]
    .astype("category")
    .cat.add_categories(new_categories=["did not check", "could not tell"])
)

In [68]:
# List the categories currently defined for the column
# (including any that no row uses).
dogs["likes_people"].cat.categories

Index(['no', 'yes', 'did not check', 'could not tell'], dtype='object')

In [69]:
# Categories that exist but are not used by any row show up with a count of 0.
dogs["likes_people"].value_counts(dropna=False)

likes_people
yes               1991
NaN                938
no                   8
did not check        0
could not tell       0
Name: count, dtype: int64

**Removing categories**

In [None]:
# Again, do not run: "wirehaired" is no longer among the coat categories after
# the set_categories call earlier in this notebook.
dogs["coat"] = (
    dogs["coat"]
    .astype("category")
    .cat.remove_categories(removals=["wirehaired"])
)

In [73]:
# Categories currently defined for "coat".
dogs["coat"].cat.categories

Index(['short', 'medium', 'long'], dtype='object')

**Methods recap:**
- `cat.set_categories()`
- `cat.add_categories()`
- `cat.remove_categories()`

## Updating Categories

In [74]:
# Cast "breed" to category via a column -> dtype mapping and show how often
# each breed occurs.
dogs = dogs.astype({"breed": "category"})
dogs["breed"].value_counts()

breed
Unknown Mix                                 1524
German Shepherd Dog Mix                      190
Dachshund Mix                                147
Labrador Retriever Mix                        83
Staffordshire Terrier Mix                     62
                                            ... 
English Cocker Spaniel, Vizsla Mix             1
English Greyhound Mix                          1
English Greyhound, Spanish Greyhound Mix       1
Fox Terrier, German Shepherd Dog Mix           1
Yorkshire Terrier                              1
Name: count, Length: 277, dtype: int64

We want to rename the `Unknown Mix` category to `Unknown`.

**Renaming categories**

`Series.cat.rename_categories(new_categories=dict)`

In [76]:
# Mapping of old category name -> new category name; categories that are not
# mentioned keep their current name.
my_changes = {"Unknown Mix": "Unknown"}
dogs["breed"] = dogs["breed"].cat.rename_categories(new_categories=my_changes)
dogs["breed"].value_counts()

breed
Unknown                                     1524
German Shepherd Dog Mix                      190
Dachshund Mix                                147
Labrador Retriever Mix                        83
Staffordshire Terrier Mix                     62
                                            ... 
English Cocker Spaniel, Vizsla Mix             1
English Greyhound Mix                          1
English Greyhound, Spanish Greyhound Mix       1
Fox Terrier, German Shepherd Dog Mix           1
Yorkshire Terrier                              1
Name: count, Length: 277, dtype: int64

**Renaming categories with a function**

In [9]:
# rename_categories also accepts a callable, applied to every category name;
# str.title capitalises each one.
dogs["sex"] = dogs["sex"].astype("category").cat.rename_categories(str.title)
dogs["sex"].cat.categories

Index(['Female', 'Male'], dtype='object')

Collapsing categories

We need this to give multiple categories the same name and collapse them together. Notice that here we do not use `.cat` in any way, we simply work with strings and then convert an object into a `category`

In [13]:
# Both source labels collapse onto the same target label, "medium".
update_coats = dict.fromkeys(["wirehaired", "medium-long"], "medium")

# Replace the labels, store the result in a new column and make it categorical.
dogs["coat_collapsed"] = dogs["coat"].replace(update_coats).astype("category")

# Frequency table of the collapsed column.
print(dogs["coat_collapsed"].value_counts())

coat_collapsed
short     1972
medium     785
long       180
Name: count, dtype: int64


## Reordering categories

In [18]:
# Make sure the column is categorical, then put the existing categories into
# the given order and mark the column as ordered.
dogs["coat"] = (
    dogs["coat"]
    .astype("category")
    .cat.reorder_categories(
        new_categories=["short", "medium", "wirehaired", "long"], ordered=True
    )
)

The order below will be the same as we have specified it to be

In [22]:
# Mean age per coat type; the groups are listed in category order.
dogs.groupby(by=["coat"], observed=False)["age"].mean()

coat
short         8.364746
medium        9.027982
wirehaired    8.424136
long          9.552056
Name: age, dtype: float64

Passing `ordered=False` means the column is not treated as ordinal, but grouped output still follows the category order we specified.

In [24]:
# New display order for the categories, without making the column ordinal.
dogs["coat"] = dogs["coat"].cat.reorder_categories(
    new_categories=["short", "medium", "long", "wirehaired"], ordered=False
)

# The grouped result lists the coat types in the order set above.
dogs.groupby(by=["coat"], observed=False)["age"].mean()

coat
short         8.364746
medium        9.027982
long          9.552056
wirehaired    8.424136
Name: age, dtype: float64

Example from tasks 

In [29]:
# Make "size" an ordered categorical: small < medium < large.
dogs["size"] = (
    dogs["size"]
    .astype("category")
    .cat.reorder_categories(new_categories=["small", "medium", "large"], ordered=True)
)

# One grouping by size serves both questions below.
by_size = dogs.groupby("size", observed=False)

# How many Male/Female dogs are available of each size?
print(by_size["sex"].value_counts())

# Do larger dogs need more room to roam?
print(by_size["keep_in"].value_counts())

size    sex   
small   Male       260
        Female     214
medium  Male      1090
        Female     854
large   Male       331
        Female     188
Name: count, dtype: int64
size    keep_in             
small   both flat and garden    238
        flat                     80
        garden                   21
medium  both flat and garden    795
        garden                  317
        flat                     97
large   both flat and garden    191
        garden                  172
        flat                      5
Name: count, dtype: int64


## Cleaning and accessing data

**Identifying issues:**
- `Series.cat.categories` (an attribute, not a method)
- `Series.value_counts()`

In [None]:
# Frequency table of the raw labels; inconsistent spellings show up as
# separate entries.
dogs["get_along_cats"].value_counts()

Imagine that we get:
- No 2503
- yes 156
- no 156
- Noo 2
-   NO 1

Fixing issues with whitespace

In [31]:
# Remove leading and trailing whitespace from every label.
dogs["get_along_cats"] = dogs["get_along_cats"].str.strip()

Fixing issues with capitalization (`title()`, `upper()`, `lower()`)

In [None]:
# Normalise capitalisation to title case.
dogs["get_along_cats"] = dogs["get_along_cats"].str.title()

Fixing issues: misspelled words

In [33]:
# Map the misspelled label onto the correct one.
replace_map = {"Noo": "No"}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in recent pandas
# and leaves `dogs` unchanged once Copy-on-Write is enabled.
dogs["get_along_cats"] = dogs["get_along_cats"].replace(replace_map)

Remember to convert the Series back to `category` afterwards: the `.str` methods return an `object`-dtype Series, not a categorical one.

**Searching for a string**

In [34]:
# Boolean mask: True where the breed contains the literal text "Shepherd"
# (regex=False turns off regular-expression matching).
dogs["breed"].str.contains("Shepherd", regex=False)

0       False
1       False
2       False
3       False
4       False
        ...  
2932    False
2933    False
2934    False
2935    False
2936     True
Name: breed, Length: 2937, dtype: bool

**Accessing data with loc**

In [35]:
# Preview the first five rows.
dogs.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,...,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in,main_color,coat_collapsed
0,23807,Gida,0.25,Female,Unknown Mix,12/10/19,12/11/19,12/11/19,red,short,...,no,,,,,,,,red,short
1,533,Frida És Ricsi,0.17,Female,Unknown Mix,12/1/19,12/1/19,12/9/19,black and white,short,...,no,,yes,yes,yes,yes,yes,,black,short
2,23793,,4.0,Male,Unknown Mix,12/8/19,12/23/19,12/8/19,saddle back,short,...,no,,,,,,,,saddle back,short
3,23795,,1.0,Male,Unknown Mix,12/8/19,12/23/19,12/8/19,yellow-brown,medium,...,no,,,,,,,,yellow-brown,medium
4,23806,Amy,2.0,Female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,...,no,,,,,,,,black,short


In [None]:
# .loc[row_mask, column]: the "size" of every dog whose "get_along_cats" is "Yes".
dogs.loc[dogs["get_along_cats"] == "Yes", "size"]

Exercise example

In [36]:
# Correction for the misspelled label.
replace_map = {"Malez": "male"}

# One pipeline, same order of steps: fix the misspelling, strip surrounding
# whitespace, lowercase everything, then convert to a categorical Series.
dogs["sex"] = (
    dogs["sex"]
    .replace(replace_map)
    .str.strip()
    .str.lower()
    .astype("category")
)

print(dogs["sex"].value_counts())

sex
male      1681
female    1256
Name: count, dtype: int64


Second task

In [None]:
# Print the category of the coat for ID 23807.
# ID is an ordinary column and the frame keeps its default RangeIndex
# (0..2936), so the lookup filters on the column. Comparing dogs.index to
# 23807 matches no row and prints an empty Series.
print(dogs.loc[dogs["ID"] == 23807, "coat"])

In [37]:
# Count male and female dogs among those with a "long" coat.
print(
    dogs.loc[dogs["coat"].eq("long"), "sex"]
    .value_counts()
)

sex
male      124
female     56
Name: count, dtype: int64


In [38]:
# Mean age of the dogs whose breed is exactly "English Cocker Spaniel".
print(
    dogs.loc[dogs["breed"].eq("English Cocker Spaniel"), "age"]
    .mean()
)

8.186153846153847


In [40]:
# Number of dogs whose breed contains the literal text "English":
# summing the boolean mask counts its True entries.
print(dogs["breed"].str.contains("English", regex=False).sum())

35
