<a href="https://colab.research.google.com/github/bomnism/DA/blob/main/da17_categorical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Categorical Data

In [10]:
import numpy as np
import pandas as pd

In [11]:
# Build a synthetic ratings table: one row per user, the first half 'M' and the
# second half 'F', each with a random integer rating from 1 to 5 (the upper
# bound 6 is exclusive).
# The generator is seeded so the ratings, and every output derived from them,
# come out identical on each Restart & Run All.
N_USERS = 1_000_000
RATING_SEED = 42
rating_rng = np.random.default_rng(RATING_SEED)
movie_rating = pd.DataFrame({
    'user_id': np.arange(1, N_USERS + 1),
    'gender': ['M'] * (N_USERS // 2) + ['F'] * (N_USERS - N_USERS // 2),
    'rating': rating_rng.integers(1, 6, size=N_USERS),
})

In [12]:
# Display the frame as the cell's output; the rendering elides the middle rows.
movie_rating

Unnamed: 0,user_id,gender,rating
0,1,M,5
1,2,M,5
2,3,M,3
3,4,M,1
4,5,M,1
...,...,...,...
999995,999996,F,1
999996,999997,F,3
999997,999998,F,1
999998,999999,F,2


In [13]:
# Summarize column dtypes and memory footprint before any dtype conversion.
movie_rating.info()
#> the gender column has dtype object (strings)
#> the rating column has dtype int64 (integers)
#> memory usage: 22.9+ MB
# NOTE(review): the '+' presumably marks a lower bound, because object columns
# are not deep-inspected unless memory_usage='deep' is passed -- confirm in the
# pandas DataFrame.info documentation.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1000000 non-null  int64 
 1   gender   1000000 non-null  object
 2   rating   1000000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 22.9+ MB


In [14]:
# Count how many rows carry each distinct gender value.
movie_rating['gender'].value_counts()

M    500000
F    500000
Name: gender, dtype: int64

In [15]:
# Count how many rows carry each distinct rating value.
movie_rating['rating'].value_counts()

4    200559
2    200544
3    200109
1    199622
5    199166
Name: rating, dtype: int64

In [16]:
# Convert the gender column from object dtype to the category dtype.
gender_categorical = movie_rating['gender'].astype('category')
movie_rating['gender'] = gender_categorical

In [17]:
# Re-check dtypes and memory footprint now that gender is a category column.
movie_rating.info()
# memory usage: 16.2 MB (was 22.9+ MB while gender had dtype object)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype   
---  ------   --------------    -----   
 0   user_id  1000000 non-null  int64   
 1   gender   1000000 non-null  category
 2   rating   1000000 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 16.2 MB


In [18]:
# Convert the rating column from int64 to the category dtype.
rating_categorical = movie_rating['rating'].astype('category')
movie_rating['rating'] = rating_categorical

In [19]:
# Re-check dtypes and memory footprint now that rating is a category column too.
movie_rating.info()
# memory usage: 9.5 MB (was 16.2 MB while rating had dtype int64)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype   
---  ------   --------------    -----   
 0   user_id  1000000 non-null  int64   
 1   gender   1000000 non-null  category
 2   rating   1000000 non-null  category
dtypes: category(2), int64(1)
memory usage: 9.5 MB


# 연속형 변수에서 파생된 카테고리 변수 생성 

In [20]:
# Small demo frame: ten ids (1..10), each with a random integer age in [10, 90).
# The generator is seeded so the ages, and the binning results derived from
# them, are reproducible on every re-run.
age_rng = np.random.default_rng(seed=42)
df = pd.DataFrame({
    'id': np.arange(1, 11),
    'age': age_rng.integers(10, 90, size=10),
})

In [21]:
# Display the ten-row demo frame as the cell's output.
df

Unnamed: 0,id,age
0,1,71
1,2,61
2,3,49
3,4,58
4,5,13
5,6,16
6,7,16
7,8,82
8,9,51
9,10,74


In [22]:
# Bucket each age into a ten-year interval and store it in a new 'ages' column.
# bins: array of interval edges, here 10, 20, ..., 90.
# right: whether each interval includes its right edge (default True);
#        False gives left-closed intervals such as [10, 20).
decade_edges = np.arange(10, 100, 10)
df['ages'] = pd.cut(x=df['age'], bins=decade_edges, right=False)

In [23]:

# Count rows per decade bin. observed=False is passed explicitly so that bins
# containing no rows are still listed with a count of 0, whatever default the
# installed pandas version uses for categorical groupers (it also avoids the
# FutureWarning newer pandas emits when the argument is omitted).
# Alternative for per-bin counts: df['ages'].value_counts(sort=False)
df.groupby('ages', observed=False).size()

ages
[10, 20)    3
[20, 30)    0
[30, 40)    0
[40, 50)    1
[50, 60)    2
[60, 70)    1
[70, 80)    2
[80, 90)    1
dtype: int64


pd.cut() 함수에서 bins 파라미터의 구간 경계값들은 일정한 간격일 필요는 없다.

In [24]:
# Bucket ages into three named groups of unequal width, then display the frame.
#> bins: array (list) of interval edges
#> labels: labels (names) for the intervals
level_edges = [0, 20, 60, 100]
level_names = ['young', 'middle', 'old']
df['age_level'] = pd.cut(x=df['age'], bins=level_edges, labels=level_names)
df

Unnamed: 0,id,age,ages,age_level
0,1,71,"[70, 80)",old
1,2,61,"[60, 70)",old
2,3,49,"[40, 50)",middle
3,4,58,"[50, 60)",middle
4,5,13,"[10, 20)",young
5,6,16,"[10, 20)",young
6,7,16,"[10, 20)",young
7,8,82,"[80, 90)",old
8,9,51,"[50, 60)",middle
9,10,74,"[70, 80)",old


In [25]:
# Summarize the demo frame's column dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   id         10 non-null     int64   
 1   age        10 non-null     int64   
 2   ages       10 non-null     category
 3   age_level  10 non-null     category
dtypes: category(2), int64(2)
memory usage: 860.0 bytes


In [27]:
# Count rows per age level. observed=False is explicit so every level is listed
# (with 0 when it has no rows) regardless of the installed pandas version's
# default for categorical groupers, and so no FutureWarning is raised.
df.groupby('age_level', observed=False).size()

age_level
young     3
middle    3
old       4
dtype: int64

In [28]:
# Mean age within each age level. observed=False is explicit so the result has
# one row per level regardless of the installed pandas version's default for
# categorical groupers; a level with no rows shows NaN instead of being dropped.
df.groupby('age_level', observed=False)['age'].mean()

age_level
young     15.000000
middle    52.666667
old       72.000000
Name: age, dtype: float64