In [1]:
import pandas as pd
import numpy as np

class display(object):
    """Show several objects next to each other, as HTML or as plain text.

    Each positional argument is a string containing a Python expression
    (typically just a variable name). The expression text is used as a
    caption and the evaluated object is rendered beneath it.
    """

    # HTML wrapper for a single object: {0} is the expression text shown as a
    # monospace caption, {1} is the object's own HTML representation.
    # Both the <p> and the <div> are closed so that consecutive items float
    # side by side instead of nesting inside one another.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Store the expression strings; they are evaluated at render time.
        self.args = args

    def _repr_html_(self):
        """Return the HTML for all expressions, one template instance each."""
        # NOTE(review): eval() runs arbitrary code. Only pass expression
        # strings written by the notebook author, never external input.
        return '\n'.join(
            self.template.format(a, eval(a)._repr_html_())
            for a in self.args
        )

    def __repr__(self):
        """Return the plain-text fallback: expression text, then repr()."""
        # NOTE(review): same eval() caveat as in _repr_html_.
        return '\n\n'.join(
            a + '\n' + repr(eval(a))
            for a in self.args
        )

In [2]:
import seaborn as sns
# Load the 'planets' example dataset through seaborn, then show its
# (rows, columns) shape.
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [3]:
# Preview the first rows of the planets table
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [5]:
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)  # random number generator seeded with 42
ser = pd.Series(rng.rand(5))    # Series built from 5 random numbers
ser                     # display the Series

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [6]:
# Sum of all values in the Series (a single scalar)
ser.sum()

2.811925491708157

In [7]:
# Mean of all values in the Series (a single scalar)
ser.mean()

0.5623850983416314

In [8]:

# Draws continue from the `rng` generator created in an earlier cell, so the
# values depend on the cells having been run in order.
df = pd.DataFrame({
    'A': rng.rand(5),  # column A: 5 random numbers
    'B': rng.rand(5)   # column B: 5 random numbers
})
df  # display the resulting DataFrame

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [9]:
# Mean of each row of df (averaging across the columns)
df.mean(axis='columns')

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [10]:
# Drop rows with missing values, then show summary statistics per column
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [14]:
# Small example frame: 'key' cycles through A, B, C twice and 'data' counts 0..5
df = pd.DataFrame(
    {
        'key': list('ABC') * 2,
        'data': range(6),
    },
    columns=['key', 'data'],
)

df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [15]:
# groupby() alone returns a DataFrameGroupBy object; nothing is computed
# until an aggregation is applied to it.
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd9aaad8340>

In [None]:
# Example of creating a groupby object on which group operations can be performed
grouped_df = df.groupby('key')
print(grouped_df)
# Example output:
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd9aaa149d0>

In [16]:
# Sum the 'data' values within each key
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [17]:
# Column indexing: start from the groupby object keyed on 'method'
planets.groupby('method')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd9a9fd36a0>

In [18]:
# Selecting one column from the groupby object yields a SeriesGroupBy
planets.groupby('method')['orbital_period']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fd9a9eb81f0>

In [19]:
# Median orbital period for each discovery method
planets.groupby('method')['orbital_period'].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [20]:
# Iterating over groups: each step yields the group label and its sub-frame
for method, group in planets.groupby('method'):
    print(f"{method:30s} shape = {group.shape}")

Astrometry                     shape = (2, 6)
Eclipse Timing Variations      shape = (9, 6)
Imaging                        shape = (38, 6)
Microlensing                   shape = (23, 6)
Orbital Brightness Modulation  shape = (3, 6)
Pulsar Timing                  shape = (5, 6)
Pulsation Timing Variations    shape = (1, 6)
Radial Velocity                shape = (553, 6)
Transit                        shape = (397, 6)
Transit Timing Variations      shape = (4, 6)


In [21]:
# Dispatch method: describe() is applied to the 'year' values of each group,
# then unstack() reshapes the per-method table into a single Series
planets.groupby('method')['year'].describe().unstack()

       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

In [22]:
# Aggregate, filter, transform, apply

In [24]:
# Generator seeded with 0 so the random 'data2' column is reproducible
rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        'key': list('ABC') * 2,                 # A, B, C repeated twice
        'data1': range(6),                      # integers 0..5
        'data2': rng.randint(0, 10, size=6),    # six random integers in [0, 10)
    },
    columns=['key', 'data1', 'data2'],
)

df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [27]:
# Aggregation: min, median and max of every data column, per key.
# The aggregations are named by string so pandas uses its own groupby
# implementations; passing the builtins min/max or np.median is deprecated
# in recent pandas releases and emits a FutureWarning.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [28]:
# Per-column aggregation: minimum of data1 and maximum of data2 for each key
df.groupby('key').agg(
    data1=('data1', 'min'),
    data2=('data2', 'max'),
)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [29]:
# Filtering

def filter_func(x):
    """Return whether the standard deviation of the group's 'data2' column exceeds 4."""
    data2_std = x['data2'].std()
    return data2_std > 4

print(df)  # show the full frame for reference

print(df.groupby('key').std())  # standard deviation of 'data1' and 'data2' within each key

print(df.groupby('key').filter(filter_func))  # keep only the groups whose 'data2' standard deviation exceeds 4

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641
  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9


In [30]:
# Transformation: subtract each group's mean from its values, so the result
# keeps the same row index as the input
df.groupby('key').transform(lambda x : x - x.mean())

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [31]:
def norm_by_data2(x):
    """Divide the group's 'data1' column by the sum of its 'data2' column.

    Parameters
    ----------
    x : DataFrame
        The rows of one group; must contain 'data1' and 'data2' columns.

    Returns
    -------
    DataFrame
        A new frame with the same columns as ``x`` in which 'data1' has been
        normalized. ``x`` itself is left unmodified, so the frame being
        grouped is not altered and the integer 'data1' column is not
        overwritten in place with float values.
    """
    return x.assign(data1=x['data1'] / x['data2'].sum())

print(df)
# group_keys=False is passed explicitly so the result keeps the original row
# index (no extra 'key' index level). Relying on the default here triggers a
# pandas warning about the changed behaviour of apply() on transform-like
# functions.
print(df.groupby('key', group_keys=False).apply(norm_by_data2))

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
  key     data1  data2
0   A  0.000000      5
1   B  0.142857      0
2   C  0.166667      3
3   A  0.375000      3
4   B  0.571429      7
5   C  0.416667      9


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  print(df.groupby('key').apply(norm_by_data2))


In [32]:
# Grouping by a list: element i of L is the group label for row i of df
L = [0, 1, 0, 1, 2, 0]
print(df)
# numeric_only=True restricts the sum to the numeric columns. Without it the
# string 'key' column is either dropped with a warning (older pandas) or
# concatenated per group (newer pandas).
print(df.groupby(L).sum(numeric_only=True))

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
   data1  data2
0      7     17
1      4      3
2      4      7


  print(df.groupby(L).sum())


In [33]:
print(df)
# Group by the 'key' Series itself (instead of the column name) and sum
print(df.groupby(df['key']).sum())

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
     data1  data2
key              
A        3      8
B        5      7
C        7     12


In [37]:
# Move 'key' into the index so the grouping below works on index labels
df2 = df.set_index('key')
# Dictionary mapping each index label to the group it belongs to
mapping = {'A' : 'vowel', 'B' : 'consonant', 'C' : 'consonant'}

print(df2)
print(df2.groupby(mapping).sum())

     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
           data1  data2
key                    
consonant     12     19
vowel          3      8


In [38]:
print(df2)
# Group by a function: str.lower is applied to each index label and the
# lower-cased label becomes the group key
print(df2.groupby(str.lower).mean())

     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
     data1  data2
key              
a      1.5    4.0
b      2.5    3.5
c      3.5    6.0


In [39]:
# Combine two keys (lower-cased index label and the mapping) into a
# two-level group index
df2.groupby([str.lower, mapping]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key,key,Unnamed: 2_level_1,Unnamed: 3_level_1
a,vowel,1.5,4.0
b,consonant,2.5,3.5
c,consonant,3.5,6.0


In [40]:
# Grouping example: number of planets per discovery method and decade

# Label every row with the decade of its 'year', e.g. 1995 -> '1990s'
decade = (10 * (planets['year'] // 10)).astype(str) + 's'
decade = decade.rename('decade')

# Sum 'number' for each (method, decade) pair, pivot the decades into
# columns, and fill the combinations that never occur with 0
decade_summary = (
    planets.groupby(['method', decade])['number']
    .sum()
    .unstack()
    .fillna(0)
)

print(decade_summary)  # show the aggregated table

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0
