## 12장. 고급 Pandas

In [1]:
import numpy as np
import pandas as pd 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import seaborn as sns
# Configure seaborn for the notebook: 'whitegrid' style with the 'pastel' palette.
sns.set(style='whitegrid', palette="pastel")

In [5]:
# Build the sample data: a four-item fruit pattern repeated twice.
base_fruits = ['apple', 'orange', 'apple', 'apple']
fruits = base_fruits + base_fruits
N = len(fruits)

# Show the list first, then its length.
for shown in (fruits, N):
    print(shown)

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
8


In [7]:
# Build the sample basket table: one row per entry of `fruits`.
# The random columns come from a fixed-seed generator so that a fresh
# "Restart Kernel -> Run All" reproduces exactly the same table.
rng = np.random.RandomState(42)
df = pd.DataFrame({'basket_id': np.arange(N),
                   'fruits': fruits,
                   'count': rng.randint(3, 15, size=N),    # integers in [3, 15)
                   'weight': rng.uniform(0, 4, size=N)},   # floats in [0, 4)
                  # Pin the column order explicitly.
                  columns=['basket_id', 'fruits', 'count', 'weight'])
df

Unnamed: 0,basket_id,fruits,count,weight
0,0,apple,11,3.220996
1,1,orange,5,3.830039
2,2,apple,10,0.782415
3,3,apple,5,2.602933
4,4,apple,7,2.184617
5,5,orange,10,2.415614
6,6,apple,8,3.74103
7,7,apple,13,3.498229


In [8]:
# Summarise the frame: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
basket_id    8 non-null int32
fruits       8 non-null object
count        8 non-null int32
weight       8 non-null float64
dtypes: float64(1), int32(2), object(1)
memory usage: 320.0+ bytes


In [12]:
# Convert the object-typed `fruits` column to the category dtype
# (useful for saving memory, for plotting, etc.), then inspect the frame again.
df = df.astype({'fruits': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
basket_id    8 non-null int32
fruits       8 non-null category
count        8 non-null int32
weight       8 non-null float64
dtypes: category(1), float64(1), int32(2)
memory usage: 360.0 bytes


In [13]:
#### category 변수를 dummy로 변환하기 
cat_s = pd.Series(['a', 'b', 'c', 'd']*2, dtype = 'category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [14]:
# Build a dummy (indicator) DataFrame with get_dummies: one column per category.
# Rows 0 and 4 hold 'a', so column `a` is 1 (True) there and 0 (False) in every other row.
pd.get_dummies(cat_s)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


In [21]:
# Resampling time-series data at a fixed interval:
# start from 20 timestamps spaced one minute apart.
times = pd.date_range(start='2020-03-04', periods=20, freq='1min')
times

DatetimeIndex(['2020-03-04 00:00:00', '2020-03-04 00:01:00',
               '2020-03-04 00:02:00', '2020-03-04 00:03:00',
               '2020-03-04 00:04:00', '2020-03-04 00:05:00',
               '2020-03-04 00:06:00', '2020-03-04 00:07:00',
               '2020-03-04 00:08:00', '2020-03-04 00:09:00',
               '2020-03-04 00:10:00', '2020-03-04 00:11:00',
               '2020-03-04 00:12:00', '2020-03-04 00:13:00',
               '2020-03-04 00:14:00', '2020-03-04 00:15:00',
               '2020-03-04 00:16:00', '2020-03-04 00:17:00',
               '2020-03-04 00:18:00', '2020-03-04 00:19:00'],
              dtype='datetime64[ns]', freq='T')

In [24]:
# Pair each timestamp with a running integer value (0..19).
ts_columns = {'time_stamp': times,
              'values': np.arange(20)}
df = pd.DataFrame(ts_columns)
df

Unnamed: 0,time_stamp,values
0,2020-03-04 00:00:00,0
1,2020-03-04 00:01:00,1
2,2020-03-04 00:02:00,2
3,2020-03-04 00:03:00,3
4,2020-03-04 00:04:00,4
5,2020-03-04 00:05:00,5
6,2020-03-04 00:06:00,6
7,2020-03-04 00:07:00,7
8,2020-03-04 00:08:00,8
9,2020-03-04 00:09:00,9


In [33]:
# Use time_stamp as the index, then count the rows in each 3-minute bin.
df_by_time = df.set_index('time_stamp')
df_by_time.resample('3min').count()

Unnamed: 0_level_0,values
time_stamp,Unnamed: 1_level_1
2020-03-04 00:00:00,3
2020-03-04 00:03:00,3
2020-03-04 00:06:00,3
2020-03-04 00:09:00,3
2020-03-04 00:12:00,3
2020-03-04 00:15:00,3
2020-03-04 00:18:00,2
