# Pandas 
 - R의 데이터프레임(data.frame)과 같은 형식으로 데이터를 다룰 수 있게 해주는 라이브러리
 - Series는 R의 vector와, DataFrame은 R의 data.frame과 유사
 - <a href='https://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html'>R과 비교 </a>

In [2]:
import pandas as pd

## Series

In [3]:
# Five-element integer Series; no index is given, so pandas assigns 0..4.
s = pd.Series(data=list(range(1, 6)))
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
s[0]

1

In [5]:
s[2:5]

2    3
3    4
4    5
dtype: int64

In [6]:
s % 2 == 0

0    False
1     True
2    False
3     True
4    False
dtype: bool

### Boolean

In [8]:
s[s % 2 == 0]

1    2
3    4
dtype: int64

### Date

In [9]:
# Five consecutive dates starting at 2017-08-29 (default daily frequency).
d = pd.date_range(start='20170829', periods=5)
d

DatetimeIndex(['2017-08-29', '2017-08-30', '2017-08-31', '2017-09-01',
               '2017-09-02'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# freq='2MS' steps two month-starts at a time; the range begins at the first
# month start on or after 2017-08-29, i.e. 2017-09-01.
pd.date_range('20170829', periods=5, freq='2MS')

DatetimeIndex(['2017-09-01', '2017-11-01', '2018-01-01', '2018-03-01',
               '2018-05-01'],
              dtype='datetime64[ns]', freq='2MS')

In [11]:
# Adding a DateOffset shifts every date by 10 days; the shifted index no
# longer reports a frequency (freq=None in the output).
pd.date_range('20170829', periods=5, freq='2MS') + pd.DateOffset(days=10)

DatetimeIndex(['2017-09-11', '2017-11-11', '2018-01-11', '2018-03-11',
               '2018-05-11'],
              dtype='datetime64[ns]', freq=None)

## DataFrame

In [12]:
df = pd.DataFrame({'date':d, 'num':s})
df

Unnamed: 0,date,num
0,2017-08-29,1
1,2017-08-30,2
2,2017-08-31,3
3,2017-09-01,4
4,2017-09-02,5


In [13]:
df.head(3)

Unnamed: 0,date,num
0,2017-08-29,1
1,2017-08-30,2
2,2017-08-31,3


In [14]:
df.columns

Index(['date', 'num'], dtype='object')

In [15]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [16]:
df.values

array([[Timestamp('2017-08-29 00:00:00'), 1],
       [Timestamp('2017-08-30 00:00:00'), 2],
       [Timestamp('2017-08-31 00:00:00'), 3],
       [Timestamp('2017-09-01 00:00:00'), 4],
       [Timestamp('2017-09-02 00:00:00'), 5]], dtype=object)

In [17]:
df.dtypes

date    datetime64[ns]
num              int64
dtype: object

## Another way to make DataFrame 

In [18]:
# Row-oriented construction: each dict is one row and its keys become the
# column labels.
d1 = pd.DataFrame(data=[dict(A=1, B=2), dict(A=3, B=4)])

In [19]:
d1

Unnamed: 0,A,B
0,1,2
1,3,4


## Change Column and Row Names

In [20]:
df.columns = ["날짜","번호"]

In [21]:
df

Unnamed: 0,날짜,번호
0,2017-08-29,1
1,2017-08-30,2
2,2017-08-31,3
3,2017-09-01,4
4,2017-09-02,5


In [22]:
df.index = ['A', 'B', 'C', 'D', 'E']
df

Unnamed: 0,날짜,번호
A,2017-08-29,1
B,2017-08-30,2
C,2017-08-31,3
D,2017-09-01,4
E,2017-09-02,5


In [23]:
# Returns a new frame indexed by '날짜'; the result is not assigned, so df
# itself keeps its A-E index in the cells that follow.
df.set_index('날짜')

Unnamed: 0_level_0,번호
날짜,Unnamed: 1_level_1
2017-08-29,1
2017-08-30,2
2017-08-31,3
2017-09-01,4
2017-09-02,5


In [24]:
df.sort_values(by='날짜',ascending=False)

Unnamed: 0,날짜,번호
E,2017-09-02,5
D,2017-09-01,4
C,2017-08-31,3
B,2017-08-30,2
A,2017-08-29,1


In [25]:
df['날짜']

A   2017-08-29
B   2017-08-30
C   2017-08-31
D   2017-09-01
E   2017-09-02
Name: 날짜, dtype: datetime64[ns]

In [26]:
df[['번호', '날짜']]

Unnamed: 0,번호,날짜
A,1,2017-08-29
B,2,2017-08-30
C,3,2017-08-31
D,4,2017-09-01
E,5,2017-09-02


## Select Row

In [27]:
df.loc['A']

날짜    2017-08-29 00:00:00
번호                      1
Name: A, dtype: object

In [28]:
df.loc[['A','B']]

Unnamed: 0,날짜,번호
A,2017-08-29,1
B,2017-08-30,2


In [29]:
# Label-based slice: the end label 'C' is included in the result.
df.loc['A':'C']

Unnamed: 0,날짜,번호
A,2017-08-29,1
B,2017-08-30,2
C,2017-08-31,3


In [30]:
df.loc['A','날짜']

Timestamp('2017-08-29 00:00:00')

In [31]:
df.at['A','날짜'] # .at is faster when accessing a single value

Timestamp('2017-08-29 00:00:00')

In [32]:
%%timeit
df.at['A','날짜'] 

The slowest run took 8.99 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.25 µs per loop


In [33]:
%%timeit
df.loc['A', '날짜']

The slowest run took 6.74 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 12.5 µs per loop


## Select Rows by Integer Position

In [34]:
df.iloc[0]

날짜    2017-08-29 00:00:00
번호                      1
Name: A, dtype: object

In [35]:
df.iloc[0, 0]

Timestamp('2017-08-29 00:00:00')

In [36]:
df.iat[0, 0]

Timestamp('2017-08-29 00:00:00')

## Max, Min value index

In [37]:
df['번호'].idxmin()

'A'

In [38]:
df['번호'].idxmax()

'E'

## Boolean Indexing

In [39]:
df[df['날짜'] > pd.Timestamp('2017-08-31')]

Unnamed: 0,날짜,번호
D,2017-09-01,4
E,2017-09-02,5


### OR
 - OR는 |로 나타냄
 - Python에서 `|`는 비교 연산자(`<`, `>` 등)보다 우선순위가 높기 때문에 각 조건을 괄호로 묶어야 함
  - 예: `(df['날짜'] < pd.Timestamp('2017-09-02')) | (df['번호'] < 4)`
  
### AND
 - AND는 &로 나타냄
 - 역시 괄호가 필요

In [40]:
(df['날짜'] < pd.Timestamp('2017-09-02')) & (df['번호'] < 4)

A     True
B     True
C     True
D    False
E    False
dtype: bool

### NOT
 - NOT은 ~로 나타냄

In [41]:
~(df['날짜'] < pd.Timestamp('2017-09-02'))

A    False
B    False
C    False
D    False
E     True
Name: 날짜, dtype: bool

### 포함관계

In [42]:
df['번호'].isin([1, 3, 5])

A     True
B    False
C     True
D    False
E     True
Name: 번호, dtype: bool

## Copy DataFrame

In [43]:
df2 = df.copy()
df2

Unnamed: 0,날짜,번호
A,2017-08-29,1
B,2017-08-30,2
C,2017-08-31,3
D,2017-09-01,4
E,2017-09-02,5


## Add Data

In [45]:
df['고객'] = [300,240,150,400,340]
df

Unnamed: 0,날짜,번호,고객
A,2017-08-29,1,300
B,2017-08-30,2,240
C,2017-08-31,3,150
D,2017-09-01,4,400
E,2017-09-02,5,340


## Change Data value

In [46]:
# .iat is purely positional: row 2 / column 2 is row 'C' of the '고객' column.
df.iat[2,2] = 10
df

Unnamed: 0,날짜,번호,고객
A,2017-08-29,1,300
B,2017-08-30,2,240
C,2017-08-31,3,10
D,2017-09-01,4,400
E,2017-09-02,5,340


In [47]:
# The mask is built from df, not df2. df2 was copied before '고객' was added
# to df, so this assignment creates the '고객' column in df2; rows where the
# mask is False receive no value and show up as missing in the output.
df2.loc[df['번호'] > 2, '고객'] = 0
df2

Unnamed: 0,날짜,번호,고객
A,2017-08-29,1,
B,2017-08-30,2,
C,2017-08-31,3,0.0
D,2017-09-01,4,0.0
E,2017-09-02,5,0.0


In [48]:
# The mask again comes from df; passing a list of column labels writes the
# same scalar into both '고객' and '번호' for every selected row.
df2.loc[df['번호'] < 4, ['고객', '번호']] = -1
df2

Unnamed: 0,날짜,번호,고객
A,2017-08-29,-1,-1.0
B,2017-08-30,-1,-1.0
C,2017-08-31,-1,-1.0
D,2017-09-01,4,0.0
E,2017-09-02,5,0.0


## 조건에 따라 다른 값 매기기

In [49]:
import numpy as np 

In [50]:
# Element-wise choice: '미래' where the date is after 2017-08-31, otherwise
# '과거'. The result is a plain NumPy array, not a Series.
np.where(df['날짜'] > pd.Timestamp('2017-08-31'),'미래','과거')

array(['과거', '과거', '과거', '미래', '미래'], 
      dtype='<U2')

## csv 파일 읽기

In [51]:
air = pd.read_csv('product_airtime.csv')

In [52]:
air.head()

Unnamed: 0,PRODUCT_NBR,ONAIR_DATE,ONAIR_START_TMS,ONAIR_END_TMS,ONAIR_MINS,HOST1,HOST2
0,P150028635,2015-01-02,2015-01-02 01:33:02,2015-01-02 01:58:20,25.32,,
1,P150028635,2015-01-02,2015-01-02 01:21:14,2015-01-02 01:32:48,11.58,,
2,P150028635,2015-01-01,2015-01-01 22:34:30,2015-01-01 22:58:17,23.8,,
3,P150028635,2015-01-01,2015-01-01 22:00:05,2015-01-01 22:33:02,32.97,,
4,P150028635,2015-01-01,2015-01-01 19:00:00,2015-01-01 19:00:01,0.03,,


In [53]:
air.shape

(63468, 7)

## 컬럼 통계

In [54]:
air.columns

Index(['PRODUCT_NBR', 'ONAIR_DATE', 'ONAIR_START_TMS', 'ONAIR_END_TMS',
       'ONAIR_MINS', 'HOST1', 'HOST2'],
      dtype='object')

In [55]:
air.ONAIR_MINS.sum()

240808.89999999999

In [56]:
air.ONAIR_MINS.mean()

3.7941781685258711

In [57]:
air.ONAIR_MINS.std()

5.1647709418247665

In [58]:
air.ONAIR_MINS.min()

0.02

In [59]:
air.ONAIR_MINS.max()

59.5

## 필터링

In [60]:
# Mean and standard deviation of the ONAIR_MINS column.
# NOTE: `s` rebinds the name that held the Series created earlier in this
# notebook; that Series is no longer reachable through `s` after this cell.
m = air['ONAIR_MINS'].mean()
s = air['ONAIR_MINS'].std()

In [61]:
# Keep only the rows whose ONAIR_MINS exceeds m + 10 * s.
air[air['ONAIR_MINS'].gt(m + 10 * s)]

Unnamed: 0,PRODUCT_NBR,ONAIR_DATE,ONAIR_START_TMS,ONAIR_END_TMS,ONAIR_MINS,HOST1,HOST2
1039,P150028603,2015-01-01,2015-01-01 08:01:29,2015-01-01 08:57:35,56.12,,
3532,P150028674,2015-02-15,2015-02-15 12:00:00,2015-02-15 12:58:04,58.08,,
6742,P150062392,2015-01-05,2015-01-05 00:01:21,2015-01-05 00:58:48,57.47,,
6743,P150062392,2015-01-04,2015-01-04 22:00:42,2015-01-04 22:58:56,58.25,,
6745,P150062392,2015-01-04,2015-01-04 20:00:47,2015-01-04 20:59:59,59.22,,
6755,P150062392,2015-01-04,2015-01-04 12:00:43,2015-01-04 12:58:42,58.00,,
6764,P150062392,2015-01-04,2015-01-04 21:01:20,2015-01-04 21:58:49,57.50,Wheeler Dan,
6766,P150062392,2015-01-04,2015-01-04 18:02:04,2015-01-04 18:59:59,57.93,Bauer Jill,
6767,P150062392,2015-01-04,2015-01-04 16:00:03,2015-01-04 16:59:32,59.50,Bauer Jill,
6775,P150062392,2015-01-04,2015-01-04 09:00:43,2015-01-04 09:58:42,58.00,Hughes Dan,


## DB에서 불러오기

In [62]:
from sqlalchemy import create_engine, text

In [63]:
engine = create_engine('sqlite:///example.db')

In [64]:
conn = engine.connect()

In [65]:
# Wrap raw SQL in text(): passing a plain string to Connection.execute() is
# deprecated in SQLAlchemy 1.4 and rejected in 2.0.
conn.execute(text('CREATE TABLE stocks (date text, price real)'))

<sqlalchemy.engine.result.ResultProxy at 0x25b68056f28>

In [66]:
# Wrap raw SQL in text(): passing a plain string to Connection.execute() is
# deprecated in SQLAlchemy 1.4 and rejected in 2.0.
conn.execute(text("INSERT INTO stocks (date, price) VALUES ('2017-08-01', 1000)"))
conn.execute(text("INSERT INTO stocks (date, price) VALUES ('2017-08-02', 2000)"))

<sqlalchemy.engine.result.ResultProxy at 0x25b68064080>

In [67]:
# Runs the query on the open connection and returns the rows as a DataFrame.
pd.read_sql('SELECT * from stocks', conn)

Unnamed: 0,date,price
0,2017-08-01,1000.0
1,2017-08-02,2000.0
