## 하이브 와 연결 후 테이블 조회 및 간단한 `DataFrame` 조작

In [1]:
import pandas as pd
from pyhive import hive
import re

In [46]:
# 하이브 연결
cursor = hive.connect('localhost').cursor()

In [47]:
cursor

<pyhive.hive.Cursor at 0x2107db54760>

In [48]:
# 제약바이오주 의 2019년 1월 4일 주가 조회

sql = """
SELECT s.stk_cd ,s.stk_nm ,s.sec_nm ,s.ex_cd
		,hd.dt ,hd.o_prc ,hd.l_prc ,hd.h_prc ,hd.c_prc ,hd.vol 
		,fy.fin_itm_val 
FROM stock s 
inner join history_dt hd on (s.stk_cd=hd.stk_cd)
inner join finance_y fy on (s.stk_cd=fy.stk_cd)
WHERE s.sec_nm ='제약바이오'
AND s.ex_cd ='KP'
AND hd.dt ='2019-01-04'
AND fy.fin_itm_nm ='주당순이익'
AND fy.yy ='2018'
"""

In [49]:
cursor.execute(sql,encoding='utf8')

In [50]:
# 쿼리를 실행하고 결과를 result에 할당
result = cursor.fetchall()

In [52]:
# result

In [53]:
cursor.description

[('s.stk_cd', 'STRING_TYPE', None, None, None, None, True),
 ('s.stk_nm', 'STRING_TYPE', None, None, None, None, True),
 ('s.sec_nm', 'STRING_TYPE', None, None, None, None, True),
 ('s.ex_cd', 'STRING_TYPE', None, None, None, None, True),
 ('hd.dt', 'DATE_TYPE', None, None, None, None, True),
 ('hd.o_prc', 'DECIMAL_TYPE', None, None, None, None, True),
 ('hd.l_prc', 'DECIMAL_TYPE', None, None, None, None, True),
 ('hd.h_prc', 'DECIMAL_TYPE', None, None, None, None, True),
 ('hd.c_prc', 'DECIMAL_TYPE', None, None, None, None, True),
 ('hd.vol', 'DECIMAL_TYPE', None, None, None, None, True),
 ('fy.fin_itm_val', 'DECIMAL_TYPE', None, None, None, None, True)]

In [18]:
desc = cursor.description
# desc

In [19]:
# 리스트의 행 열 방향을 반대로 바꿈
list(zip(*desc))

[('s.stk_cd',
  's.stk_nm',
  's.sec_nm',
  's.ex_cd',
  'hd.dt',
  'hd.o_prc',
  'hd.l_prc',
  'hd.h_prc',
  'hd.c_prc',
  'hd.vol',
  'fy.fin_itm_val'),
 ('STRING_TYPE',
  'STRING_TYPE',
  'STRING_TYPE',
  'STRING_TYPE',
  'DATE_TYPE',
  'DECIMAL_TYPE',
  'DECIMAL_TYPE',
  'DECIMAL_TYPE',
  'DECIMAL_TYPE',
  'DECIMAL_TYPE',
  'DECIMAL_TYPE'),
 (None, None, None, None, None, None, None, None, None, None, None),
 (None, None, None, None, None, None, None, None, None, None, None),
 (None, None, None, None, None, None, None, None, None, None, None),
 (None, None, None, None, None, None, None, None, None, None, None),
 (True, True, True, True, True, True, True, True, True, True, True)]

In [20]:
# 리스트의 0번째 열, column 리턴 => 컬럼명들
list(zip(*desc))[0]

('s.stk_cd',
 's.stk_nm',
 's.sec_nm',
 's.ex_cd',
 'hd.dt',
 'hd.o_prc',
 'hd.l_prc',
 'hd.h_prc',
 'hd.c_prc',
 'hd.vol',
 'fy.fin_itm_val')

In [21]:
col_name = list(zip(*desc))[0]

In [44]:
# 리스트컴프리헨션을 통해 각 컬럼명 대문자로 변환
col_name =[x.upper() for x in col_name]
col_name

['S.STK_CD',
 'S.STK_NM',
 'S.SEC_NM',
 'S.EX_CD',
 'HD.DT',
 'HD.O_PRC',
 'HD.L_PRC',
 'HD.H_PRC',
 'HD.C_PRC',
 'HD.VOL',
 'FY.FIN_ITM_VAL']

In [54]:
# re.sub() : 특정 문자를 찾아서 삭제
# \S :문자열
# \. : .
# . 앞의 문자열을 찾아서 삭제

col_name = [re.sub('\S*\.', "", x) for x in col_name]
col_name

['STK_CD',
 'STK_NM',
 'SEC_NM',
 'EX_CD',
 'DT',
 'O_PRC',
 'L_PRC',
 'H_PRC',
 'C_PRC',
 'VOL',
 'FIN_ITM_VAL']

In [55]:
df = pd.DataFrame(result)
df.columns = col_name
df

Unnamed: 0,STK_CD,STK_NM,SEC_NM,EX_CD,DT,O_PRC,L_PRC,H_PRC,C_PRC,VOL,FIN_ITM_VAL
0,20,동화약품,제약바이오,KP,2019-01-04,9100.0,9100.0,9700.0,9530.0,115336.0,365.0
1,220,유유제약,제약바이오,KP,2019-01-04,10050.0,9970.0,10300.0,10200.0,82844.0,639.0
2,230,일동홀딩스,제약바이오,KP,2019-01-04,12116.0,11830.0,12259.0,12211.0,5895.0,-363.0
3,520,삼일제약,제약바이오,KP,2019-01-04,19900.0,19650.0,20550.0,20550.0,38394.0,-1621.0
4,1630,종근당홀딩스,제약바이오,KP,2019-01-04,58600.0,57800.0,59800.0,58400.0,3773.0,5481.0
5,2390,한독,제약바이오,KP,2019-01-04,28150.0,27400.0,28950.0,28000.0,151657.0,590.0
6,2720,국제약품,제약바이오,KP,2019-01-04,4070.0,4070.0,4395.0,4275.0,50009.0,128.0
7,3000,부광약품,제약바이오,KP,2019-01-04,20670.0,20583.0,21588.0,21413.0,391200.0,3019.0
8,3060,에이프로젠제약,제약바이오,KP,2019-01-04,1863.0,1765.0,1883.0,1844.0,477730.0,-12.0
9,3220,대원제약,제약바이오,KP,2019-01-04,13865.0,13685.0,14000.0,13955.0,29095.0,1263.0


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   STK_CD       25 non-null     object
 1   STK_NM       25 non-null     object
 2   SEC_NM       25 non-null     object
 3   EX_CD        25 non-null     object
 4   DT           25 non-null     object
 5   O_PRC        25 non-null     object
 6   L_PRC        25 non-null     object
 7   H_PRC        25 non-null     object
 8   C_PRC        25 non-null     object
 9   VOL          25 non-null     object
 10  FIN_ITM_VAL  25 non-null     object
dtypes: object(11)
memory usage: 2.3+ KB


In [59]:
df["DT"] = pd.to_datetime(df["DT"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   STK_CD       25 non-null     object        
 1   STK_NM       25 non-null     object        
 2   SEC_NM       25 non-null     object        
 3   EX_CD        25 non-null     object        
 4   DT           25 non-null     datetime64[ns]
 5   O_PRC        25 non-null     object        
 6   L_PRC        25 non-null     object        
 7   H_PRC        25 non-null     object        
 8   C_PRC        25 non-null     object        
 9   VOL          25 non-null     object        
 10  FIN_ITM_VAL  25 non-null     object        
dtypes: datetime64[ns](1), object(10)
memory usage: 2.3+ KB


In [80]:
# o_prc 컬럼부터 마지막  컬럼까지 숫자형으로 변환
df.loc[  : , "O_PRC": ] = df.loc[  : , "O_PRC": ].astype("int64")

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   STK_CD       25 non-null     object        
 1   STK_NM       25 non-null     object        
 2   SEC_NM       25 non-null     object        
 3   EX_CD        25 non-null     object        
 4   DT           25 non-null     datetime64[ns]
 5   O_PRC        25 non-null     object        
 6   L_PRC        25 non-null     object        
 7   H_PRC        25 non-null     object        
 8   C_PRC        25 non-null     object        
 9   VOL          25 non-null     object        
 10  FIN_ITM_VAL  25 non-null     object        
dtypes: datetime64[ns](1), object(10)
memory usage: 2.3+ KB


In [88]:
df["STK_NM"].head(50)

0        동화약품
1        유유제약
2       일동홀딩스
3        삼일제약
4      종근당홀딩스
5          한독
6        국제약품
7        부광약품
8     에이프로젠제약
9        대원제약
10       보령제약
11      우리들제약
12     녹십자홀딩스
13        녹십자
14       일양약품
15       광동제약
16    한올바이오파마
17       신풍제약
18       셀트리온
19       이연제약
20       한미약품
21      동아에스티
22        종근당
23     JW생명과학
24       일동제약
Name: STK_NM, dtype: object

In [89]:
df.rename(columns={"FIN_ITM_VAL" : "주당순이익"}, inplace=True)

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   STK_CD  25 non-null     object        
 1   STK_NM  25 non-null     object        
 2   SEC_NM  25 non-null     object        
 3   EX_CD   25 non-null     object        
 4   DT      25 non-null     datetime64[ns]
 5   O_PRC   25 non-null     object        
 6   L_PRC   25 non-null     object        
 7   H_PRC   25 non-null     object        
 8   C_PRC   25 non-null     object        
 9   VOL     25 non-null     object        
 10  주당순이익   25 non-null     object        
dtypes: datetime64[ns](1), object(10)
memory usage: 2.3+ KB


In [92]:
df["PER"] = df["C_PRC"] / df["주당순이익"]
df

Unnamed: 0,STK_CD,STK_NM,SEC_NM,EX_CD,DT,O_PRC,L_PRC,H_PRC,C_PRC,VOL,주당순이익,PER
0,20,동화약품,제약바이오,KP,2019-01-04,9100,9100,9700,9530,115336,365,26.109589
1,220,유유제약,제약바이오,KP,2019-01-04,10050,9970,10300,10200,82844,639,15.962441
2,230,일동홀딩스,제약바이오,KP,2019-01-04,12116,11830,12259,12211,5895,-363,-33.639118
3,520,삼일제약,제약바이오,KP,2019-01-04,19900,19650,20550,20550,38394,-1621,-12.67736
4,1630,종근당홀딩스,제약바이오,KP,2019-01-04,58600,57800,59800,58400,3773,5481,10.65499
5,2390,한독,제약바이오,KP,2019-01-04,28150,27400,28950,28000,151657,590,47.457627
6,2720,국제약품,제약바이오,KP,2019-01-04,4070,4070,4395,4275,50009,128,33.398438
7,3000,부광약품,제약바이오,KP,2019-01-04,20670,20583,21588,21413,391200,3019,7.092746
8,3060,에이프로젠제약,제약바이오,KP,2019-01-04,1863,1765,1883,1844,477730,-12,-153.666667
9,3220,대원제약,제약바이오,KP,2019-01-04,13865,13685,14000,13955,29095,1263,11.049089


In [93]:
df[["STK_CD", "STK_NM", "PER"]]

Unnamed: 0,STK_CD,STK_NM,PER
0,20,동화약품,26.109589
1,220,유유제약,15.962441
2,230,일동홀딩스,-33.639118
3,520,삼일제약,-12.67736
4,1630,종근당홀딩스,10.65499
5,2390,한독,47.457627
6,2720,국제약품,33.398438
7,3000,부광약품,7.092746
8,3060,에이프로젠제약,-153.666667
9,3220,대원제약,11.049089
