## 6장. 데이터 로딩과 저장, 파일형식

#### pandas에서 데이터 읽어오기
-----------------------------------  
read_csv    : 기본 구분자는 쉼표 ,  
read_excel  : 엑셀파일 읽어오기    
read_table  : 기본 구분자는 탭  \t  
read_hdf    : pandas에서 저장한 HDF5 파일 불러오기   
read_json   : JSON 파일 읽어오기   
read_sql    : sql 쿼리결과를 DataFrame 형식으로 읽어오기   
read_html   : html 문서내의 모든 테이블의 데이터를 읽어오기   

In [1]:
import pandas as pd 
import numpy as np 

In [6]:
# Load the comma-separated example file using read_csv's default settings.
ex1_path = "examples/ex1.csv"
df = pd.read_csv(ex1_path)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# When a file has no column names they can be specified explicitly; this call
# does not do so, so the first data line is taken as the header.
df = pd.read_csv(filepath_or_buffer="examples/ex2.csv")
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [8]:
# Supply the column names explicitly instead of taking them from the file.
col_name = list("abcd") + ["message"]
df = pd.read_csv("examples/ex2.csv", names=col_name)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [9]:
# Name the columns explicitly and use the 'message' column as the index.
col_name = [*"abcd", "message"]
read_options = {"names": col_name, "index_col": "message"}
df = pd.read_csv("examples/ex2.csv", **read_options)
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [21]:
# Whitespace-delimited data should be loaded with sep='\s+'. This call omits
# it, so with read_table's default tab separator each line stays unsplit.
df1 = pd.read_table(filepath_or_buffer='examples/ex3.txt')
df1

  


Unnamed: 0,A B C
0,aaa -0.264438 -1.026059 -0.619500
1,bbb 0.927272 0.302904 -0.032399
2,ccc -0.264273 -0.386314 -0.217601
3,ddd -0.871858 -0.348382 1.100491


In [22]:
# Split fields on runs of whitespace. The pattern is written as a raw string
# so the backslash in \s is passed through verbatim and Python does not emit
# an invalid-escape-sequence warning for it.
df2 = pd.read_table('examples/ex3.txt', sep=r'\s+')
df2

  """Entry point for launching an IPython kernel.


Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [12]:
# Unneeded lines can be excluded with the skiprows option. This baseline read
# does not use it, so the file's comment lines end up in the result.
df = pd.read_csv(filepath_or_buffer='examples/ex4.csv')
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,# hey!
a,b,c,d,message
# just wanted to make things more difficult for you,,,,
# who reads CSV files with computers,anyway?,,,
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [14]:
# Skip the rows (file lines) at 0-based positions 0, 2 and 3.
rows_to_skip = [0, 2, 3]
df = pd.read_csv('examples/ex4.csv', skiprows=rows_to_skip)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [25]:
# Read only the first N rows of the file (N = 5 here).
row_limit = 5
df = pd.read_csv('examples/ex6.csv', nrows=row_limit)
df

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [28]:
# The more common approach: load the whole file, then preview the leading
# rows with head() (the first 10 rows here).
df = pd.read_csv(filepath_or_buffer='examples/ex6.csv')
df.head(n=10)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.63783,2.172201,G


In [30]:
# Exporting data to a CSV file: first load the frame that will be written out.
ex5_path = 'examples/ex5.csv'
df = pd.read_csv(ex5_path)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [34]:
# Write the frame to CSV without the index column but with the header row.
df.to_csv(path_or_buf='examples/export_ex5.csv', header=True, index=False)

In [48]:
# Read a JSON file.
# example.json = [{"a": 1, "b": 2, "c": 3},  {"a": 4, "b": 5, "c": 6},  {"a": 7, "b": 8, "c": 9}]

json_path = 'examples/example.json'
df = pd.read_json(json_path)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [39]:
# Read an HTML file: read_html picks up all tabular data found inside
# <table> tags and returns the tables as a list.
tables = pd.read_html(io='examples/fdic_failed_bank_list.html')
tables

[                                  Bank Name             City  ST   CERT  \
 0                               Allied Bank         Mulberry  AR     91   
 1              The Woodbury Banking Company         Woodbury  GA  11297   
 2                    First CornerStone Bank  King of Prussia  PA  35312   
 3                        Trust Company Bank          Memphis  TN   9956   
 4                North Milwaukee State Bank        Milwaukee  WI  20364   
 5                    Hometown National Bank         Longview  WA  35156   
 6                       The Bank of Georgia   Peachtree City  GA  35259   
 7                              Premier Bank           Denver  CO  34112   
 8                            Edgebrook Bank          Chicago  IL  57772   
 9                    Doral Bank  En Espanol         San Juan  PR  32102   
 10        Capitol City Bank & Trust Company          Atlanta  GA  33938   
 11                  Highland Community Bank          Chicago  IL  20290   
 12         

In [43]:
# Importing and exporting Excel data.
# The sheet name ('Sheet1') may be omitted.
df = pd.read_excel("examples/ex1.xlsx", sheet_name='Sheet1')
df

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [45]:
# Save the frame to an Excel workbook, dropping the index and keeping the header.
df.to_excel(excel_writer="examples/export_ex1.xlsx", header=True, index=False)