# 데이터 입출력

In [4]:
import numpy as np
import pandas as pd

다음 포맷 지원
- CSV
- Excel
- HTML
- JSON
- HDF5
- SAS
- STATA
- SQL

In [None]:
# %%writefile로 csv파일을 쉽게 만들 수 있다

In [2]:
%%writefile samples/sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Overwriting samples/sample1.csv


### CSV 파일 읽기

In [7]:
# c1, c2, c3를 알아서 header로
pd.read_csv('samples/sample1.csv')

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [8]:
%%writefile samples/sample2.csv
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing samples/sample2.csv


In [12]:
# Header가 없는 놈을 읽어옴. 그러면 첫 번째 행을 Header로
# 이게 싫으면 names = names 로 해서 따로 주던가
""" csv 읽어올 땐 columns = columns가 아니다! """
# 아니면 Header로 쓰지 말라고 header = None. 이러면 알아서 숫자로 줌
pd.read_csv('samples/sample2.csv', header = None)

Unnamed: 0,0,1,2
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [15]:
names = ['c1','c2','c3']
pd.read_csv('samples/sample2.csv', names = names)

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [17]:
    # 특정 컬럼을 인덱스로 쓰고 싶다? index_col로 내가 주면 됨
pd.read_csv('samples/sample1.csv', index_col='c1')

Unnamed: 0_level_0,c2,c3
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.11,one
2,2.22,two
3,3.33,three


### CSV 파일이 아니라면? (TXT 파일이면)

In [19]:
%%writefile samples/sample3.txt
c1        c2        c3        c4
0.179181 -1.538472  1.347553  0.43381
1.024209  0.087307 -1.281997  0.49265
0.417899 -2.002308  0.255245 -1.10515

Writing samples/sample3.txt


In [21]:
# seperator를 줘야 함
pd.read_table('samples/sample3.txt', sep='\s+')

Unnamed: 0,c1,c2,c3,c4
0,0.179181,-1.538472,1.347553,0.43381
1,1.024209,0.087307,-1.281997,0.49265
2,0.417899,-2.002308,0.255245,-1.10515


In [22]:
%%writefile samples/sample4.txt
파일 제목: sample4.txt
데이터 포맷의 설명:
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing samples/sample4.txt


In [36]:
# 따로 설명/헤더가 있다? 헤더 지우고 가져오려면 skiprows
pd.read_table('samples/sample4.txt', sep='\s+', skiprows=[0,1])

Unnamed: 0,"c1,","c2,",c3
0,1,"1.11,",one
1,2,"2.22,",two
2,3,"3.33,",three


In [32]:
%%writefile samples/sample5.csv
c1, c2, c3
1, 1.11, one
2, , two
누락, 3.33, three

Writing samples/sample5.csv


In [38]:
# '누락' 이라는 특정값을 없애고 싶다면 na_values
""" NaN값은 실수값에만 있는 것. 같은 열은 데이터타입이 같아야 하니까 """
""" 정수값 1, 2가 실수값으로 바뀜 """
df = pd.read_csv('samples/sample5.csv', na_values=['누락'])
df

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


### CSV 파일 출력(CSV 파일로 저장하기)

In [41]:
# samples5 와 똑같을 것
df.to_csv('samples/sample6.csv')

In [42]:
# sep 바꿔주기
df.to_csv('samples/sample7.csv', sep='|')

In [43]:
df.to_csv('samples/sample8.csv', na_rep='누락')
!cat sample8.csv

'cat'��(��) ���� �Ǵ� �ܺ� ����, ������ �� �ִ� ���α׷�, �Ǵ�
��ġ ������ �ƴմϴ�.


In [44]:
# 인덱스랑 헤더도 없애?
df.to_csv('samples/sample9.csv', index=False, header=False)

### 인터넷 상의 CSV파일 입력

In [45]:
df = pd.read_csv("https://raw.githubusercontent.com/datascienceschool/docker_rpython/master/data/titanic.csv")

In [50]:
df.head(15)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [51]:
# 끝을 보고싶다?
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q
