## 使用內建功能讀取 txt 檔

# [教學目標]
- 示範以 Python 內建功能及 Pandas、NumPy、Pickle 讀取 / 寫入檔案的各種方式

# [範例重點]
- 讀取 txt 檔 (In[2], Out[2])
- 存取 json 檔 (In[4], In[5], In[7], In[8])
- 存取 npy 檔 (numpy專用檔, In[10], In[11]) 
- 存取 Pickle 檔 (In[12], In[13])

In [1]:
# Read the raw text file; each list element is one line, newline included.
with open("data/example.txt", 'r') as f:
    data = list(f)
print(data)

['id,sex,age,score\n', '001,F,20,77\n', '002,F,25,90\n', '003,M,22,80\n', '004,F,30,66\n', '005,M,40,60\n', '006,M,29,87']


## 將 txt 轉成 pandas dataframe

In [2]:
import pandas as pd

# Parse the text file into a list of rows: remove the newline ("\n") from
# each line, then split the line on commas into its fields.
with open("data/example.txt", 'r') as f:
    data = [raw_line.replace('\n', '').split(',') for raw_line in f]
data

[['id', 'sex', 'age', 'score'],
 ['001', 'F', '20', '77'],
 ['002', 'F', '25', '90'],
 ['003', 'M', '22', '80'],
 ['004', 'F', '30', '66'],
 ['005', 'M', '40', '60'],
 ['006', 'M', '29', '87']]

In [3]:
# Build the DataFrame from the parsed rows: the first row of `data` is the
# header, the remaining rows are the records.  The header is passed through
# `columns=` rather than assigned to `df.columns` afterwards, so a
# header-only file yields an empty frame with the right columns instead of
# raising a "Length mismatch" ValueError.
df = pd.DataFrame(data[1:], columns=data[0])
df

,id,sex,age,score
0,001,F,20,77
1,002,F,25,90
2,003,M,22,80
3,004,F,30,66
4,005,M,40,60
5,006,M,29,87

## 將資料轉成 json 檔後輸出
將 json 讀回來後，是否與我們原本想要存入的方式一樣? (以 id 為 key)

In [4]:
import json
# Write the DataFrame to JSON with the default layout (orient='columns'):
# column names become the outer keys and row labels the inner keys.
df.to_json(path_or_buf='data/example01.json', orient='columns')

In [5]:
# The export above uses the column name as the primary key and the row label
# as the secondary key; load the file back to inspect that structure.
with open('data/example01.json', 'r') as f:
    j1 = json.loads(f.read())
j1

{'age': {'0': '20', '1': '25', '2': '22', '3': '30', '4': '40', '5': '29'},
 'id': {'0': '001',
  '1': '002',
  '2': '003',
  '3': '004',
  '4': '005',
  '5': '006'},
 'score': {'0': '77', '1': '90', '2': '80', '3': '66', '4': '60', '5': '87'},
 'sex': {'0': 'F', '1': 'F', '2': 'M', '3': 'F', '4': 'M', '5': 'M'}}

In [6]:
# Make the 'id' column the row index so that each id can serve as the key
# when the frame is exported by row.
df = df.set_index('id')
df

id,sex,age,score
001,F,20,77
002,F,25,90
003,M,22,80
004,F,30,66
005,M,40,60
006,M,29,87


In [7]:
# Export again, this time keyed by row label: {row label: {column: value}}.
df.to_json(path_or_buf='data/example02.json', orient='index')

In [8]:
# Load the index-oriented JSON back to check how the keys are nested.
with open('data/example02.json', 'r') as f:
    j2 = json.loads(f.read())
j2

{'001': {'age': '20', 'score': '77', 'sex': 'F'},
 '002': {'age': '25', 'score': '90', 'sex': 'F'},
 '003': {'age': '22', 'score': '80', 'sex': 'M'},
 '004': {'age': '30', 'score': '66', 'sex': 'F'},
 '005': {'age': '40', 'score': '60', 'sex': 'M'},
 '006': {'age': '29', 'score': '87', 'sex': 'M'}}

## 將檔案存為 npy 檔
一個專門儲存 numpy array 的檔案格式
使用 npy 通常可以讓你更快讀取資料喔!  
[建議閱讀](https://towardsdatascience.com/why-you-should-start-using-npy-file-more-often-df2a13cc0161)

In [9]:
import numpy as np
# Convert the data rows (everything after the header row) into a NumPy array.
array = np.asarray(data[1:])
array

array([['001', 'F', '20', '77'],
       ['002', 'F', '25', '90'],
       ['003', 'M', '22', '80'],
       ['004', 'F', '30', '66'],
       ['005', 'M', '40', '60'],
       ['006', 'M', '29', '87']], dtype='<U3')

In [10]:
# Persist the array to disk in NumPy's .npy format.
np.save('data/example.npy', array)

In [11]:
# Load the array back from the .npy file to check the round trip.
array_back = np.load(file='data/example.npy')
array_back

array([['001', 'F', '20', '77'],
       ['002', 'F', '25', '90'],
       ['003', 'M', '22', '80'],
       ['004', 'F', '30', '66'],
       ['005', 'M', '40', '60'],
       ['006', 'M', '29', '87']], dtype='<U3')

## Pickle
存成 pickle 檔  
什麼都包，什麼都不奇怪的 [Pickle](https://docs.python.org/3/library/pickle.html)  
比如說 [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) 的資料集就是用 pickle 包的喔!

In [12]:
import pickle
# Serialize the parsed rows (`data`) to a pickle file; the file is opened in
# binary write mode because pickle produces bytes.
with open('data/example.pkl', 'wb') as f:
    pickle.dump(data, f)

In [13]:
# Read the pickle file back into Python objects.
# NOTE: only unpickle files from a trusted source -- loading a pickle can
# execute arbitrary code.
with open('data/example.pkl', 'rb') as f:
    pkl_data = pickle.loads(f.read())
pkl_data

[['id', 'sex', 'age', 'score'],
 ['001', 'F', '20', '77'],
 ['002', 'F', '25', '90'],
 ['003', 'M', '22', '80'],
 ['004', 'F', '30', '66'],
 ['005', 'M', '40', '60'],
 ['006', 'M', '29', '87']]