## 文本格式数据的读写

pandas的解析函数：
- read_csv
- read_table
- ...

In [12]:
import pandas as pd
import numpy as np

In [13]:
df = pd.read_csv('examples/ex1.csv')
print(df) #用read_csv()读取csv文件
print(pd.read_table('examples/ex1.csv', sep=',')) #用read_table()读取csv文件，并指定分隔符
print(pd.read_csv('examples/ex2.csv', header=None)) #没有列名时，header=None
print(pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])) #指定列名
names = ['a', 'b', 'c', 'd', 'message']
print(pd.read_csv('examples/ex2.csv', names=names, index_col='message')) #指定列名，并将message列作为索引

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
   0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
         a   b   c   d
message               
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12


In [14]:
parsed = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2'])
print(parsed) #指定多列作为索引
print(list(open('examples/ex3.txt'))) #逐行读取文本文件
result = pd.read_table('examples/ex3.txt', sep='\s+') #用正则表达式作为分隔符读取文本文件，\s+表示多个空格
print(result)

           value1  value2
key1 key2                
one  a          1       2
     b          3       4
     c          5       6
     d          7       8
two  a          9      10
     b         11      12
     c         13      14
     d         15      16
['            A         B         C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb  0.927272  0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382  1.100491\n']
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


  result = pd.read_table('examples/ex3.txt', sep='\s+') #用正则表达式作为分隔符读取文本文件，\s+表示多个空格


In [15]:
print(pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])) #跳过指定行读取csv文件
results = pd.read_csv('examples/ex5.csv')
print(results)
print(results.isnull()) #检查缺失值
results = pd.read_csv('examples/ex5.csv', na_values=['NULL']) #将指定值识别为NaN
print(results)
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
print(pd.read_csv('examples/ex5.csv', na_values=sentinels)) #为不同列指定不同的缺失值标记

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
   something      a      b      c      d  message
0      False  False  False  False  False     True
1      False  False  False   True  False    False
2      False  False  False  False  False    False
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       NaN  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN


### 分块读入文本文件

In [16]:
pd.options.display.max_rows = 10 #设置显示的最大行数
result = pd.read_csv('examples/ex6.csv')
print(result)
print(pd.read_csv('examples/ex6.csv', nrows=5)) #只读取前5行

           one       two     three      four key
0     0.467976 -0.038649 -0.295344 -1.824726   L
1    -0.358893  1.404453  0.704965 -0.200638   B
2    -0.501840  0.659254 -0.421691 -0.057688   G
3     0.204886  1.074134  1.388361 -0.982404   R
4     0.354628 -0.133116  0.283763 -0.837063   Q
...        ...       ...       ...       ...  ..
9995  2.311896 -0.417070 -1.409599 -0.515821   L
9996 -0.479893 -0.650419  0.745152 -0.646038   E
9997  0.523331  0.787112  0.486066  1.093156   K
9998 -0.362559  0.598894 -1.843201  0.887292   G
9999 -0.096376 -1.012999 -0.657431 -0.573315   0

[10000 rows x 5 columns]
        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q


In [17]:
chunks = pd.read_csv('examples/ex6.csv', chunksize=1000) #分块读取大文件，每次读取1000行
print(chunks) #返回一个TextFileReader对象，可以迭代
chuner = pd.read_csv('examples/ex6.csv', chunksize=1000)
tot = pd.Series([])
for piece in chuner:
	tot = tot.add(piece['key'].value_counts(), fill_value=0) #统计每个块中key列的值的出现次数，并累加

tot = tot.sort_values(ascending=False) #按出现次数排序
print(tot[:10]) #显示出现次数最多的前10个值

<pandas.io.parsers.readers.TextFileReader object at 0x7f6a6899fec0>
key
E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
dtype: object


### 将数据写入文本格式

In [18]:
data = pd.read_csv('examples/ex5.csv')
print(data)
data.to_csv('examples/out.csv') #将DataFrame写入csv格式的字符串文件
import sys
data.to_csv(sys.stdout, sep='|') #将DataFrame写入csv格式的文件，指定分隔符
data.to_csv(sys.stdout, na_rep='NULL') #将NaN值写为指定字符串
data.to_csv(sys.stdout, index=False, header=False) #不写入行索引和列名
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c']) #只写入指定的列
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
print(ts)
ts.to_csv('examples/tseries.csv') #将时间序列写入csv文件

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
a,b,c
1,2,3.0
5,6,
9,10,11.0
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int64


### 使用分隔格式

In [19]:
import csv
f = open('examples/ex7.csv')
reader = csv.reader(f) #使用csv模块读取csv文件
for line in reader:
	print(line)
with open('examples/ex7.csv') as f:
	lines = list(csv.reader(f)) #将csv文件读取为列表
header, values = lines[0], lines[1:] #分离表头和数据
data_dict = {h: v for h, v in zip(header, zip(*values))} #将数据转换为字典
print(data_dict)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}


### JSON数据

In [20]:
obj = """
{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pet": "Zuko"}, {"name": "Katie", "age": 38, "pet": "Cisco"}]}
"""
import json
result = json.loads(obj) #将JSON字符串解析为Python字典
print(result)
asjson = json.dumps(result) #将Python字典转换为JSON字符串
print(asjson)
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age', 'pet']) #将字典中的列表转换为DataFrame
print(siblings)
data = pd.read_json('examples/example.json') #从JSON文件读取数据
print(data)
print(data.to_json()) #将DataFrame转换为JSON字符串
print(data.to_json(orient='records')) #将DataFrame转换为JSON字符串，按记录格式

{'name': 'Wes', 'places_lived': ['United States', 'Spain', 'Germany'], 'pet': None, 'siblings': [{'name': 'Scott', 'age': 30, 'pet': 'Zuko'}, {'name': 'Katie', 'age': 38, 'pet': 'Cisco'}]}
{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pet": "Zuko"}, {"name": "Katie", "age": 38, "pet": "Cisco"}]}
    name  age    pet
0  Scott   30   Zuko
1  Katie   38  Cisco
   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


### XML和HTML：网络抓取

In [21]:
tables = pd.read_html('examples/fdic_failed_bank_list.html') #从HTML文件中提取所有表格
print(len(tables)) #返回一个列表，包含所有提取的表格
failures = tables[0]
print(failures.head()) #显示第一个表格的前5行
close_dates = pd.to_datetime(failures['Closing Date']) #将字符串转换为日期时间格式
print(close_dates.dt.year.value_counts()) #统计每年关闭的银行数量
print(failures['City'].value_counts()) #统计每个城市关闭的银行数量

1
                      Bank Name             City  ST   CERT  \
0                   Allied Bank         Mulberry  AR     91   
1  The Woodbury Banking Company         Woodbury  GA  11297   
2        First CornerStone Bank  King of Prussia  PA  35312   
3            Trust Company Bank          Memphis  TN   9956   
4    North Milwaukee State Bank        Milwaukee  WI  20364   

                 Acquiring Institution        Closing Date       Updated Date  
0                         Today's Bank  September 23, 2016  November 17, 2016  
1                          United Bank     August 19, 2016  November 17, 2016  
2  First-Citizens Bank & Trust Company         May 6, 2016  September 6, 2016  
3           The Bank of Fayette County      April 29, 2016  September 6, 2016  
4  First-Citizens Bank & Trust Company      March 11, 2016      June 16, 2016  
Closing Date
2010    157
2009    140
2011     92
2012     51
2008     25
       ... 
2004      4
2001      4
2007      3
2003      3
2000  

#### 使用ixml.objectify解析XML

In [22]:
#待补充

## 二进制格式

In [None]:
import websocket, json, time, jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

room_id = "21414905"          # 换成你的直播间号
danmu_list = []             # 实时弹幕池

# ---------- 1. 连上弹幕服务器 ----------
def on_open(ws):
    # 官方握手包格式
    auth = {"uid": 0, "roomid": int(room_id), "protover": 3, "platform": "web"}
    ws.send(json.dumps(auth).encode())

def on_message(ws, msg):
    # 只提取普通弹幕
    if "DANMU_MSG" in str(msg):
        info = json.loads(msg)
        dm = info["info"][1]          # 弹幕文本
        danmu_list.append(dm)
        print(dm)                     # 终端实时打印

# ---------- 2. 每 30 秒画一次图 ----------
def plot():
    while True:
        time.sleep(30)
        if not danmu_list:
            continue
        text = " ".join(danmu_list)
        words = jieba.lcut(text)

        # 词云
        wc = WordCloud(font_path="simhei.ttf", background_color="white",
                       width=600, height=400).generate(" ".join(words))
        plt.subplot(1, 2, 1)
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off"); plt.title("弹幕词云")

        # 折线图：每分钟弹幕量
        plt.subplot(1, 2, 2)
        bucket = [i//60 for i in range(len(danmu_list))]
        counter = Counter(bucket)
        plt.plot(counter.keys(), counter.values(), marker="o")
        plt.title("每分钟弹幕数"); plt.xlabel("分钟"); plt.ylabel("条数")

        plt.tight_layout(); plt.pause(0.1)   # 实时刷新
        plt.savefig("danmu_live.png")        # 同时保存截图

# ---------- 3. 启动 ----------
if __name__ == "__main__":
    import threading
    threading.Thread(target=plot, daemon=True).start()

    url = "wss://broadcastlv.chat.bilibili.com/sub"
    ws = websocket.WebSocketApp(url,
                                on_open=on_open,
                                on_message=on_message)
    ws.run_forever()