### 文本的输入输出

In [1]:
import sys
sys.path.append('../')

In [2]:
import re

In [3]:
# 了解即可
def parse(text):
    # 使用正则表达式去除标点符号和换行符
    text = re.sub(r'[^\w ]', ' ', text)
    
    # 转为小写
    text = text.lower()
    
    # 生成所有单词的列表
    word_list = text.split(' ')
    
    # 去除空白单词
    word_list = filter(None, word_list)
    
    # 生成单词和词频的字典
    word_cnt = {}
    for word in word_list:
        if word not in word_cnt:
            word_cnt[word] = 0
        word_cnt[word] += 1
    
    # 按照词频排序
    sorted_word_cnt = sorted(word_cnt.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_word_cnt

In [4]:
with open('in.txt', 'r') as fin:
    text = fin.read()

In [5]:
word_and_freq = parse(text)

In [6]:
with open('out.txt', 'w') as fout:
    for word, freq in word_and_freq:
        fout.write('{} {}\n'.format(word, freq))

### JSON序列化
接受Python的基本数据雷顷，然后将其序列化为string

In [1]:
import json

In [2]:
params = {
    'symbol': '123456',
    'type': 'limit',
    'price': 123.4,
    'amount': 23
}

params_str = json.dumps(params)

print('after json serialization')
print('type of params_str = {}, params_str = {}'.format(type(params_str), params_str))

after json serialization
type of params_str = <class 'str'>, params_str = {"symbol": "123456", "type": "limit", "price": 123.4, "amount": 23}


### 反序列化
接受一个合法字符串，然后将其反序列化为Python的基本数据类型

In [4]:
original_params = json.loads(params_str)

print('after json deserialization')
print('type of original_params = {}, original_params = {}'.format(type(original_params), original_params))

after json deserialization
type of original_params = <class 'dict'>, original_params = {'symbol': '123456', 'type': 'limit', 'price': 123.4, 'amount': 23}


In [5]:
with open('params.json', 'w') as fout:
    params_str = json.dump(params, fout)  # dump用于文件句柄 dumps用于内存对象    

In [8]:
with open('params.json', 'r') as fin:
    original_params = json.load(fin)  # load用于文件句柄 loads用于内存对象
print('type of original_params={}, original_params={}'.format(type(original_params), original_params))

type of original_params=<class 'dict'>, original_params={'symbol': '123456', 'type': 'limit', 'price': 123.4, 'amount': 23}


### 思考题一

In [13]:
import re
def parse(text):
    # 使用正则表达式去除标点符号和换行符
    text = re.sub(r'[^\w ]', ' ', text)
    # 转为小写
    text = text.lower()
    # 生成所有单词的列表
    word_list = text.split(' ')
    # 去除空白单词
    word_list = filter(None, word_list)
    
    # 生成单词和词频的字典
    word_cnt = {}
    for word in word_list:
        if word not in word_cnt:
            word_cnt[word] = 0
        word_cnt[word] += 1
    # 按词频排序
    sorted_word_cnt = sorted(word_cnt.items(), key=lambda x: x[1], reverse=True)
    return sorted_word_cnt

#### 当文件过大，一次读入会导致内存溢出时的做法

In [18]:
# 将文件按行读入
def parse_readline(infile):
    word_cnt = {}
    while True:
        text = infile.readline()
        if not text:
            break;
        print(text)
        text = re.sub(r'[^\w ]', ' ', text)
        text = text.lower()
        word_list = text.split(' ')
        word_list = filter(None, word_list)
        
        for word in word_list:
            if word not in word_cnt:
                word_cnt[word] = 0
            word_cnt[word] += 1
        sorted_word_cnt = sorted(word_cnt.items(), key=lambda x: x[1], reverse=True)
    return sorted_word_cnt

In [19]:
with open('in.txt', 'r') as fin:
    word_and_freq = parse_readline(fin)

I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.

 

I have a dream that one day down in Alabama, with its vicious racists, . . . one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers. I have a dream today.

 

I have a dream that one day every valley shall be exalted, every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight, and the glory of the Lord shall be revealed, and all flesh shall see it together.

 

This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work to

In [20]:
with open('out_readline.txt', 'w') as fout:
    for word, freq in word_and_freq:
        fout.write('{} {}\n'.format(word, freq))

In [1]:
if x < 0:
    y = -x
else:
    y = x