In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from tqdm import tqdm

pd.set_option('display.max_columns', None)

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#屏蔽warning
import warnings
warnings.simplefilter("ignore")

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
from collections import defaultdict, Counter
import ast

In [3]:
file = open('./北大(人民日报)语料库199801.txt', 'rb')
dic = defaultdict(lambda: defaultdict(lambda: 0))

for line in file:
    line = line.decode('utf-8')[23:].split('  ')
    for item in line:
        item = item.split('/')
        if (len(item) == 2):
            dic[item[0]][item[1]] += 1

In [4]:
file = open('./北大(人民日报)语料库199801.txt', 'rb')
dic_2gram = defaultdict(lambda: defaultdict(lambda: 0))

for line in file:
    line = line.decode('utf-8')[23:].split('  ')
    pre = ''
    for item in line:
        item = item.split('/')
        if (len(item) == 2):
            dic_2gram[item[0]][pre] += 1
            pre = item[0]

In [5]:
dic_2gram_new = defaultdict(lambda: defaultdict(lambda: 0))
min_pro = 1

for key, value in dic_2gram.items():
    temp = ast.literal_eval(str(Counter(value))[8:-1])
    s = sum(temp.values())
    for k, v in temp.items():
        temp[k] = v / s
    temp[''] = 1 / s
    if min_pro > temp['']:
        min_pro = temp['']
    dic_2gram_new[key] = temp

In [6]:
def strB2Q(ustring):
    """半角转全角"""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 32:  #半角空格直接转化
            inside_code = 12288
        elif inside_code >= 32 and inside_code <= 126:  #半角字符（除空格）根据关系转化
            inside_code += 65248
        rstring += chr(inside_code)
    return rstring

In [7]:
# 简单版


def scentence_scenteparticiple(x):
    fmm = []
    hmm = []

    x = strB2Q(x)

    point = 0
    for i in range(1, len(x) + 1):
        if x[point:i] not in dic_2gram_new.keys():
            fmm.append(x[point:i - 1])
            point = i - 1
    fmm.append(x[point:])

    point = len(x) + 1
    for i in range(1, len(x) + 1):
        if x[-i:point] not in dic_2gram_new.keys():
            hmm.append(x[-i + 1:point])
            point = -i + 1
    hmm.append(x[:point])
    hmm.reverse()
    return fmm, hmm

In [19]:
# 复杂版


def scentence_scenteparticiple(x):
    fmm = []
    hmm = []

    x = strB2Q(x)

    i = 1
    point = 0
    while i <= len(x):
        flag = 0
        if x[point:i] not in dic_2gram_new.keys():
            for j in sorted(range(1, 10), reverse=True):
                if (x[point:i + j] in dic_2gram_new.keys()) & (flag == 0):
                    fmm.append(x[point:i + j - 1])
                    point = i + j - 1
                    i += j
                    flag = 1
            if (flag == 0):
                fmm.append(x[point:i - 1])
                point = i - 1
                i += 1
        else:
            i += 1
    fmm.append(x[point:])

    i = 1
    point = len(x) + 1
    while i <= len(x):
        flag = 0
        if x[-i:point] not in dic_2gram_new.keys():
            for j in sorted(range(1, 10), reverse=True):
                if (x[-i - j:point] in dic_2gram_new.keys()) & (flag == 0):
                    hmm.append(x[-i - j + 1:point])
                    point = -i - j + 1
                    i += j
                    flag = 1
            if (flag == 0):
                hmm.append(x[-i + 1:point])
                point = -i + 1
                i += 1
        else:
            i += 1
    hmm.append(x[:point])
    hmm.reverse()
    return fmm, hmm

In [20]:
import math


def com_probability(x, y):
    prox = 0
    proy = 0
    st = 0

    for i in range(0, min(len(x), len(y))):
        if x[i] == y[i]:
            st = i
        else:
            break

    if (st == len(x) - 1):
        return x
    elif (st == len(y) - 1):
        return y

    for i in range(st, len(x) - 1):
        if x[i] not in dic_2gram_new[x[i + 1]].keys():
            pro = min(dic_2gram_new[x[i + 1]].values(), default=min_pro)
        else:
            pro = dic_2gram_new[x[i + 1]][x[i]]
        prox = prox + abs(math.log(pro))

    for i in range(st, len(y) - 1):
        if y[i] not in dic_2gram_new[y[i + 1]].keys():
            pro = min(dic_2gram_new[y[i + 1]].values(), default=min_pro)
        else:
            pro = dic_2gram_new[y[i + 1]][y[i]]
        proy = proy + abs(math.log(pro))

    if prox <= proy:
        return x
    else:
        return y

In [21]:
file = open('./test.txt', 'rb')
test = ''

for line in file:
    line = line.decode('gbk')
    test = test + line

test = test.split('\r\n')

In [22]:
result = []

for item in test:
    x, y = scentence_scenteparticiple(item)
    result.append(com_probability(x, y))

In [23]:
result[95]

['中国',
 '跳水',
 '选手',
 '蔡',
 '玉燕',
 '和',
 '陈',
 '莉',
 '今天',
 '夺',
 '得了',
 '女子',
 '１０',
 '米',
 '台',
 '双人',
 '比赛',
 '银牌',
 '，',
 '乌克兰',
 '的',
 '朱皮娜',
 '和',
 '瑟比娜',
 '摘',
 '走',
 '金牌',
 '。']

In [28]:
isinstance(result[0][1], str)

True

In [27]:
with open('./2017111454.txt', 'a+') as f:
    for i in range(0, len(result)):
        for j in range(0, len(result[i])):
            if j != len(result[i]) - 1:
                f.writelines((result[i][j] + ' ').decode('utf8').encode('gbk'))
            else:
                f.writelines((result[i][j] + '\r\n').encode('gbk'))

AttributeError: 'str' object has no attribute 'decode'