In [1]:
PYCANTONESE_PATH = r'/home/lun/csrp/corpuses/pycantonese/'
CORPUS_PATH = r'/home/lun/csrp/code/corpus/hkcancor/'
OUTPUT_PATH = r'/home/lun/csrp/code/jieba-cantonese/'
DICT_PATH = r'/home/lun/csrp/code/dictionaries/'

import sys, re, glob, math, collections
sys.path.insert(0, PYCANTONESE_PATH) # PyCantonese v0.21, modified
import pycantonese as pc
import pandas as pd
import numpy as np
import pickle
from pprint import pprint
from contextlib import redirect_stdout

### Extract text and pos tags from HKCanCor CHAT files
### and save them in separate files
#### Enable this code if this step has not been done previously

In [2]:
corpus_size = 58
dataframes = []
for i in range(corpus_size):
    text_path = CORPUS_PATH + r'text/hk_cantonese_corpus_%d.txt' % i
    pos_path = CORPUS_PATH + r'pos/hk_cantonese_corpus_pos_%d.txt' % i

    # Each line becomes one list of words; fragments that are empty or
    # whitespace-only (left behind by splitting on single spaces) are dropped.
    with open(text_path, 'r', encoding='utf-8') as ftext:
        text_list = [
            [token for token in line.split(" ") if token.strip()]
            for line in ftext.read().splitlines()
        ]
    assert ftext.closed

    # Same treatment for the file holding the POS tags.
    with open(pos_path, 'r', encoding='utf-8') as fpos:
        pos_list = [
            [tag for tag in line.split(" ") if tag.strip()]
            for line in fpos.read().splitlines()
        ]
    assert fpos.closed

    # One frame per file: a row per line, with the file number repeated.
    table = pd.DataFrame({'file_num': i, 'text': text_list, 'pos': pos_list})
    dataframes.append(table)
    


In [3]:
def removePunctuations(entry):
    """Delete punctuation characters from every string in ``entry``.

    Strings that become empty once the punctuation is removed are left out
    of the result.

    entry: iterable of strings
    returns: a new list of the non-empty cleaned strings
    """
    cleaned = (re.sub(r'[\,"\-\.\!…\?？○#]', "", item, count=0) for item in entry)
    return [item for item in cleaned if item]

def removeTrailingNumbersFromPOS(entry):
    """Remove digits from the end of every POS tag in ``entry``.

    Only trailing digits are removed (``'v1'`` becomes ``'v'``); digits at the
    start or in the middle of a tag are kept, as the function name promises.

    entry: iterable of POS tag strings
    returns: a new list of tags without trailing digits
    """
    return [tag.rstrip('0123456789') for tag in entry]

# removePunctuations(['v1', 'v', 'n', 'y', 'nr', 'nr', '?'])

In [4]:
# do this for all dataframes
for index, d in enumerate(dataframes):
    d['pos'] = d['pos'].apply(removePunctuations).apply(removeTrailingNumbersFromPOS)
    d['text'] = d['text'].apply(removePunctuations)

    # Print the file index when at least one row has a different number of
    # POS tags than words. The lengths are compared row by row; comparing the
    # two `.all()` results would only compare two booleans.
    length_mismatch = d['pos'].str.len() != d['text'].str.len()
    if length_mismatch.any():
        print (index)

In [5]:
dataframes[0].head()

Unnamed: 0,file_num,pos,text
0,0,"[v, v, n, y, nr, nr]","[有冇, 養, 寵物, 𡃉, 王, 美美]"
1,0,[v],[有]
2,0,"[v, u, m, q, n]","[養, 咗, 兩, 隻, 狗]"
3,0,"[a, y]","[真, 㗎]"
4,0,"[r, n, y]","[乜嘢, 樣, 𡃉]"


In [6]:
def joinASCIIWords(rowtext, rowpos):
    """Merge runs of adjacent ASCII words into one underscore-joined word.

    For every run of consecutive words that are all ASCII-encodable, the words
    are joined with "_" and only the POS tag of the first word of the run is
    kept.

    The inputs are not modified; new lists are returned. If ``rowpos`` is
    shorter than ``rowtext`` the missing tags are simply not removed, so a
    length mismatch no longer raises IndexError.

    rowtext: list of words
    rowpos: list of POS tags parallel to ``rowtext``
    returns: tuple ``(words, tags)`` of the merged lists
    """
    def string_is_ascii(string):
        try:
            string.encode(encoding='ascii')
        except UnicodeEncodeError:
            return False
        return True

    # Work on copies so the caller's lists are never changed mid-iteration.
    words = list(rowtext)
    tags = list(rowpos)

    index = 0
    while index < len(words):
        next_index = index + 1
        # Keep absorbing the following word while both sides are ASCII.
        while next_index < len(words) and \
            string_is_ascii(words[index]) and \
            string_is_ascii(words[next_index]):

            words[index] += "_" + words[next_index]
            del words[next_index]
            if next_index < len(tags):
                del tags[next_index]
        index += 1

    return words, tags


# do this for all dataframes
for d in dataframes:
    # Each row yields a (text, pos) pair; expanding the pairs with pd.Series
    # gives a two-column frame: column 0 = text, column 1 = pos.
    joined_rows = d.apply(
        lambda row: joinASCIIWords(row['text'], row['pos']), axis=1)
    result = joined_rows.apply(pd.Series)
    d['text'], d['pos'] = result[0], result[1]

del result

# x = ['c', 'v', 'xn', 'xn', 'xn', 'q']
# y = ['跟住', '買', 'fax', 'modem', 'modem', '個']
# joinASCIIWords(y, x)

# joinASCIIWords(['跟住', '買', '個', 'fax', 'modem', '而家', '幾百', '蚊', '啫', '嗎'])
# joinASCIIWords(['跟住', '買', '個', 'fax', 'modem', 'hello'])
# joinASCIIWords(['fax', 'modem', 'hello','跟住', 'hello','買', '個'])

In [7]:
dataframes[0].head()

Unnamed: 0,file_num,pos,text
0,0,"[v, v, n, y, nr, nr]","[有冇, 養, 寵物, 𡃉, 王, 美美]"
1,0,[v],[有]
2,0,"[v, u, m, q, n]","[養, 咗, 兩, 隻, 狗]"
3,0,"[a, y]","[真, 㗎]"
4,0,"[r, n, y]","[乜嘢, 樣, 𡃉]"


In [8]:
exempt = ['One2Free', 'A1', '323', '121', 'N64', '東方188' ]
def fixCantoneseJupingsInWordColumn(word_list, exempt_list):
    """Replace the digits in each non-exempt word with underscores.

    Every digit of a word that is not in ``exempt_list`` becomes "_", and one
    trailing "_" (if present) is then removed, e.g. ``'aa3bb3'`` becomes
    ``'aa_bb'``. A word that is a single digit therefore becomes ``''``.

    The replacement is not capped: ``re.sub`` takes ``count`` as its fourth
    positional argument, so passing a flag there would silently limit the
    number of substituted digits.

    word_list: list of words; it is updated in place
    exempt_list: words that must be left untouched
    returns: ``word_list`` (the same list object)
    """
    for index, word in enumerate(word_list):
        if word not in exempt_list and re.search(r'[0-9]', word):
            word = re.sub(r"[0-9]", "_", word)
            if word[-1] == "_":
                word = word[:-1]
        word_list[index] = word
    return word_list

# test
#fixCantoneseJupings('Zip1', words_with_digits)

In [9]:
# Rewrite the digit-bearing words in the 'text' column of every dataframe;
# words listed in `exempt` are passed through unchanged.
for d in dataframes:
    d['text'] = d['text'].apply(fixCantoneseJupingsInWordColumn, exempt_list=exempt)

---
## Create a stopword list using a statistical model
### For details of the underlying methodology, see
#### `Zou et al. 2006, "Automatic Construction of Chinese Stop Word Lists"`

In [10]:
# create a stopword list
# Flatten every dataframe's rows into one flat list of words (one transcript
# per dataframe). A nested comprehension runs in linear time, whereas summing
# an object array of lists concatenates them pairwise.
df_text = []

for d in dataframes:
    df_text.append([word for row in d['text'] for word in row])


In [11]:
df_text = pd.DataFrame({'transcript': df_text})
df_text.head()

Unnamed: 0,transcript
0,"[有冇, 養, 寵物, 𡃉, 王, 美美, 有, 養, 咗, 兩, 隻, 狗, 真, 㗎, ..."
1,"[除咗, 係, 唔係, 應該, 點, 講, 個, 感覺, 係, 係, 啊, 誒, 誒, 或者..."
2,"[Medium_rare, 呀, medium_rare, 跟住, 呢, 就, medium..."
3,"[喂, 你, 下年, 畢業, 嚹, 喎, 你, 諗, 住, 做, 乜嘢, 啊, 我, 諗, ..."
4,"[噉, 𠻺, 最近, 呢, 就, 喺, 新聞, 裏邊, 呢, 睇, 到, 呢, 就, 係, ..."


In [12]:
# df_text.head()
# Number of transcripts (non-null rows). The column is selected by label:
# indexing the result of DataFrame.count() with [0] depends on positional
# fallback for a label-indexed Series.
total_num_text = df_text['transcript'].count()

In [16]:
# Build one frame per transcript with columns
# word / num_instances / word_prob / text_num.
df_words = []
for index, dt in enumerate(df_text['transcript'].values ):
    words = pd.DataFrame(dt, columns=['word'])
    # Token count of this transcript, selected by label rather than with
    # count()[0], which depends on positional fallback.
    numwords = words['word'].count()

    # Occurrences of each distinct word within this transcript.
    per_text = words.groupby('word')['word'].count().rename('num_instances').to_frame()
    per_text['word_prob'] = per_text['num_instances'] / numwords
    per_text['text_num'] = index
    df_words.append(per_text.reset_index())

In [None]:
df_words = pd.concat(df_words, axis=0, ignore_index=True)

In [17]:
# Preview the word statistics of the first transcript. df_words is a single
# concatenated DataFrame at this point, so the rows are selected through the
# text_num column instead of list-style indexing with [0].
df_words[df_words['text_num'] == 0].head(10)

Unnamed: 0,word,num_instances,word_prob,text_num
0,Bandai,6,0.001929,0
1,CSL,1,0.000321,0
2,IMS,1,0.000321,0
3,Internet,7,0.002250,0
4,Kelly,1,0.000321,0
5,Netscape,1,0.000321,0
6,OT,4,0.001286,0
7,QC_check,1,0.000321,0
8,Samsung,1,0.000321,0
9,base,1,0.000321,0


In [None]:
df_words.head()

In [None]:
df_words.set_index('word', inplace=True)

In [None]:
# Per word: sum of its word_prob values over the rows of df_words.
df_sumN_prob = df_words.groupby('word')['word_prob'].sum().rename('sum_n_prob')

# Per word: that sum divided by the number of transcripts.
df_mean_prob = (df_sumN_prob / total_num_text).rename('mean_prob')

# join dataframes
df_sum_N_var_prob = df_words.join(df_mean_prob.to_frame())

In [None]:
df_sum_N_var_prob.reset_index(inplace=True)

In [None]:
df_sum_N_var_prob.head()

In [None]:
# Squared deviation of each row's word_prob from that word's mean_prob.
# NOTE(review): if df_words only has rows for texts in which a word actually
# occurs, texts lacking the word add nothing here even though mean_prob is
# divided by the total number of texts - confirm this is intended.
df_sum_N_var_prob['sum_N_var_prob'] = np.power(
    df_sum_N_var_prob['word_prob'].values - df_sum_N_var_prob['mean_prob'].values, 2)

In [None]:
df_sum_N_var_prob.sort_values('num_instances', ascending=False).head()

In [None]:
# Per word: sum of the squared deviations computed in the previous step.
df_sum_N_var_prob = df_sum_N_var_prob.groupby('word')['sum_N_var_prob'].sum()

# Variance across transcripts: divide by the number of transcripts, the same
# denominator used for mean_prob. `numwords` must not be used here - it is a
# loop variable left over from the per-transcript counting cell and holds the
# token count of the last transcript only.
df_var_prob = (df_sum_N_var_prob / total_num_text).rename('var_prob')

# sat_val is the ratio of the summed probability to the summed squared
# deviation (the shared 1/total_num_text factor cancels out).
df_stopwords = pd.DataFrame({
    'mean_prob' : df_mean_prob,
    'var_prob' : df_var_prob,  
    'sat_val' : df_sumN_prob / df_sum_N_var_prob
})

In [None]:
df_stopwords = df_stopwords[['mean_prob', 'var_prob', 'sat_val']]

In [None]:
df_stopwords.sort_values('mean_prob', ascending=False) # high mp
# df_stopwords.sort_values('var_prob', ascending=False) # high var_prob
# df_stopwords.sort_values('sat_val', ascending=False) # high var_prob

---
## Create a stopword list using an information model
### For details of the underlying methodology, see
#### `Zou et al. 2006, "Automatic Construction of Chinese Stop Word Lists"`

In [None]:
# calculate the entropy for each word
df_words.head()

In [None]:
# One entropy term p * log2(1 / p) per row of df_words; the index is then
# turned back into an ordinary column so the terms can be grouped by word.
df_entropy = (
    (df_words['word_prob'] * np.log2(1 / df_words['word_prob']))
    .rename('entropy')
    .to_frame()
    .reset_index()
)

In [None]:
df_stopwords['entropy'] = df_entropy.groupby('word')['entropy'].sum()

In [None]:
df_stopwords.sort_values('entropy', ascending=False)

In [None]:
#pre: attribute_type must be 'sat_val', 'mean_prob', 'var_prob', 'entropy'
def findRank(attribute_type, bool_ascending):
    """Rank the words of ``df_stopwords`` by one of its columns.

    attribute_type: column of ``df_stopwords`` to sort by
    bool_ascending: True to give rank 0 to the smallest value, False to give
        rank 0 to the largest
    returns: a DataFrame indexed by 'word' with a single column 'index'
        holding each word's 0-based position in the sorted order
    """
    ordered = df_stopwords.sort_values(by=[attribute_type], ascending=bool_ascending)
    # First reset: move the 'word' index into a column.
    with_words = ordered.reset_index()
    # Second reset: expose the fresh 0..n-1 positions as a column named 'index'.
    with_ranks = with_words.reset_index()
    return with_ranks.set_index('word')[['index']]

In [None]:
# Rank 0 goes to the first row after sorting: sat_val is sorted ascending
# (smallest value ranks first), the other three metrics descending (largest
# value ranks first).
# NOTE(review): confirm that ascending order is really intended for sat_val.
df_rank_sat_val = findRank('sat_val', True)
df_rank_mean_prob = findRank('mean_prob', False)
df_rank_var_prob = findRank('var_prob', False)
df_rank_entropy = findRank('entropy', False)

In [None]:
df_rank = pd.DataFrame({
    'sat_val_rank' : df_rank_sat_val['index'], 
    'mean_prob_rank' : df_rank_mean_prob['index'], 
    'var_prob_rank' : df_rank_var_prob['index'], 
    'entropy_rank' : df_rank_entropy['index'] })

In [None]:
df_rank['weight'] = df_rank.sum(axis=1)

In [None]:
df_rank.reset_index(inplace=True)

In [None]:
df_rank.sort_values('weight', ascending=True, inplace=True)

In [None]:
df_rank.head()

In [None]:
# output stop words
# this list might need some further human cleaning
# After reset_index() the words live in a column called 'word' when the index
# name survived the construction of df_rank, or 'index' when it was lost
# (this can differ between pandas versions), so accept either name.
word_column = 'word' if 'word' in df_rank.columns else 'index'
df_rank[word_column].head(50).to_csv(DICT_PATH + \
    r'hkcancorpus_stopwords.txt', 
    sep=' ', index=False, header=False)

## Compile HMM data

In [None]:
df = pd.concat(dataframes, axis=0, ignore_index=True)
df = df[['file_num', 'text', 'pos']].copy()

In [None]:
# function to check if string is an ascii
def string_is_ascii(string):
    """Return True if ``string`` can be encoded as ASCII, otherwise False."""
    try:
        string.encode(encoding='ascii')
    except UnicodeEncodeError:
        is_ascii = False
    else:
        is_ascii = True
    return is_ascii


# preprocessing function for BMES tagging of words
#
# It separates words to a list of characters
# To preserve the ascii word,
# find the first pointer ascii character position
# find the last pointer ascii character position
# concatenate ascii characters in the sublist, 
# removing empty strings and white spaces in between
# parameter: word - word
# returns: a list of tokens
def tokenize_word(word):
    """Split ``word`` into tokens for BMES tagging.

    Every non-ASCII character is a token of its own. A run of consecutive
    ASCII characters is kept together, except that it is split on spaces and
    the empty pieces are discarded.

    word: the word to split
    returns: list of tokens, in their original order
    """
    tokens = []
    ascii_run = []  # characters of the ASCII run currently being collected

    def flush_ascii_run():
        # Close the pending ASCII run: split it on spaces, keep the non-empty
        # pieces as tokens, then start over with an empty run.
        if ascii_run:
            tokens.extend(
                piece for piece in "".join(ascii_run).split(" ") if piece)
            del ascii_run[:]

    for char in word:
        if string_is_ascii(char):
            ascii_run.append(char)
        else:
            flush_ascii_run()
            tokens.append(char)
    flush_ascii_run()  # the word may end inside an ASCII run

    return tokens

    

# function to tag the tokens of a word using the
# BMES (begin, middle, end, single) tagging system
def tagWord_BMES(word):
    """Return the BMES tag of every token of ``word``.

    The word is split with ``tokenize_word``. A single token is tagged "S";
    otherwise the first token is "B", the last one "E" and every token in
    between "M". Only the tags are returned, not the tokens.

    precondition: ``word`` must not be empty (checked with an assert)
    returns: list of tags, one per token
    """
    assert len(word)

    token_count = len(tokenize_word(word))
    if token_count == 1:
        return ["S"]
    if token_count == 0:
        # Nothing to tag, e.g. when the word holds no tokens after splitting.
        return []
    return ["B"] + ["M"] * (token_count - 2) + ["E"]

In [None]:
# # test cases
# print(tokenize_word('你office land過牆梯'))
# print(tokenize_word('office'))
# print(tokenize_word('你office'))
# print(tokenize_word('hello過牆梯world'))
# print(tokenize_word('hello過牆梯world過牆'))
# print(tokenize_word('你過牆梯'))
# print(tokenize_word('Hello  World'))

# print(tagWord_BMES('你有過牆梯')) # BMMME
# print(tagWord_BMES('office')) # S
# print(tagWord_BMES('你office')) # BE
# print(tagWord_BMES('office牆')) # BE
# print(tagWord_BMES('Hello World')) # BE
# print(tagWord_BMES('有有')) # BE
# print(tagWord_BMES('有')) # S

In [None]:
# a helper to count total number of start instances
def countTotalStartInstances(start_dict):
    """Return the sum of all counts stored as values of ``start_dict``."""
    return sum(start_dict.values())


# create training algorithm to calculate 
# emission (BMES->word) and transition (BMES->BMES)
# probabilities
# returns: a tuple of prob_trans, prob_emit, and prob_start
def trainingHMM_BMESTagging(text_lists):
    """Estimate log2 HMM probabilities for BMES tagging.

    Each word is split with ``tokenize_word`` and tagged with
    ``tagWord_BMES``; every token is counted as an emission of its own tag.
    The emitted symbol is the token itself (a whole ASCII run or a single
    non-ASCII character), not the character found at the token's position in
    the word.

    text_lists: list of lines, each line being a list of non-empty words
    returns: tuple ``(prob_trans, prob_emit, prob_start)`` where
        prob_trans[previous_tag][tag] and prob_emit[tag][token] are log2
        relative frequencies, and prob_start[tag] is the log2 share of lines
        whose first token carries ``tag`` ("M" and "E" are fixed to -3.14e100)
    """
    emission = collections.Counter()    # (tag, token) -> count
    transition = collections.Counter()  # (previous tag, tag) -> count
    context = collections.Counter()     # tag -> number of occurrences
    start = collections.Counter()       # tag of a line's first token -> count

    # for prob_*.* files in jieba
    prob_trans = collections.defaultdict(dict)
    prob_emit = collections.defaultdict(dict)
    prob_start = {}

    # this is for the training part
    for line_list in text_lists:
        previous = '<s>'
        context[previous] += 1

        for j, word in enumerate(line_list):
            tokens = tokenize_word(word)
            text_bmesTags_list = tagWord_BMES(word)
            if j == 0:
                start[text_bmesTags_list[0]] += 1

            for token, bmesTag in zip(tokens, text_bmesTags_list):
                transition[(previous, bmesTag)] += 1
                context[bmesTag] += 1
                emission[(bmesTag, token)] += 1
                previous = bmesTag

        transition[(previous, '</s>')] += 1

    # output transition, emission and start probabilities
    for (previous_tag, current_tag), value in transition.items():
        # The sentence boundary markers are counted but not exported.
        if previous_tag != '<s>' and current_tag != '</s>':
            prob_trans[previous_tag][current_tag] = math.log2(float(value)/context[previous_tag])

    for (tag, token), value in emission.items():
        prob_emit[tag][token] = math.log2(float(value)/context[tag])

    # The total is loop-invariant, so it is computed once.
    total_starts = countTotalStartInstances(start)
    for tag, value in start.items():
        prob_start[tag] = math.log2(float(value)/total_starts)
    prob_start["M"] = -3.14e100 # minimum float value defined in jieba (MIN_FLOAT)
    prob_start["E"] = -3.14e100 # minimum float value defined in jieba (MIN_FLOAT)

    return dict(prob_trans), dict(prob_emit), prob_start

In [None]:
prob_trans1, prob_emit1, prob_start1 = trainingHMM_BMESTagging(df.text.tolist())

In [None]:
# create training algorithm to calculate 
# emission (pos->word) and transition (pos-pos)
# probabilities
def trainingHMM_POSTagging(text_lists, posTags_lists):
    """Estimate log2 HMM probabilities over joint (BMES tag, POS tag) states.

    Each word is split with ``tokenize_word`` and tagged with
    ``tagWord_BMES``; every token is paired with the state
    ``(bmes_tag, pos_tag)`` built from its own BMES tag and the POS tag of
    the word it belongs to.

    text_lists: list of lines, each line being a list of non-empty words
    posTags_lists: list of lines of POS tags, parallel to ``text_lists``
        (words and tags are paired with zip, so surplus items are ignored)
    returns: tuple ``(prob_trans, prob_emit, prob_start, char_state)``
        prob_trans[state][next_state] and prob_emit[state][token] hold log2
        relative frequencies; prob_start[state] holds the log2 share of lines
        starting in ``state``; char_state[token] is a sorted tuple of the
        distinct states seen for ``token``. Every combination of a BMES tag
        with a POS tag (seen in the data or listed in the built-in tagset)
        that was never observed gets an empty dict in prob_trans/prob_emit
        and -3.14e100 in prob_start.
    """
    emission = collections.Counter()    # (state, token) -> count
    transition = collections.Counter()  # (previous state, state) -> count
    context = collections.Counter()     # state -> number of occurrences
    start = collections.Counter()       # state of a line's first token -> count
    char_state = {}                     # token -> states seen for it

    # for char_state_tab.*, prob_*.* files in jieba
    prob_trans = collections.defaultdict(dict)
    prob_emit = collections.defaultdict(dict)
    prob_start = {}

    # existing tagset may contain a combination of tags
    pos_tagset1 = [pos_tag for posTag_list in posTags_lists for pos_tag in posTag_list]
    pos_tagset2 = ['ag', 'a', 'ad', 'an', 'bg', 'b', 'c', 'dg', 'd', 
        'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'mg', 'm', 'ng', 'n', 
        'nr', 'ns', 'nt', 'nx', 'nz', 'o', 'p', 'qg', 'q', 'rg', 'r', 
        's', 'tg', 't', 'ug', 'u', 'vg', 'v', 'vd', 'vn', 'w', 'x', 
        'yg', 'y', 'z'] # official HKCanCor tagset
    unique_pos_tagset = set(pos_tagset1 + pos_tagset2)
    bmes_tagset = {'B', 'M', 'E', 'S'}
    # Sorted so that unobserved states are always added in the same order;
    # iterating the sets directly can give a different order on every run.
    all_tag_pairs = [(bmes_tag, pos_tag)
                     for bmes_tag in sorted(bmes_tagset)
                     for pos_tag in sorted(unique_pos_tagset)]

    # this is for the training part
    for line_list, linePosTags_list in zip(text_lists, posTags_lists):
        previous = '<s>' # start sentence tag
        context[previous] += 1

        for j, (word, pos_tag) in enumerate(zip(line_list, linePosTags_list)):
            character_bmesTags_list = tagWord_BMES(word)
            character_token_list = tokenize_word(word)

            # build up start dictionary
            if j == 0:
                start[(character_bmesTags_list[0], pos_tag)] += 1

            # build up transition, emission dictionaries
            for token, bmesTag in zip(character_token_list, character_bmesTags_list):
                tag_pair = (bmesTag, pos_tag)

                transition[(previous, tag_pair)] += 1
                context[tag_pair] += 1
                emission[(tag_pair, token)] += 1
                char_state.setdefault(token, []).append(tag_pair)

                previous = tag_pair

        transition[(previous, '</s>')] += 1

    # output transition, emission and start probabilities
    for (previous_tag_pair, current_tag_pair), value in transition.items():
        # The sentence boundary markers are counted but not exported.
        if previous_tag_pair != '<s>' and current_tag_pair != '</s>':
            prob_trans[previous_tag_pair][current_tag_pair] = math.log2(float(value)/context[previous_tag_pair])
    for tag_pair in all_tag_pairs: # do this for empty tag pairs
        if tag_pair not in prob_trans:
            prob_trans[tag_pair] = {}

    for token, tag_pair_list in char_state.items():
        # only keep unique tag sets, in a reproducible order
        char_state[token] = tuple(sorted(set(tag_pair_list)))

    for (tag_pair, token), value in emission.items():
        prob_emit[tag_pair][token] = math.log2(float(value)/context[tag_pair])
    for tag_pair in all_tag_pairs: # do this for empty tag pairs
        if tag_pair not in prob_emit:
            prob_emit[tag_pair] = {}

    # The total is loop-invariant, so it is computed once.
    total_starts = countTotalStartInstances(start)
    for tag_pair, value in start.items():
        prob_start[tag_pair] = math.log2(float(value)/total_starts)
    for tag_pair in all_tag_pairs: # do this for empty tag pairs
        if tag_pair not in prob_start:
            prob_start[tag_pair] = -3.14e100 # minimum float value defined in jieba (MIN_FLOAT)

    return dict(prob_trans), dict(prob_emit), prob_start, char_state

In [None]:
prob_trans2, prob_emit2, prob_start2, char_state2 = trainingHMM_POSTagging(df.text.tolist(), df.pos.tolist())

In [None]:
# char_state2

In [None]:
# s4, s5, s6, s7 = trainingHMM_POSTagging([['重', '有得', '搞']], [['d', 'vu', 'v']])

In [None]:
# s4

In [None]:
### Output prob_trans, prob_emit, prob_start

In [None]:
def outputDictionary(filename, prob_dict):
    """Write ``prob_dict`` to ``OUTPUT_PATH + filename`` as Python source.

    The file holds the text ``P=`` followed directly by the pretty-printed
    dictionary.
    """
    target = OUTPUT_PATH + filename
    with open(target, 'w', encoding='utf-8') as f:
        f.write("P=")
        pprint(prob_dict, stream=f)
    assert f.closed

def pickleDictionary(filename, prob_dict):
    """Pickle ``prob_dict`` into the file ``OUTPUT_PATH + filename``."""
    # jieba uses protocol 0 encoding for its pickle files
    payload = pickle.dumps(prob_dict, protocol=0)
    with open(OUTPUT_PATH + filename, 'wb') as f:
        f.write(payload)
    assert f.closed
    
def depickleDictionary(filename):
    """Load and return the object pickled in ``OUTPUT_PATH + filename``.

    Unpickling can execute arbitrary code, so only use this on files that
    were produced by this notebook.
    """
    with open(OUTPUT_PATH + filename, 'rb') as f:
        raw_bytes = f.read()
    assert f.closed
    return pickle.loads(raw_bytes, encoding='utf-8')
    
# Tables returned by trainingHMM_POSTagging, written under posseg/ as
# "P=<pretty-printed dict>" text files ...
outputDictionary("posseg/prob_trans.py", prob_trans2)
outputDictionary("posseg/prob_emit.py", prob_emit2)
outputDictionary("posseg/prob_start.py", prob_start2)
outputDictionary("posseg/char_state_tab.py", char_state2)

# ... and as protocol-0 pickles.
pickleDictionary("posseg/prob_trans.p", prob_trans2)
pickleDictionary("posseg/prob_emit.p", prob_emit2)
pickleDictionary("posseg/prob_start.p", prob_start2)
pickleDictionary("posseg/char_state_tab.p", char_state2)

# Tables returned by trainingHMM_BMESTagging, written under finalseg/ as
# text files ...
outputDictionary("finalseg/prob_trans.py", prob_trans1)
outputDictionary("finalseg/prob_emit.py", prob_emit1)
outputDictionary("finalseg/prob_start.py", prob_start1)

# ... and as protocol-0 pickles.
pickleDictionary("finalseg/prob_trans.p", prob_trans1)
pickleDictionary("finalseg/prob_emit.p", prob_emit1)
pickleDictionary("finalseg/prob_start.p", prob_start1)


# s1 = depickleDictionary("finalseg/prob_trans.p")
# s2 = depickleDictionary("finalseg/prob_emit.p")
# s3 = depickleDictionary("finalseg/prob_start.p")

# s1 = depickleDictionary("posseg/prob_trans.p")
# s2 = depickleDictionary("posseg/prob_emit.p")
# s3 = depickleDictionary("posseg/prob_start.p")
# s4 = depickleDictionary("posseg/char_state_tab.p")

In [None]:
# s4

### Compile Dictionary Word Count

In [None]:
# Put everything in Pandas
# This is not necessary, but 
# it shows the layouts neatly
df_full = pd.concat(dataframes).reset_index(drop=True)

In [None]:
df_full

In [None]:
# One row per (word, pos) pair across all rows of df_full. The pairs are
# collected with a single comprehension, which runs in linear time; summing
# an object array of lists concatenates them pairwise.
df_full = pd.DataFrame([
    (word, pos)
    for text, tags in zip(df_full['text'], df_full['pos'])
    for word, pos in zip(text, tags)
])

In [None]:
df_full.columns = ['word', 'pos']

In [None]:
df_full

In [None]:
df_full = df_full.groupby(['word','pos'], sort=False).size().reset_index(name='count')

In [None]:
# reorder the columns according to Jieba Dictionary layout
df_full = df_full[['word', 'count', 'pos']]

In [None]:
df_full.head()

In [None]:
# words_with_digits = df_full[df_full['word'].str.contains(r"[0-9]+", regex=True)]['word'].values

# # define what to exclude from the set words_with_digits
# words_with_digits = set(words_with_digits).difference({'One2Free', 'A1', '323', '121', 'N64', '東方188' })

In [None]:
# words_with_digits

In [None]:
# df_full['word'] = df_full['word'].apply(fixCantoneseJupingsInWordColumn, fix_list=words_with_digits)

In [None]:
# df_full[df_full['word'].str.contains(r"[0-9]+", regex=True)]['word'].values

In [None]:
# df_full[df_full['word'].str.contains(r"[A-Za-z]+", regex=True)]['word'].values

In [None]:
df_full.reset_index(drop=True, inplace=True)

In [None]:
df_nouns = df_full[df_full.pos.str.contains('^n|[^va]n', regex=True)]
df_nouns.head()

In [None]:
# Every entry of df_full that is not in df_nouns. The noun rows are removed
# by index label, which keeps the remaining rows untouched: no detour through
# NaN (which turns the counts into floats) and no dropna() that could also
# discard a non-noun row containing a missing value.
df_others = df_full.drop(df_nouns.index)
df_others['count'] = df_others['count'].astype('int')

In [None]:
df_nouns.word.to_csv(
    DICT_PATH + r'nouns.txt', 
    sep=' ', index=False, header=False)

In [None]:
df_others.word.to_csv(
    DICT_PATH + r'others.txt', 
    sep=' ', index=False, header=False)

In [None]:
df_full.to_csv(
    DICT_PATH + r'hkcantonesedict.txt', 
    sep=' ', index=False, header=False)