In [1]:
PYCANTONESE_PATH = r'/home/lun/csrp/corpuses/pycantonese/'
CORPUS_PATH = r'/home/lun/csrp/code/corpus/hkcancor/'
OUTPUT_PATH = r'/home/lun/csrp/code/jieba-cantonese/'

import sys, re, glob, math, collections
sys.path.insert(0, PYCANTONESE_PATH)
import pycantonese as pc
import pandas as pd
import numpy as np
import pickle
from pprint import pprint
from contextlib import redirect_stdout

### Extract text and pos tags from HKCanCor CHAT files

In [2]:
corpus_size = 58
dataframes = []
for i in range(corpus_size):
    with open(CORPUS_PATH + r'text/hk_cantonese_corpus_%d.txt' % i, 
        'r', encoding='utf-8') as ftext:
        
        # put each string in list into a list of words,
        # removing all empty entries in latter
        text_list = [x.split(" ") for x in ftext.read().splitlines()]
        text_list = [list(filter(str.strip, x )) for x in text_list]
    assert(ftext.closed)
    
    
    with open(CORPUS_PATH + r'pos/hk_cantonese_corpus_pos_%d.txt' % i,
        'r', encoding='utf-8') as fpos:
        
        # put each string in list into a list of pos tags
        # removing all empty entries in latter
        pos_list = [x.split(" ") for x in fpos.read().splitlines()]
        pos_list = [list(filter(str.strip, x)) for x in pos_list]
    assert(fpos.closed)
    
    
    table = pd.DataFrame({ 'file_num': i, 'text' : text_list, 'pos' : pos_list })
    dataframes.append(table)
    


# Create a stopword list using a statistical model
### For details in the methodology behind, see 
#### `Zou et. al 2006, "Automatic Construction of Chinese Stop Word Lists"`

In [3]:
# create a stopword list
df_text = []

for d in dataframes:
    df_text.append(np.sum(d['text'].values) )


In [4]:
df_text = pd.DataFrame({'transcript': df_text})

In [5]:
# df_text.head()
total_num_text = df_text.count()[0]

In [6]:
df_words = []
for index, dt in enumerate(df_text['transcript'].values ):
    df_words.append(pd.DataFrame(dt) )
    numwords = df_words[index].count()[0]
    
    df_words[index].columns = ['word']
    df_words[index] = df_words[index].groupby('word')['word'].count()
    df_words[index] = pd.DataFrame(df_words[index])
    df_words[index].columns = ['num_instances']
    df_words[index]['word_prob'] = df_words[index]['num_instances'] / numwords
    df_words[index]['text_num'] = index
    df_words[index].reset_index(inplace=True)

In [7]:
df_words = pd.concat(df_words, axis=0, ignore_index=True)

In [8]:
df_words.head()

Unnamed: 0,word,num_instances,word_prob,text_num
0,Good,1,0.000475,0
1,Hifi,1,0.000475,0
2,Wilson,1,0.000475,0
3,fastforward,1,0.000475,0
4,office,1,0.000475,0


In [9]:
df_words.set_index('word', inplace=True)

In [10]:
df_sumN_prob = df_words.groupby('word')['word_prob'].sum()
df_sumN_prob.rename('sum_n_prob', inplace=True)

df_mean_prob = df_sumN_prob / total_num_text 
df_mean_prob.rename('mean_prob', inplace=True)

# join dataframes
df_sum_N_var_prob = df_words.join(pd.DataFrame(df_mean_prob) )

In [11]:
df_sum_N_var_prob.reset_index(inplace=True)

In [12]:
df_sum_N_var_prob

Unnamed: 0,word,num_instances,word_prob,text_num,mean_prob
0,',1,0.000354,43,0.000006
1,121,2,0.000571,2,0.000010
2,323,4,0.001143,2,0.000020
3,A,36,0.009549,12,0.000255
4,A,2,0.000883,26,0.000255
5,A,2,0.001031,35,0.000255
6,A,5,0.002555,39,0.000255
7,A,3,0.000750,46,0.000255
8,A1,4,0.001061,12,0.000018
9,AGM,3,0.000609,31,0.000011


In [13]:
df_sum_N_var_prob['sum_N_var_prob'] = np.power(
    df_sum_N_var_prob['word_prob'].values - df_sum_N_var_prob['mean_prob'].values, 2)

In [14]:
df_sum_N_var_prob.sort_values('num_instances', ascending=False).head()

Unnamed: 0,word,num_instances,word_prob,text_num,mean_prob,sum_N_var_prob
3727,係,334,0.061127,56,0.040246,0.000436
9481,噉,231,0.042277,56,0.020902,0.000457
3506,佢,202,0.036969,56,0.016589,0.000415
3686,係,190,0.05064,15,0.040246,0.000108
8013,啊,180,0.044042,18,0.031676,0.000153


In [15]:
df_sum_N_var_prob = df_sum_N_var_prob.groupby('word')['sum_N_var_prob'].sum()
df_var_prob = df_sum_N_var_prob / numwords
df_var_prob.rename('var_prob', inplace=True)

df_stopwords = pd.DataFrame({
    'mean_prob' : df_mean_prob,
    'var_prob' : df_var_prob,  
    'sat_val' : df_sumN_prob / df_sum_N_var_prob
})

In [16]:
df_stopwords = df_stopwords[['mean_prob', 'var_prob', 'sat_val']]

In [17]:
df_stopwords.sort_values('mean_prob', ascending=False) # high mp
# df_stopwords.sort_values('var_prob', ascending=False) # high var_prob
# df_stopwords.sort_values('sat_val', ascending=False) # high var_prob

Unnamed: 0_level_0,mean_prob,var_prob,sat_val
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
係,0.040246,9.182646e-06,171.642311
啊,0.031676,5.379635e-06,230.596737
呢,0.023257,6.018811e-06,151.326950
我,0.022570,9.152442e-06,96.575643
噉,0.020902,2.005573e-06,408.153301
你,0.019405,2.419016e-06,314.165411
唔,0.017552,7.505671e-06,91.583393
都,0.016892,1.025094e-06,645.340465
佢,0.016589,3.285804e-06,197.714609
好,0.016198,7.290269e-06,87.012368


# Create a stopword list using an information model
### For details in the methodology behind, see 
#### `Zou et. al 2006, "Automatic Construction of Chinese Stop Word Lists"`

In [18]:
# calculate the entropy for each word
df_words.head()

Unnamed: 0_level_0,num_instances,word_prob,text_num
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Good,1,0.000475,0
Hifi,1,0.000475,0
Wilson,1,0.000475,0
fastforward,1,0.000475,0
office,1,0.000475,0


In [19]:
df_entropy = df_words['word_prob'] * np.log2(1 / df_words['word_prob'])
df_entropy.rename('entropy', inplace=True)
df_entropy = pd.DataFrame(df_entropy)
df_entropy.reset_index(inplace=True)

In [20]:
df_stopwords['entropy'] = df_entropy.groupby('word')['entropy'].sum()

In [21]:
df_stopwords.sort_values('entropy', ascending=False)

Unnamed: 0_level_0,mean_prob,var_prob,sat_val,entropy
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
係,0.040246,9.182646e-06,171.642311,10.531480
啊,0.031676,5.379635e-06,230.596737,8.925534
呢,0.023257,6.018811e-06,151.326950,7.027129
我,0.022570,9.152442e-06,96.575643,6.839800
噉,0.020902,2.005573e-06,408.153301,6.633132
你,0.019405,2.419016e-06,314.165411,6.232663
都,0.016892,1.025094e-06,645.340465,5.678637
唔,0.017552,7.505671e-06,91.583393,5.671489
佢,0.016589,3.285804e-06,197.714609,5.459486
好,0.016198,7.290269e-06,87.012368,5.324538


In [22]:
#pre: attribute_type must be 'sat_val', 'mean_prob', 'var_prob', 'entropy'
def findRank(attribute_type, bool_ascending):
    return df_stopwords.sort_values(
        [attribute_type], ascending=bool_ascending ).reset_index().reset_index().set_index(
        'word')[['index']]

In [23]:
df_rank_sat_val = findRank('sat_val', True) # The higher the
df_rank_mean_prob = findRank('mean_prob', False)
df_rank_var_prob = findRank('var_prob', False)
df_rank_entropy = findRank('entropy', False)

In [24]:
df_rank = pd.DataFrame({
    'sat_val_rank' : df_rank_sat_val['index'], 
    'mean_prob_rank' : df_rank_mean_prob['index'], 
    'var_prob_rank' : df_rank_var_prob['index'], 
    'entropy_rank' : df_rank_entropy['index'] })

In [25]:
df_rank['weight'] = df_rank.sum(axis=1)

In [26]:
df_rank.reset_index(inplace=True)

In [27]:
df_rank.sort_values('weight', ascending=True, inplace=True)

In [39]:
# output stop words
# this list might need some further human cleaning
df_rank['index'].head(200).to_csv(
    r'/home/lun/Desktop/hkcancorpus_stopwords.txt', 
    sep=' ', index=False, header=False)

---

### Compile HMM data from output in Part 1

### Compile Statistical Data from HKCanCor CHAT files