### Import libraries

In [1]:
import glob
import heapq
import os
import pickle
import re

import nltk
import pandas as pd
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

### Loading Myanmar stopwords and dictionary files

In [2]:
# Load the dictionary and stop-word lists, one entry per line.
# `with` closes each file handle as soon as its contents have been read.
with open("myanmar-dictionary.txt", encoding="utf8") as dict_file:  # your path here
    dict_words = dict_file.read().splitlines()
with open("myanmar-stopwords-list.txt", encoding="utf8") as stop_file:  # your path here
    stop_words = stop_file.read().splitlines()
print(len(dict_words))
print(len(stop_words))

40196
599


### Syllabification

In [3]:
# Myanmar Language
# Character-class fragments used to assemble the syllable-break regex below.
myConsonant = "က-အ"
enChar = "a-zA-Z0-9"
mmDigit = "၀-၉"
# Raw string: the trailing `\s` is a regex escape, not a Python string escape.
# (As a normal string literal it triggers an "invalid escape sequence" warning.)
otherChar = r"ဣဤဥဦဧဩဪဿ၌၍၏၊။!-/:-@[-`{-~\s"
ssSymbol = '္'
ngaThat = 'င်'
aThat = '်'
# Modified RE (for myanmar digit)
# A break is inserted before:
#   * a consonant that is not preceded by ssSymbol and not followed by aThat or ssSymbol,
#   * any single character listed in otherChar,
#   * a run of Myanmar digits,
#   * a run of ASCII letters.
BreakPattern = re.compile(r"((?<!" + ssSymbol + r")["+ myConsonant + r"](?![" + aThat + ssSymbol + r"])" + r"|[" + otherChar + r"]|[၀-၉]+|[a-zA-Z]+)", re.UNICODE)
# sylbreak function
def sylbreak(line):
    """Insert a space in front of every syllable found by ``BreakPattern``.

    Args:
        line: Text to break into syllables.

    Returns:
        str: ``line`` with all whitespace removed and a single space
        inserted before each match of ``BreakPattern`` (so a non-empty
        result that starts with a match begins with a space).
    """
    # Drop existing whitespace so the only spaces in the result are syllable breaks
    line = re.sub(r"\s+", "", line)
    line = BreakPattern.sub(r" " + r"\1", line)
    return line

### Function for Searching word in Dictionary

In [4]:
# Search the created N-syllable words in dictionary
def _is_in_dictionary(word):
    found = True
    if word in found_words:
        found = True
    elif word in not_found_words:
        found = False
    elif not (word in dict_words or word in stop_words):
        not_found_words.add(word)
        found = False
    else:
        found_words.add(word)
    return found

### Function to segment Myanmar words

In [5]:
# Greedy Left to Right Matching with maximum syllable, 6
def _left_to_right_matching(_input, _max_syllable):
    length = len(_input)
    position = 0
    result = []
    
    while length > 0:
        for i in range(_max_syllable, 0, -1):
            size = position + i
            
            # Proposed Segmented Words
            word = "".join(_input[position:size])
            
            if _is_in_dictionary(word) or i == 1:
                result.append(word)
                position += i
                length -= i
                break
    return result

### Customized Sentence Segmentation For Myanmar Text

In [6]:
# Customized class for Burmese sentence segmentation
class BulletPointLangVars(PunktLanguageVars):
    """Punkt language variables whose only sentence-ending character is '။'."""

    # One-element tuple (note the trailing comma). Without the comma the
    # parentheses are just grouping and the value is a plain string, whereas
    # the base class declares this attribute as a tuple of characters.
    sent_end_chars = ('။',)

### Preprocessing Myanmar Text

In [7]:
# Lookup caches filled by _is_in_dictionary: candidates already known to be
# present in / absent from the word lists
found_words, not_found_words = set(), set()
# Longest candidate word, counted in syllables, passed to the greedy matcher
max_syllable = 6

In [8]:
def pre_process_data(data):
    """Clean raw text and segment it into words.

    The text is stripped of '။', '?', '၊' and parentheses, broken into
    syllables with ``sylbreak``, tokenised with ``nltk.word_tokenize`` and
    merged into words by ``_left_to_right_matching`` using the module-level
    ``max_syllable``.

    No sentence segmentation is performed here: an earlier Punkt-based split
    was computed but never contributed to the returned value, so it was
    removed to avoid the wasted work on every document.

    Args:
        data: Raw text of one document.

    Returns:
        list of str: the segmented words, in document order.
    """
    # Clean text: turn the listed punctuation into spaces, then collapse whitespace runs
    formatted_data = re.sub('[။?၊\\(\\)]', ' ', data)
    formatted_data = re.sub(r'\s+', ' ', formatted_data)
    # Syllable break for input text (syllables come back space-separated)
    syllables = sylbreak(formatted_data)
    # Restore syllables in a list
    syllable_tokens = nltk.word_tokenize(syllables)
    # Greedy segmentation with at most max_syllable syllables per word
    segmented_text = _left_to_right_matching(syllable_tokens, max_syllable)
    return segmented_text

### Function to read raw news

In [9]:
### Creating a function to load all news
def read_raw_news(text_files, category):
    """Read, pre-process and record every news file of one category.

    For each path, the first line of the file is passed through
    ``pre_process_data``; the resulting words are joined with single spaces
    and appended to the module-level ``raw_train_data`` list as
    ``{'data': <text>, 'flag': <index of category in category_list>}``.

    Args:
        text_files: Iterable of file paths (UTF-8 text).
        category: Category name; must be an element of ``category_list``.

    Raises:
        ValueError: if ``category`` is not in ``category_list`` (from
            ``list.index``) and at least one file is processed.
    """
    for path in text_files:
        # `with` closes the handle even if reading or pre-processing fails
        with open(path, encoding="utf8") as news_file:
            content = news_file.read()

        # Only the first line of the file is used as the article text
        first_line = content.split("\n")[0]

        words = pre_process_data(first_line)
        text = " ".join(map(str, words))
        raw_train_data.append({'data' : text, 'flag' : category_list.index(category)})

### Defining the news category

In [10]:
# News categories; read_raw_news stores a category's position in this list as its 'flag'
category_list = [
    'business',
    'crime',
    'editorial',
    'entertainment',
    'politic',
]
# Training records, appended to by read_raw_news (one dict per article)
raw_train_data = list()

### Function to read news directory

In [11]:
# Function to read every news file of one category directory
def read_news_directory(category, dataset_dir="dataset"):
    """Load all ``*.txt`` news files of ``category`` via ``read_raw_news``.

    Files are looked up in ``<dataset_dir>/<category>/``. The path is built
    with ``os.path.join`` so it works on any operating system (the previous
    hard-coded backslashes only matched on Windows).

    Args:
        category: Category name; also the sub-directory name.
        dataset_dir: Root directory of the dataset. Defaults to "dataset".
    """
    pattern = os.path.join(dataset_dir, category, "*.txt")  # your path here
    # glob returns files in arbitrary order; sort so the row order of the
    # training data (and therefore the train/test split) is reproducible.
    news = sorted(glob.glob(pattern))
    # Reading all news under this category
    read_raw_news(news, category)

### Loading news

In [12]:
# Start from an empty list so that re-running this cell does not append
# every article a second time.
raw_train_data.clear()
# Load each category in order: business, crime, editorial, entertainment, politic
for category in category_list:
    read_news_directory(category)

### Creating a dataframe

In [13]:
# Assemble the loaded records into a two-column table ('data' text, 'flag' label)
data_df = pd.DataFrame(data=raw_train_data, columns=['data', 'flag'])
# Number of documents loaded
print(data_df['data'].shape)

(499,)


### Saving data as pickle file

In [14]:
# Write the assembled DataFrame to disk as a pickle file
data_df.to_pickle("data_df.pkl") # your path here

### Count Vectorizer

In [15]:
# Each document is a string of space-separated words, so tokenise on whitespace only
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split())
X_train_counts = count_vectorizer.fit_transform(data_df.data)
# Only the learned vocabulary is saved (the vectorizer itself holds a lambda,
# which the standard pickle module cannot serialise).
# `with` closes the file so the pickle is fully flushed to disk.
with open("count_vector.pkl", "wb") as vocab_file:  # your path here
    pickle.dump(count_vectorizer.vocabulary_, vocab_file)

### TFIDF

In [16]:
# Re-weight the raw term counts with TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Save the fitted transformer; `with` closes the file so the pickle is fully flushed
with open("tfidf.pkl", "wb") as tfidf_file:  # your path here
    pickle.dump(tfidf_transformer, tfidf_file)

### Model Construction

In [17]:
# Hold out 25% of the documents for testing; fixed seed keeps the split reproducible.
# NOTE(review): X_train_tfidf was fitted on the full corpus before this split, so the
# held-out rows influenced the IDF weights - consider fitting on the training split only.
X_train, X_test, Y_train, Y_test = train_test_split(X_train_tfidf, data_df.flag, test_size=0.25, random_state=42)
nb = MultinomialNB().fit(X_train, Y_train)
# Save the model and the split as pickle files.
# Each file is opened in a `with` block so its handle is closed (and the data
# flushed) before the next artefact is written.
for pickle_path, artefact in (
    ("nb_model.pkl", nb),
    ("x_train.pkl", X_train),
    ("x_test.pkl", X_test),
    ("y_train.pkl", Y_train),
    ("y_test.pkl", Y_test),
):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(artefact, pickle_file)