In [2]:
import os
import nltk
import tensorflow as tf

# Download the 'punkt' package used for tokenizing (load_data calls
# nltk.sent_tokenize and nltk.word_tokenize).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead -
# confirm against the installed NLTK version.
nltk.download('punkt')

# for POS tagging
from nltk.tag.perceptron import AveragedPerceptron
from nltk.tag.perceptron import PerceptronTagger


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def load_data(path):
  """Tokenize a text file and count how often each token occurs.

  Args:
    path: the path of the file (in txt) to parse and tokenize

  Returns:
    A tuple (tokenized_sentences, token_dict): a list holding one list
    of tokens per sentence, and a dictionary mapping every token to the
    number of times it occurs in the file.
  """
  # the context manager closes the file even when reading fails
  with open(path, "r") as f:
    text = f.read()

  # documents -> list of sentences
  sentences_lst = nltk.sent_tokenize(text)

  # list of sentences -> list of tokenized sentences
  tokenized_sentences = [
      nltk.word_tokenize(sentence) for sentence in sentences_lst
  ]

  # tokenized sentences -> frequency of every token
  token_dict = dict()
  for tokenized in tokenized_sentences:
    for token in tokenized:
      token_dict[token] = token_dict.get(token, 0) + 1

  return tokenized_sentences, token_dict

In [4]:
def merge_dict(dict1, dict2):
  """Build a fresh dictionary holding the entries of dict1 and dict2.

  Neither input is modified. When a key is present in both, the value
  from dict2 is the one kept.
  """
  merged = dict(dict1)
  merged.update(dict2)
  return merged

In [5]:
def process_data():
    """Return a list of tokenized sentences and a word frequency dictionary.

    Every file found in the category folders under the current working
    directory is passed to load_data. The tokenized sentences of all
    documents are concatenated, and the per-document word counts are
    summed so the dictionary holds corpus-wide frequencies.

    Returns:
      A tuple (tokenized_sentences, word_freq_dict).
    """
    cwd = os.getcwd()
    categories = ["/businessEconomy", "/lifestyle", "/politics", "/technology"]

    tokenized_sentences = []
    word_freq_dict = dict()

    for category in categories:
        category_dir = cwd + category
        # os.listdir order is arbitrary; sort so re-runs give the same order
        for doc_name in sorted(os.listdir(category_dir)):
            doc_path = category_dir + "/" + doc_name
            tkn_sent, tkn_dict = load_data(doc_path)
            tokenized_sentences += tkn_sent
            # add the counts: a plain dict merge would overwrite the
            # frequency accumulated from earlier documents
            for word, count in tkn_dict.items():
                word_freq_dict[word] = word_freq_dict.get(word, 0) + count

    return tokenized_sentences, word_freq_dict
        

In [6]:
# Tokenize every document in the category folders under the current working
# directory and collect the word frequency dictionary (see process_data).
tokenized_sentences, word_freq_dict = process_data()

In [7]:
# Fetch the manually tagged Indonesian corpus (TSV) through Keras' file helper.
# The value is later used as the file path given to PrepareTrainData.
tagged_data = tf.keras.utils.get_file(
    fname="tagged_tsv_gh",
    origin="https://raw.githubusercontent.com/famrashel/idn-tagged-corpus/master/Indonesian_Manually_Tagged_Corpus_ID.tsv",
)

In [54]:
class PrepareTrainData():
    """Parse a POS-tagged corpus file into training data for a tagger.

    The file is read line by line. A line starting with "</" ends the
    current sentence, any other line starting with "<" begins a new one,
    and every remaining non-blank line is a tab-separated word/tag pair.
    """

    def __init__(self, path):
        """Remember the corpus path and create empty result containers.

        Args:
          path: the path of the tagged corpus file to parse
        """
        # path of the tagged corpus file
        self.tagged_data = path
        # tag -> set of words that occur with that tag
        self.pos_tags = dict()
        # list of sentences, each one a list of (word, tag) tuples
        self.train_data = []
        # every tag seen in the corpus
        self.pos_tags_set = set()

    def load(self):
        """Read the corpus file and fill train_data, pos_tags and pos_tags_set.

        Results of a previous call are discarded first, so calling load()
        again (e.g. re-running the notebook cell) does not duplicate data.

        Raises:
          IndexError: if a non-blank token line has no tab-separated tag.
        """
        self.train_data = []
        self.pos_tags = dict()

        # start with an empty sentence so a token line that appears before
        # any opening marker does not hit an unbound variable
        tagged_entry = []

        # the context manager closes the file even when parsing fails
        with open(self.tagged_data, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # blank separator lines carry no data
                    continue

                if line[:2] == "</":
                    # end of a sentence
                    self.train_data.append(tagged_entry)
                    # never keep appending to a sentence already stored
                    tagged_entry = []
                elif line[:1] == "<":
                    # instantiate a new tagged entry
                    tagged_entry = []
                else:
                    lst = line.split("\t")
                    word = lst[0]
                    tag = lst[1]

                    # add entry to tagged_entry to build tagged sentence
                    tagged_entry.append((word, tag))

                    # add the tag and word to pos_tags; the set must hold
                    # the whole word (set(word) would split it into chars)
                    self.pos_tags.setdefault(tag, set()).add(word)

        self.pos_tags_set = set(self.pos_tags.keys())
                

In [60]:
# Parse the tagged corpus file. The loader object gets its own name so that
# train_data only ever refers to the list of tagged sentences.
train_data_loader = PrepareTrainData(tagged_data)
train_data_loader.load()
train_data = train_data_loader.train_data

In [59]:
# Create the tagger with load=False; it is trained on train_data in a later cell.
# NOTE(review): load=False presumably skips loading NLTK's bundled pretrained
# model - confirm against the PerceptronTagger documentation.
perceptron_tagger = PerceptronTagger(load=False)

In [73]:
# Train the tagger on the tagged sentences collected in train_data.
# NOTE(review): save_loc presumably makes NLTK write the trained model to
# trained_model.pickle in the working directory - confirm against the
# PerceptronTagger.train documentation.
perceptron_tagger.train(train_data, save_loc="trained_model.pickle")