## Import libraries

In [23]:
import os
import random
import string
import nltk
from nltk import word_tokenize
from collections import defaultdict
from nltk import FreqDist
import sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pickle

## Specify the data directory and the category labels

In [24]:
# Root folder of the raw articles; it holds one sub-folder per category.
BASE_DIR = './data'
# Category names. Each one is also the name of its sub-folder under BASE_DIR.
LABELS = [
    'business',
    'entertainment',
    'politics',
    'sport',
    'tech',
]

## Stop word settings

The English stop-word list is used because the data set is in English. The words "said" and "mr" are added to the list as well.

In [25]:
# NLTK's English stop words plus two extra words that should also be filtered out.
stop_word = set(stopwords.words('english')) | {'said', 'mr'}

## Create data set

Read every file in the data folder and write its contents to `data.txt`, one document per line. Each line is labeled with the name of the sub-folder the file came from.

In [26]:
def create_data_set():
    """Flatten the raw article files into a single tab-separated ``data.txt``.

    For every label in ``LABELS`` the regular files found in
    ``BASE_DIR/<label>`` are read and written to ``data.txt`` (in the current
    working directory) as one row each: ``label<TAB>filename<TAB>text``.

    Every run of whitespace in an article (newlines, carriage returns, tabs,
    repeated spaces) is collapsed into a single space. This keeps each
    document on one row, keeps the words on either side of a line break
    separated, and guarantees the text itself never contains the tab
    character that is used as the column separator.

    Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
    raising, so a badly encoded file cannot abort the export.

    Raises:
        OSError: if a label directory or ``data.txt`` cannot be opened.
    """
    with open('data.txt', 'w', encoding='utf8') as outfile:
        for label in LABELS:
            label_dir = os.path.join(BASE_DIR, label)
            # os.listdir returns entries in arbitrary order; sorting makes the
            # generated file identical from run to run.
            for filename in sorted(os.listdir(label_dir)):
                path = os.path.join(label_dir, filename)
                # Skip sub-directories and other non-regular entries, which
                # cannot be read as an article.
                if not os.path.isfile(path):
                    continue
                with open(path, 'rb') as infile:
                    raw = infile.read().decode('utf-8', errors='replace')
                text = ' '.join(raw.split())
                outfile.write('%s\t%s\t%s\n' % (label, filename, text))

Load the data set: read the label and the news text of every row in `data.txt` and collect them as `(label, text)` pairs in the `docs` list.

In [27]:
def setup_doc():
    """Load the labelled documents from ``data.txt``.

    Each row is expected to have the form ``label<TAB>filename<TAB>text``.
    The filename column is ignored.

    The row is split on the first two tabs only, so a tab inside the text
    column is kept as part of the text instead of silently truncating it.
    Rows with fewer than three columns (for example a blank line) are skipped
    rather than raising ``IndexError``.

    Returns:
        list[tuple[str, str]]: ``(label, text)`` pairs in file order, with
        surrounding whitespace stripped from the text.

    Raises:
        OSError: if ``data.txt`` cannot be opened.
    """
    docs = []  # [(label, text)]
    with open('data.txt', 'r', encoding='utf8') as datafile:
        for row in datafile:
            parts = row.split('\t', 2)
            if len(parts) < 3:
                continue
            label, _filename, text = parts
            docs.append((label, text.strip()))
    return docs

## Clean text

Remove punctuation and convert all letters to lowercase

In [28]:
def clean_text(text):
    """Return ``text`` lowercased and with every ``string.punctuation`` character removed."""
    # A mapping of character -> None tells str.translate to delete that character.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(drop_punctuation)

## Tokenization of words

In [29]:
def get_tokens(text):
    """Tokenize ``text`` with ``word_tokenize`` and drop every token found in ``stop_word``."""
    return [token for token in word_tokenize(text) if token not in stop_word]

## Look at the frequency of each word that appears

In [30]:
def frequency_dist(docs):
    """Print the 20 most common tokens of every category.

    Args:
        docs: iterable of items whose first element is the label and whose
            second element is the document text.

    Each text is cleaned with ``clean_text`` and tokenized with
    ``get_tokens``; the tokens are pooled per label. For every label, in
    order of first appearance, the label is printed followed by the
    ``FreqDist.most_common(20)`` list of its pooled tokens.
    """
    tokens_by_label = defaultdict(list)

    for entry in docs:
        label, raw_text = entry[0], entry[1]
        tokens_by_label[label] += get_tokens(clean_text(raw_text))

    for label in tokens_by_label:
        print(label)
        print(FreqDist(tokens_by_label[label]).most_common(20))

## Create the training and test sets

In [31]:
def get_split(docs, train_fraction=0.80, seed=None):
    """Randomly split labelled documents into a training and a test set.

    The caller's ``docs`` list is left untouched: a shallow copy is shuffled
    instead, so running the function again (or re-running the notebook cell)
    does not depend on the order left behind by a previous call.

    Args:
        docs: sequence of ``(label, text)`` pairs.
        train_fraction: share of the documents that goes into the training
            set; the number of training documents is
            ``int(train_fraction * len(docs))``. Defaults to 0.80.
        seed: optional seed for a private random generator, giving a
            reproducible split. When ``None`` (the default) the module-level
            ``random`` generator is used, so a prior ``random.seed(...)``
            call is still honoured.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` lists
        hold the texts and the ``y_*`` lists hold the matching labels.
    """
    shuffled = list(docs)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    pivot = int(train_fraction * len(shuffled))
    train_docs, test_docs = shuffled[:pivot], shuffled[pivot:]

    X_train = [doc[1] for doc in train_docs]
    y_train = [doc[0] for doc in train_docs]

    X_test = [doc[1] for doc in test_docs]
    y_test = [doc[0] for doc in test_docs]

    return X_train, X_test, y_train, y_test

## Evaluate the results of the model created

In [32]:
def evaluate_classifier(title, classifier, vectorizer, X_test, y_test, average='micro'):
    """Print precision, recall and F1 of ``classifier`` on the given documents.

    Args:
        title: text printed at the start of the result row.
        classifier: fitted model exposing ``predict``.
        vectorizer: fitted vectorizer exposing ``transform``; it must be the
            one the classifier was trained with.
        X_test: raw document texts to classify.
        y_test: true labels, aligned with ``X_test``.
        average: averaging mode forwarded to the scikit-learn scorers.
            Defaults to ``'micro'``, the original behaviour. Per the
            scikit-learn documentation, micro averaging over all classes in a
            single-label multiclass problem makes precision, recall and F1
            the same number (the accuracy); pass ``'macro'`` or
            ``'weighted'`` for a per-class view. The mode must produce scalar
            scores (i.e. not ``None``) because the result is formatted with
            ``%f``.

    The output is one tab-separated row: title, precision, recall, F1.
    """
    features = vectorizer.transform(X_test)
    y_pred = classifier.predict(features)

    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    f1 = metrics.f1_score(y_test, y_pred, average=average)

    print('%s\t%f\t%f\t%f\n' % (title, precision, recall, f1))

## Model setup

In [33]:
def train_classifier(docs, evaluate=False):
    """Train a bag-of-words Naive Bayes classifier and pickle it to disk.

    The documents are split with ``get_split``. A ``CountVectorizer`` (English
    stop words removed, 1- to 3-grams, terms kept only if they occur in at
    least 3 training documents) is fitted on the training texts, and a
    ``MultinomialNB`` model is fitted on the resulting document-term matrix.

    Args:
        docs: sequence of ``(label, text)`` pairs.
        evaluate: when true, print precision/recall/F1 on the training and
            the test split via ``evaluate_classifier`` before exporting.
            Defaults to ``False`` (no evaluation output).

    Side effects:
        Writes ``naive_bayes_classifire.pkl`` (the classifier) and
        ``count_vectorize.pkl`` (the fitted vectorizer) to the current working
        directory. The file names are kept exactly as they were because other
        code may load the artefacts by these names.

    Raises:
        OSError: if one of the pickle files cannot be written.
    """
    X_train, X_test, y_train, y_test = get_split(docs)

    vectorizer = CountVectorizer(stop_words='english',
                                 ngram_range=(1, 3),
                                 min_df=3, analyzer='word')

    dtm = vectorizer.fit_transform(X_train)

    naive_bayes_classifier = MultinomialNB().fit(dtm, y_train)

    if evaluate:
        evaluate_classifier("Naive Bayes\tTRAIN\t", naive_bayes_classifier, vectorizer, X_train, y_train)
        evaluate_classifier("Naive Bayes\tTEST\t", naive_bayes_classifier, vectorizer, X_test, y_test)

    # Context managers make sure both files are flushed and closed, even if
    # pickling fails part-way through.
    clf_filename = 'naive_bayes_classifire.pkl'
    with open(clf_filename, 'wb') as clf_file:
        pickle.dump(naive_bayes_classifier, clf_file)

    vec_filename = 'count_vectorize.pkl'
    with open(vec_filename, 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)

In [34]:
if __name__ =='__main__':
    # One-off step: build data.txt from the raw files under BASE_DIR.
    # Uncomment when data.txt does not exist yet; setup_doc() below reads it.
    # create_data_set()
    
    # Load the (label, text) pairs from data.txt
    docs = setup_doc()
    
    # Optional: check the most common tokens per category in the data set
    # frequency_dist(docs)
    
    # Train the model and export the classifier and vectorizer to .pkl files
    train_classifier(docs)
    