In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import cPickle as pickle
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline

## Stopwords

In [3]:
# Read the stop-word file (one entry per line). The context manager closes the
# handle deterministically instead of leaving that to garbage collection.
with open("id.stopwords.01.01.2016.txt", 'r') as stopword_file:
    stopword_html = stopword_file.read()

# splitlines() copes with both "\n" and "\r\n" endings; blank lines (including
# the one produced by a trailing newline) are dropped so the list never
# contains an empty-string "stop word".
stopwords = [line.strip() for line in stopword_html.splitlines() if line.strip()]

## Stem Preprocessing

In [4]:
# Stemming uses Sastrawi's StemmerFactory (imported inside rmStem); project page: https://github.com/masdevid/PySastrawi

def rmStem(pars):
    """Strip HTML tags from ``pars``, lowercase it and stem it with Sastrawi.

    Parameters
    ----------
    pars : str or unicode
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str or unicode
        Whatever ``stemmer.stem`` returns for the text content of ``pars``
        (tags removed, lowercased, leading/trailing whitespace trimmed).

    Notes
    -----
    The stemmer is created on the first call and cached on the function
    object (``rmStem._stemmer``). This function is used as a
    ``TfidfVectorizer`` preprocessor, so it runs once per document and
    rebuilding the stemmer every time is wasted work.
    """
    try:
        from HTMLParser import HTMLParser       # Python 2 module name
    except ImportError:
        from html.parser import HTMLParser      # Python 3 module name
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

    class MLStripper(HTMLParser):
        """HTML parser that keeps only text nodes (like PHP's strip_tags)."""
        def __init__(self):
            # Call the base initialiser instead of only reset(): Python 3's
            # parser sets up required state there; on Python 2 it is harmless.
            HTMLParser.__init__(self)
            self.fed = []
        def handle_data(self, d):
            self.fed.append(d)
        def get_data(self):
            return ' '.join(self.fed)

    def strip_tags(html):
        """Return the tag-free text of ``html``, lowercased and trimmed."""
        s = MLStripper()
        s.feed(html)
        return s.get_data().lower().strip()

    # Build the stemmer once and reuse it on later calls.
    stemmer = getattr(rmStem, '_stemmer', None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        rmStem._stemmer = stemmer

    stripped = strip_tags(pars)
    try:
        # Same conversion as before for everything that used to work.
        text = str(stripped)
    except UnicodeEncodeError:
        # Python 2: non-ASCII unicode cannot be str()'d; stem it as-is
        # instead of crashing on the first accented character.
        text = stripped
    return stemmer.stem(text)

## POS-Tag Preprocessing

In [5]:
def posTag(par):
    """Strip HTML from ``par`` and keep only the words with selected POS tags.

    The text is stripped of tags, lowercased, trimmed and split on ``'.'``;
    each fragment is whitespace-tokenised and tagged, and only tokens tagged
    ``NN``, ``NNP``, ``NNS``, ``NNPS`` or ``VB`` are kept.

    Parameters
    ----------
    par : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        The kept words of every fragment, joined with single spaces
        (fragments without kept words contribute an empty string).

    Notes
    -----
    The tagger is unpickled from ``POSTAGGER.p`` on the first call and cached
    on the function object (``posTag._postagger``). This function is used as
    a ``TfidfVectorizer`` preprocessor, so it runs once per document;
    re-reading the pickle every time was wasted work and leaked a file handle
    per call.
    """
    try:
        from HTMLParser import HTMLParser       # Python 2 module name
    except ImportError:
        from html.parser import HTMLParser      # Python 3 module name

    # NOTE(security): unpickling can execute arbitrary code; POSTAGGER.p must
    # come from a trusted source.
    postagger = getattr(posTag, '_postagger', None)
    if postagger is None:
        with open("POSTAGGER.p", "rb") as tagger_file:
            postagger = pickle.load(tagger_file)
        posTag._postagger = postagger

    # Despite the helper's name, plain verbs ('VB') are kept alongside nouns.
    keptTags = ('NN', 'NNP', 'NNS', 'NNPS', 'VB')

    def onlyNounsFromSentence(teks):
        """Return the words of ``teks`` whose tag is in ``keptTags``."""
        tokens = teks.split()
        if not tokens:
            # Empty fragment (e.g. after a trailing '.'): nothing to tag.
            return ''
        return ' '.join(word for word, pos in postagger.tag(tokens)
                        if pos in keptTags)

    class MLStripper(HTMLParser):
        """HTML parser that keeps only text nodes."""
        def __init__(self):
            # Call the base initialiser instead of only reset(): Python 3's
            # parser sets up required state there; on Python 2 it is harmless.
            HTMLParser.__init__(self)
            self.fed = []
        def handle_data(self, d):
            self.fed.append(d)
        def get_data(self):
            return ' '.join(self.fed)

    def strip_tags(html):
        """Return the tag-free text of ``html``, lowercased and trimmed."""
        s = MLStripper()
        s.feed(html)
        return s.get_data().lower().strip()

    par         = strip_tags(par)
    splittedPar = par.split('.')
    nounsPar    = ' '.join([onlyNounsFromSentence(i) for i in splittedPar])
    return  nounsPar

## Vectorization

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two TF-IDF vectorizers that differ only in the preprocessing callable:
#   vectPost -> posTag (keeps words with selected POS tags)
#   vectStem -> rmStem (stems the text)
# Both share the stop-word list and ignore undecodable bytes.
vectPost = TfidfVectorizer(
    preprocessor=posTag,
    stop_words=stopwords,
    decode_error='ignore',
)
vectStem = TfidfVectorizer(
    preprocessor=rmStem,
    stop_words=stopwords,
    decode_error='ignore',
)

## Models

In [7]:
# Multinomial naive Bayes with smoothing parameter alpha=0.1.
mnb = MultinomialNB(alpha=0.1)

# Linear-kernel SVM with balanced class weights and probability estimates.
svc = SVC(
    kernel='linear',
    class_weight='balanced',
    probability=True,
    cache_size=600,
)

# Alternative classifiers, currently disabled:
#knn = KNeighborsClassifier()
#tre = DecisionTreeClassifier(class_weight='balanced')

## Deploy by Pickle

In [8]:
#Dumping model Short-Pos-MNB

# NOTE(security): pickle.load can execute arbitrary code; only load files
# from a trusted source.
# Forward slashes work on Windows and POSIX alike (and match the
# 'web-service/...' path below); the previous backslash paths only resolved
# on Windows. The context managers close the files after loading.
with open("v1.4/DATA_SHORT.p", "rb") as data_file:
    dataDeploy = pickle.load(data_file)
with open("v1.4/DATA_TARGET.p", "rb") as target_file:
    ydataDeploy = pickle.load(target_file)
modlDeploy = mnb

# POS-tag TF-IDF features feeding the multinomial naive Bayes model.
vectDeploy = vectPost
pipeDeploy = Pipeline([('vect', vectDeploy),('clf', modlDeploy)])

# Fit on the full data set and persist the fitted pipeline.
modelReady = pipeDeploy.fit(dataDeploy, ydataDeploy)
joblib.dump(modelReady, 'web-service/model.pkl')

['web-service/model.pkl',
 'web-service/model.pkl_01.npy',
 'web-service/model.pkl_02.npy',
 'web-service/model.pkl_03.npy',
 'web-service/model.pkl_04.npy',
 'web-service/model.pkl_05.npy',
 'web-service/model.pkl_06.npy',
 'web-service/model.pkl_07.npy']

## Testing

In [9]:
from sklearn.externals import joblib
clf = joblib.load('web-service/model.pkl')
testing = ['Indosat akan diakusisisi oleh Qatar Investment',
           'Harga sawah di daerah Jakarta menurun     ']
for test in testing : print test,' : ',clf.predict([test])[0]

Indosat akan diakusisisi oleh Qatar Investment  :  1.0
Harga sawah di daerah Jakarta menurun       :  -1.0
