In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import cPickle as pickle
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline

## Stopwords

In [2]:
stopword_html = open("id.stopwords.01.01.2016.txt",'r').read()
stopwords     = stopword_html.split("\n")

## Stem Preprocessing

In [3]:
# import StemmerFactory class, github : https://github.com/masdevid/PySastrawi

def rmStem(pars):
    # stemming process
    # Removing the HTML tags, similar with php's strip_tag
    from HTMLParser import HTMLParser
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

    class MLStripper(HTMLParser):
        def __init__(self):
            self.reset()
            self.fed = []
        def handle_data(self, d):
            self.fed.append(d)
        def get_data(self):
            return ' '.join(self.fed)

    def strip_tags(html):
        s = MLStripper()
        s.feed(html)
        striped  = s.get_data()                         #get HTML-Tags free text
        lowers   = striped.lower()                      #Lowercase all words
        nospace  = lowers.strip()                       #Remove leading and trailing white space
        return nospace

    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    
    stripped= strip_tags(pars)
    clean   = stemmer.stem(str(stripped)) #Stemming
    return clean

## POS-Tag Preprocessing

In [3]:
def posTag(par):
    from HTMLParser import HTMLParser
    import string
    import re #Regex
    
    postagger = pickle.load(open("POSTAGGER.p", "rb"))
    
    def onlyAZ(teks):
        return re.sub(r'[^a-zA-Z]',' ', teks)

    def onlyNVFromSentence(teks):
        splitted = postagger.tag(onlyAZ(teks).split())
        nouns = [word for word,pos in splitted \
            if (pos == 'NN' or pos == 'NNP' or 
                pos == 'NNS' or pos == 'VB')]
        nounsSentence = string.join(nouns)
        return nounsSentence

    class MLStripper(HTMLParser):
        def __init__(self):
            self.reset()
            self.fed = []
        def handle_data(self, d):
            self.fed.append(d)
        def get_data(self):
            return ' '.join(self.fed)

    def strip_tags(html):
        s = MLStripper()
        s.feed(html)
        striped  = s.get_data()                         #get HTML-Tags free text
        lowers   = striped.lower()                      #Lowercase all words
        nospace  = lowers.strip()                       #Remove leading and trailing white space
        return nospace

    par         = strip_tags(par)
    splittedPar = par.split('.')
    NVPar    = ' '.join([onlyNVFromSentence(i) for i in splittedPar])
    return  NVPar

In [4]:
def strip_tags(html):
    from HTMLParser import HTMLParser
    import string
    import re #Regex
    
    def onlyAZ(teks):
        return re.sub(r'[^a-zA-Z]',' ', teks)
    
    class MLStripper(HTMLParser):
        def __init__(self):
            self.reset()
            self.fed = []
        def handle_data(self, d):
            self.fed.append(d)
        def get_data(self):
            return ' '.join(self.fed)
        
    s = MLStripper()
    s.feed(html)
    striped  = s.get_data()                         #get HTML-Tags free text
    onlyaz   = onlyAZ(striped)
    lowers   = onlyaz.lower()                      #Lowercase all words
    nospace  = lowers.strip()                       #Remove leading and trailing white space
    return nospace

## Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

#vectPost = TfidfVectorizer(stop_words=stopwords,preprocessor=posTag,decode_error='ignore')
vectStop = TfidfVectorizer(preprocessor=strip_tags,decode_error='ignore')
#vectStem = TfidfVectorizer(stop_words=stopwords,preprocessor=rmStem,decode_error='ignore')

## Models

In [6]:
mnb = MultinomialNB(alpha=0.1)
svc = SVC(probability=True,kernel='linear',cache_size=600)

In [13]:
import numpy as np
from sklearn import metrics
y = np.array([1, 1, 2, 2])
scores = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
fpr
tpr
thresholds

array([ 0.8 ,  0.4 ,  0.35,  0.1 ])

## Deploy by Pickle

In [7]:
#Dumping model Judul-(POSTAG+Stopwords)-MNB
import numpy as np

X_1 = pickle.load( open( "v1.4\DATA_JUDUL-v4a.p", "rb" ) )
X_2 = pickle.load( open( "v1.4\DATA_SHORT-v4a.p", "rb" ) )
X_ = X_1+' '+X_2
y_= pickle.load( open( "v1.4\DATA_TARGET-v4a.p", "rb" ) )

factor = 100.0/100
size   = int(factor*y_[y_.values == -1].count())

xP = X_[y_[y_.values == 1].index]
xN = X_[y_[y_.values == -1].index][:size]
X  = np.append(xP,xN)

yP = y_[y_.values == 1]
yN = y_[y_.values == -1][:size]
y  = np.append(yP,yN)
modlDeploy = mnb

vectDeploy = vectStop
pipeDeploy = Pipeline([('vect', vectDeploy),('clf', modlDeploy)])

modelReady = pipeDeploy.fit(X, y)
joblib.dump(modelReady, 'web-service/model3.pkl')

['web-service/model3.pkl',
 'web-service/model3.pkl_01.npy',
 'web-service/model3.pkl_02.npy',
 'web-service/model3.pkl_03.npy',
 'web-service/model3.pkl_04.npy',
 'web-service/model3.pkl_05.npy',
 'web-service/model3.pkl_06.npy',
 'web-service/model3.pkl_07.npy']

## Testing

In [10]:
from sklearn.externals import joblib
clf = joblib.load('web-service/model3.pkl')
# clf = modelReady
testing = ['Indosat Laba akan akusisisi oleh Qatar Investment',
           'Harga sawah di daerah Jakarta menurun     ']
for test in testing : print test,' : ',clf.predict([test])[0]
for test in testing : print strip_tags(test),' : ',clf.predict([test])[0]

Indosat Laba akan akusisisi oleh Qatar Investment  :  -1.0
Harga sawah di daerah Jakarta menurun       :  -1.0
indosat laba akan akusisisi oleh qatar investment  :  -1.0
harga sawah di daerah jakarta menurun  :  -1.0


In [None]:
clf.steps[0][1].preprocessor('Indosat akan diakusisisi oleh Qatar Investment <i>asdasd</i>')

In [10]:
with open('web-service/sample1.json') as data_file:    
    dataJson = json.load(data_file)

clf = joblib.load('web-service/model.pkl')
ids     = [z['id'] for z in dataJson['text']]
X_train = [z['text'] for z in dataJson['text']]
y_preds = clf.predict(X_train).tolist()
resp = [{'id':ids[i],'flag':y_preds[i]*2} for i in range(0,len(ids))]
type(y_preds[1])

NameError: name 'json' is not defined