In [56]:
# Firstly, please note that the performance of google word2vec is better on big datasets. 
# In this example we are considering only 25000 training examples from the imdb dataset.
# Therefore, the performance is similar to the "bag of words" model.

# Importing libraries
import numpy as np
import pandas as pd
# BeautifulSoup is used to remove html tags from the text
from bs4 import BeautifulSoup 
import re # For regular expressions

# Stopwords can be useful to undersand the semantics of the sentence.
# Therefore stopwords are not removed while creating the word2vec model.
# But they will be removed  while averaging feature vectors.
from nltk.corpus import stopwords
import gensim
from gensim import corpora, models, similarities

import os

In [2]:
negatif_1 = pd.read_json("./xpanas.json")['title'].values.tolist()
negatif_2 = pd.read_json("./vibokep.json")['title'].values.tolist()
negatif_3 = pd.read_json("./bokep2017.json")['title'].values.tolist()
negatif_4 = pd.read_json("./aora.json")['title'].values.tolist()
negatif_5 = pd.read_json("./bbindo.json")['title'].values.tolist()
positif_paragraf = pd.read_json("./onclinic.json")['content'].values.tolist()

In [3]:
negatif = negatif_1 + negatif_2 + negatif_3 + negatif_4 + negatif_5

In [4]:
len(negatif)

1572

In [5]:
positif = []
for x in positif_paragraf:
    for y in x.split('.'):
        for z in y.split('.'):
            positif.append(z)

In [6]:
positif[123]

' Semua pelayanan mengacu kepada standar pelayanan ON CLINIC INTERNATIONAL yang berpusat di Australia'

In [7]:
positif = [x.lower() for x in positif]
negatif = [x.lower() for x in negatif]

In [8]:
negatif[123]

'pelajar ml di dalam bus sekolah'

In [9]:
len(positif)

634

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
pos_train, pos_test = train_test_split(positif, test_size=0.2)
neg_train, neg_test = train_test_split(negatif, test_size=0.2)

In [12]:
len(pos_train)

507

In [13]:
len(pos_test)

127

In [14]:
pos_label = np.ones(len(pos_train),dtype=int)
neg_label = np.zeros(len(neg_train),dtype=int)

In [15]:
df_pos = pd.DataFrame({'review':pos_train, 'sentiment':pos_label})
df_neg = pd.DataFrame({'review':neg_train, 'sentiment':neg_label})

In [16]:
frames = [df_pos, df_neg]
df_train = pd.concat(frames)
df_test = pd.DataFrame({'review':pos_test + neg_test})

In [17]:
df_train

Unnamed: 0,review,sentiment
0,"atas dukungan isteri, robi pun akhirnya menda...",1
1,dengan 15 klinik yang tersebar di banyak kota...,1
2,"\ns/ skrg ibu minum terus racikan tsb, bahkan...",1
3,\n berkurangnya rasa cinta merupakan hal yang...,1
4,untuk mempermudah menuju solusi mengatasi eja...,1
5,"tetapi jika anda mengalami sakit punggung, ya...",1
6,"faktanya, banyak sekali hubungan yang gagal k...",1
7,8,1
8,id/explore-sex/artikel/kamu-dia/mempertahankan...,1
9,jarang ditemukan adanya inkontinensia desakan...,1


In [18]:
df_test

Unnamed: 0,review
0,jika terjadinya sekali-dua kali saja mungkin ...
1,hal ini bisa dikaitkan dengan meningkatnya im...
2,\n* sex : terhindar dari resiko atrofi (penge...
3,"namun, jumlah rata-rata itu lebih besar dua k..."
4,\n* sex : obat penenang yang paling aman di d...
5,"itu sebabnya, jika anda mengalami masalah gan..."
6,"so, stop ya penggunaan obat kuat, beralihlah ..."
7,tetapi jika terlalu sering dan terus dibiarka...
8,impiannya tentang kebahagiaan justru nyaris t...
9,


In [19]:
# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    # 1. Removing html tags
    review_text = BeautifulSoup(review).get_text()
    # 2. Removing non-letter.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    
    return(words)

In [20]:
# word2vec expects a list of lists.
# Using punkt tokenizer for better splitting of a paragraph into sentences.

import nltk.data
#nltk.download('popular')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [21]:
# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    # 1. Using nltk tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    # 2. Loop for each sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))

    # This returns the list of lists
    return sentences

In [22]:
sentences = []
print("Parsing sentences from training set")
for review in df_train["review"]:
    sentences += review_sentences(review, tokenizer)

Parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [23]:
sentences

[['atas',
  'dukungan',
  'isteri',
  'robi',
  'pun',
  'akhirnya',
  'mendatangi',
  'klinik',
  'yang',
  'dipercaya',
  'memiliki',
  'tingkat',
  'keberhasilan',
  'yang',
  'sangat',
  'tinggi',
  'dalam',
  'mengobati',
  'hingga',
  'tuntas',
  'ini'],
 ['dengan',
  'klinik',
  'yang',
  'tersebar',
  'di',
  'banyak',
  'kota',
  'di',
  'indonesia',
  'on',
  'clinic',
  'dipercaya',
  'memiliki',
  'tingkat',
  'keberhasilan',
  'yang',
  'sangat',
  'tinggi',
  'dalam',
  'mengobati',
  'ejakulasi',
  'dini',
  'hingga',
  'tuntas'],
 ['s',
  'skrg',
  'ibu',
  'minum',
  'terus',
  'racikan',
  'tsb',
  'bahkan',
  'setiap',
  'pagi',
  'ibu',
  'sekarang',
  'th',
  'lakukan',
  'treadmill',
  'yg',
  'injakan',
  'kaki',
  'naik',
  'turun',
  'itu',
  'rutin',
  's',
  'd',
  'sekarang'],
 ['berkurangnya',
  'rasa',
  'cinta',
  'merupakan',
  'hal',
  'yang',
  'sering',
  'terjadi',
  'terutama',
  'pada',
  'pasangan',
  'yang',
  'sudah',
  'menjalin',
  'hubungan',

In [24]:
# Importing the built-in logging module
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [25]:
# Creating the model and setting values for the various parameters
num_features = 300  # Word vector dimensionality
min_word_count = 5 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

# Initializing the train model
from gensim.models import word2vec
print("Training model....")
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model.init_sims(replace=True)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "bondowoso"
model.save(model_name)

2018-07-04 20:23:29,402 : INFO : collecting all words and their counts
2018-07-04 20:23:29,403 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-04 20:23:29,409 : INFO : collected 3013 word types from a corpus of 14335 raw words and 1782 sentences
2018-07-04 20:23:29,409 : INFO : Loading a fresh vocabulary
2018-07-04 20:23:29,413 : INFO : min_count=5 retains 588 unique words (19% of original 3013, drops 2425)
2018-07-04 20:23:29,414 : INFO : min_count=5 leaves 10499 word corpus (73% of original 14335, drops 3836)
2018-07-04 20:23:29,417 : INFO : deleting the raw counts dictionary of 3013 items
2018-07-04 20:23:29,418 : INFO : sample=0.001 downsamples 66 most-common words
2018-07-04 20:23:29,418 : INFO : downsampling leaves estimated 7775 word corpus (74.1% of prior 10499)
2018-07-04 20:23:29,420 : INFO : estimated required memory for 588 words and 300 dimensions: 1705200 bytes
2018-07-04 20:23:29,421 : INFO : resetting layer weights
2018-07-04 20:23:29,

Training model....


2018-07-04 20:23:29,619 : INFO : saved bondowoso


In [26]:
# Few tests: This will print the odd word among them 
model.wv.doesnt_match("aku ngentot anda sex".split())



'ngentot'

In [27]:
# This will print the most similar words present in the model
model.wv.most_similar("crot")

[('di', 0.9991635680198669),
 ('pria', 0.9991549253463745),
 ('akan', 0.9991517066955566),
 ('untuk', 0.9991499781608582),
 ('dapat', 0.9991472959518433),
 ('seks', 0.999146580696106),
 ('tidak', 0.999146580696106),
 ('yang', 0.9991453886032104),
 ('anda', 0.9991423487663269),
 ('dan', 0.9991397261619568)]

In [28]:
#This will give the total number of words in the vocabolary created from this dataset
model.wv.syn0.shape

  


(588, 300)

In [29]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [30]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%10 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [31]:
# Calculating average feature vector for training set
clean_train_reviews = []
for review in df_train['review']:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=True))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 0 of 1764
Review 10 of 1764
Review 20 of 1764
Review 30 of 1764
Review 40 of 1764
Review 50 of 1764
Review 60 of 1764
Review 70 of 1764
Review 80 of 1764
Review 90 of 1764
Review 100 of 1764
Review 110 of 1764
Review 120 of 1764
Review 130 of 1764
Review 140 of 1764
Review 150 of 1764
Review 160 of 1764
Review 170 of 1764
Review 180 of 1764
Review 190 of 1764
Review 200 of 1764
Review 210 of 1764
Review 220 of 1764
Review 230 of 1764
Review 240 of 1764
Review 250 of 1764
Review 260 of 1764
Review 270 of 1764
Review 280 of 1764
Review 290 of 1764
Review 300 of 1764
Review 310 of 1764
Review 320 of 1764
Review 330 of 1764
Review 340 of 1764
Review 350 of 1764
Review 360 of 1764
Review 370 of 1764
Review 380 of 1764
Review 390 of 1764
Review 400 of 1764
Review 410 of 1764
Review 420 of 1764
Review 430 of 1764
Review 440 of 1764
Review 450 of 1764
Review 460 of 1764
Review 470 of 1764
Review 480 of 1764
Review 490 of 1764
Review 500 of 1764
Review 510 of 1764
Review 520 of 1764
Revi

  del sys.path[0]
  app.launch_new_instance()


Review 1710 of 1764
Review 1720 of 1764
Review 1730 of 1764
Review 1740 of 1764
Review 1750 of 1764
Review 1760 of 1764


In [32]:
trainDataVecs

array([[-0.04844277,  0.00123369,  0.02288424, ...,  0.08489322,
        -0.0349283 , -0.03653152],
       [-0.04801343,  0.00170678,  0.02287277, ...,  0.08455166,
        -0.03478274, -0.03667087],
       [-0.04950338,  0.00100669,  0.02254903, ...,  0.08412886,
        -0.03488251, -0.03707476],
       ...,
       [-0.0466266 ,  0.00098727,  0.02253562, ...,  0.08436478,
        -0.03520383, -0.03676179],
       [-0.04754343,  0.00199392,  0.01849172, ...,  0.08526554,
        -0.03602573, -0.03447072],
       [-0.04962426,  0.00355955,  0.02188067, ...,  0.08356854,
        -0.03416751, -0.03793616]], dtype=float32)

In [33]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in df_test["review"]:
    clean_test_reviews.append(review_wordlist(review,remove_stopwords=True))
    
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 0 of 442
Review 10 of 442
Review 20 of 442
Review 30 of 442
Review 40 of 442
Review 50 of 442
Review 60 of 442
Review 70 of 442
Review 80 of 442
Review 90 of 442
Review 100 of 442
Review 110 of 442
Review 120 of 442
Review 130 of 442
Review 140 of 442
Review 150 of 442
Review 160 of 442
Review 170 of 442
Review 180 of 442
Review 190 of 442
Review 200 of 442
Review 210 of 442
Review 220 of 442
Review 230 of 442
Review 240 of 442
Review 250 of 442
Review 260 of 442
Review 270 of 442
Review 280 of 442
Review 290 of 442
Review 300 of 442
Review 310 of 442
Review 320 of 442
Review 330 of 442
Review 340 of 442
Review 350 of 442
Review 360 of 442
Review 370 of 442
Review 380 of 442
Review 390 of 442
Review 400 of 442
Review 410 of 442
Review 420 of 442
Review 430 of 442
Review 440 of 442


  del sys.path[0]
  app.launch_new_instance()


In [34]:
testDataVecs

array([[-0.04900429,  0.00196793,  0.02295567, ...,  0.08400487,
        -0.03508969, -0.0368119 ],
       [-0.0485411 ,  0.00187539,  0.02306277, ...,  0.08453841,
        -0.03500401, -0.03674699],
       [-0.04919091,  0.00193121,  0.02263741, ...,  0.08557057,
        -0.03425348, -0.03715811],
       ...,
       [-0.04682363,  0.00333181,  0.02246306, ...,  0.08383805,
        -0.03568291, -0.0352934 ],
       [-0.0465318 ,  0.00138538,  0.02126272, ...,  0.0827901 ,
        -0.03637518, -0.03743576],
       [-0.04847575,  0.00130077,  0.02321327, ...,  0.08484703,
        -0.03638764, -0.0365868 ]], dtype=float32)

In [35]:
# df_train = df_train.fillna(0)

In [36]:
col_mean = np.nanmean(trainDataVecs, axis=0)
inds = np.where(np.isnan(trainDataVecs))
trainDataVecs[inds] = np.take(col_mean, inds[1])

In [37]:
# df_train

In [38]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
print("Fitting random forest to training data....")    
forest = forest.fit(trainDataVecs, df_train["sentiment"])
# trainDataVecs

Fitting random forest to training data....


In [39]:
col_mean = np.nanmean(testDataVecs, axis=0)
inds = np.where(np.isnan(testDataVecs))
testDataVecs[inds] = np.take(col_mean, inds[1])

In [40]:
# df_test

In [41]:
result = forest.predict(testDataVecs)
result_str = []
for x in result:
    if x == 1:
        result_str.append("Positif")
    else :
        result_str.append("Negatif")

In [42]:
result_str

['Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Negatif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Negatif',
 'Negatif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Negatif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Negatif',
 'Positif',
 'Positif',
 'Negatif',
 'Negatif',
 'Negatif',
 'Positif',
 'Positif',
 'Negatif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Negatif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Negatif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Positif',
 'Ne

In [43]:
output = pd.DataFrame(data={"analyst":df_test["review"], "sentiment":result_str})

In [44]:
output

Unnamed: 0,analyst,sentiment
0,jika terjadinya sekali-dua kali saja mungkin ...,Positif
1,hal ini bisa dikaitkan dengan meningkatnya im...,Positif
2,\n* sex : terhindar dari resiko atrofi (penge...,Positif
3,"namun, jumlah rata-rata itu lebih besar dua k...",Positif
4,\n* sex : obat penenang yang paling aman di d...,Positif
5,"itu sebabnya, jika anda mengalami masalah gan...",Positif
6,"so, stop ya penggunaan obat kuat, beralihlah ...",Positif
7,tetapi jika terlalu sering dan terus dibiarka...,Positif
8,impiannya tentang kebahagiaan justru nyaris t...,Positif
9,,Positif


In [45]:
# model for all model visualitation
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [46]:
def visualize(model, output_path):
    meta_file = "bdg_bdws.tsv"
    placeholder = np.zeros((len(model.wv.index2word), 32))

    with open(os.path.join(output_path,meta_file), 'wb') as file_metadata:
        for i, word in enumerate(model.wv.index2word):
            placeholder[i] = model[word]
            # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094
            if word == '':
                print("Emply Line, should replecaed by any thing else, or will cause a bug of tensorboard")
                file_metadata.write("{0}".format('<Empty Line>').encode('utf-8') + b'\n')
            else:
                file_metadata.write("{0}".format(word).encode('utf-8') + b'\n')

    # define the model without training
    sess = tf.InteractiveSession()

    embedding = tf.Variable(placeholder, trainable = False, name = 'bdg_bdws')
    tf.global_variables_initializer().run()

    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(output_path, sess.graph)

    # adding into projector
    config = projector.ProjectorConfig()
    embed = config.embeddings.add()
    embed.tensor_name = 'bdg_bdws'
    embed.metadata_path = meta_file

    # Specify the width and height of a single thumbnail.
    projector.visualize_embeddings(writer, config)
    saver.save(sess, os.path.join(output_path,'bdg_bdws.ckpt'))
    print('Run `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path))

In [47]:
corpus = negatif + positif

In [48]:
tok_corp = [nltk.word_tokenize(sent) for sent in corpus]

In [51]:
model_viz = gensim.models.Word2Vec(tok_corp,min_count=1,size=32)

2018-07-04 20:24:14,679 : INFO : collecting all words and their counts
2018-07-04 20:24:14,682 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-04 20:24:14,691 : INFO : collected 3566 word types from a corpus of 19189 raw words and 2206 sentences
2018-07-04 20:24:14,692 : INFO : Loading a fresh vocabulary
2018-07-04 20:24:14,700 : INFO : min_count=1 retains 3566 unique words (100% of original 3566, drops 0)
2018-07-04 20:24:14,701 : INFO : min_count=1 leaves 19189 word corpus (100% of original 19189, drops 0)
2018-07-04 20:24:14,713 : INFO : deleting the raw counts dictionary of 3566 items
2018-07-04 20:24:14,715 : INFO : sample=0.001 downsamples 48 most-common words
2018-07-04 20:24:14,716 : INFO : downsampling leaves estimated 16012 word corpus (83.4% of prior 19189)
2018-07-04 20:24:14,739 : INFO : estimated required memory for 3566 words and 32 dimensions: 2695896 bytes
2018-07-04 20:24:14,740 : INFO : resetting layer weights
2018-07-04 20:24:14,79

In [52]:
# model_viz.most_similar('ngentot')

  """Entry point for launching an IPython kernel.
2018-07-04 20:24:21,307 : INFO : precomputing L2-norms of word weight vectors


[(',', 0.9990885853767395),
 ('yang', 0.9988171458244324),
 ('di', 0.9984101057052612),
 ('dan', 0.9983388781547546),
 ('akan', 0.998320460319519),
 ('dengan', 0.998254120349884),
 ('dari', 0.9981817007064819),
 ('bahwa', 0.9979394674301147),
 ('untuk', 0.9979209899902344),
 ('bisa', 0.9978971481323242)]

In [57]:
visualize(model_viz,"./")

  import sys


Run `tensorboard --logdir=./` to run visualize result on tensorboard
