In [1]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('/home/server/gli-data-science/')
import ds_db

import numpy as np
import gensim
from multiprocessing import  Pool

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)



In [2]:
q = '''

select tbtdr_ponta_user, tbtdr_tbto_id, tbtdr_rating, tbtdr_review 
from tb_transaction_delivery_rating ttdr 
where tbtdr_review notnull and tbtdr_review != ''
and tbtdr_created_date between '2021-06-01' and '2021-07-25'
'''

con, cur = ds_db.connect_prd_order_3()
df_re = pd.read_sql(q, con)


con.close()

You are connected


In [3]:
df_re = df_re[df_re['tbtdr_rating'].isin([1,2])]

In [5]:
from nlp_id.lemmatizer import Lemmatizer 
from nlp_id.stopword import StopWord 
from nlp_id.tokenizer import Tokenizer 

import re
import nltk
import string

def basic_clean_single(text):
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    lemmatizer = Lemmatizer() 
    
    stopword = StopWord() 
    stopwords = stopword.get_stopword() 
    
    tokenizer = Tokenizer() 
    words = tokenizer.tokenize(text)
    #return [lemmatizer.lemmatize(word) for word in words if word not in stopwords]
    #return [lemmatizer.lemmatize(word) for word in words]
    return words

def basic_clean(df):
    """
    A simple function to clean up the data. All the words that
    are not designated as a stop word is then lemmatized after
    encoding and basic regex parsing are performed.
    """
    df['tbtdr_review_token'] = df['tbtdr_review'].apply(basic_clean_single)
    return df


# processed_docs = list(df_re['tbtdr_review'].apply(basic_clean))

In [6]:
def parallelize_dataframe(df, func, n_cores=32):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

processed_docs = parallelize_dataframe(df_re, basic_clean)

In [7]:
processed_docs.head()

Unnamed: 0,tbtdr_ponta_user,tbtdr_tbto_id,tbtdr_rating,tbtdr_review,tbtdr_review_token
233,9990013350229355,6103590,2.0,Pembayaran Sulit,"[Pembayaran, Sulit]"
387,9990010138638665,5550827,1.0,"pesanan ini dibatalkan karna waktu kirim tdk sesuai dg estimasi, tetapi poin saya tetap terpotong","[pesanan, ini, dibatalkan, karna, waktu, kirim, tdk, sesuai, dg, estimasi, tetapi, poin, saya, tetap, terpotong]"
391,9990013149303785,1225562,2.0,"hanya whatsapp konfirmasi pesanan tanpa pemberitahuan produk ready semua atau ga dan hanya confirm shareloc, sesampainya pesanan malah yang dibutuhkan tidak ada. salut lah kecewanya","[hanya, whatsapp, konfirmasi, pesanan, tanpa, pemberitahuan, produk, ready, semua, atau, ga, dan, hanya, confirm, shareloc, sesampainya, pesanan, malah, yang, dibutuhkan, tidak, ada, salut, lah, kecewanya]"
495,9990013226669030,7101935,2.0,Produk rusak,"[Produk, rusak]"
518,9990013357708115,7101676,1.0,"udh belanja, kok poin nya gk ada?","[udh, belanja, kok, poin, nya, gk, ada]"


In [8]:

processed_docs = list(processed_docs['tbtdr_review_token'])
dictionary = gensim.corpora.Dictionary(processed_docs)

In [25]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 7, 
                                   id2word = dictionary,                                    
                                   passes = 20,
                                   workers = 3)

  and should_run_async(code)


In [26]:
lda_model.print_topics()

  and should_run_async(code)


[(0,
  '0.029*"di" + 0.027*"tidak" + 0.019*"bisa" + 0.016*"dikirim" + 0.014*"digunakan" + 0.013*"Voucher" + 0.013*"yg" + 0.011*"produk" + 0.009*"order" + 0.008*"ada"'),
 (1,
  '0.040*"di" + 0.023*"yg" + 0.021*"kurir" + 0.015*"saya" + 0.013*"tidak" + 0.013*"antar" + 0.013*"nya" + 0.013*"barang" + 0.012*"ada" + 0.011*"dan"'),
 (2,
  '0.033*"Produk" + 0.030*"rusak" + 0.022*"tidak" + 0.015*"nya" + 0.013*"saya" + 0.008*"Kurir" + 0.007*"yang" + 0.007*"lagi" + 0.007*"di" + 0.007*"ada"'),
 (3,
  '0.015*"barang" + 0.015*"saya" + 0.014*"foto" + 0.013*"jam" + 0.009*"tdk" + 0.009*"di" + 0.008*"pesan" + 0.007*"sudah" + 0.007*"yg" + 0.007*"penerima"'),
 (4,
  '0.281*"Pengiriman" + 0.279*"Lama" + 0.014*"tidak" + 0.013*"jam" + 0.012*"Promo" + 0.012*"berlaku" + 0.006*"pagi" + 0.005*"pesan" + 0.005*"ada" + 0.004*"lama"'),
 (5,
  '0.015*"jam" + 0.014*"kurang" + 0.012*"di" + 0.012*"sudah" + 0.011*"lama" + 0.010*"dan" + 0.010*"saya" + 0.009*"ini" + 0.008*"mau" + 0.007*"tapi"'),
 (6,
  '0.083*"tidak" + 0.06

In [27]:
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()


# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(lda_model, bow_corpus, dictionary)
lda_viz

  and should_run_async(code)
