In [1]:
import transformers
from sklearn.cluster import KMeans

from sqlalchemy import create_engine
import pandas as pd
import os


In [2]:
def get_data(dttm):
    """Load sentiment-scored reviews written on or after ``dttm``.

    Connection settings come from the ``PG_HOST``, ``PG_PORT``,
    ``PG_DATABASE``, ``PG_USER`` and ``PG_PASS`` environment variables,
    each with a local-development default.

    Parameters
    ----------
    dttm : str or datetime-like
        Inclusive lower bound applied to ``review.at``. It is sent to the
        database as a bound parameter, never spliced into the SQL text.

    Returns
    -------
    pandas.DataFrame
        The selected ``review`` / ``sentiment`` columns for every review
        that has a non-null sentiment, ordered by ``at``, plus
        ``repliedDurationHrs``: hours between ``at`` and ``repliedAt``
        rounded to two decimals (NaN when either timestamp is missing).
    """
    # Local import: only needed to escape the credentials below.
    from urllib.parse import quote

    connection_info = {
        "host": os.environ.get("PG_HOST", "localhost"),
        "port": os.environ.get("PG_PORT", 5432),
        "db": os.environ.get("PG_DATABASE", "playstore"),
        "user": os.environ.get("PG_USER", "postgres"),
        "password": os.environ.get("PG_PASS", "postgres"),
    }

    # Sql engine
    # Percent-encode user and password so characters such as "@", ":" or "/"
    # cannot be mistaken for URL delimiters.
    db_engine = create_engine(
        "postgresql+psycopg2://{}:{}@{}:{}/{}".format(
            quote(str(connection_info["user"]), safe=""),
            quote(str(connection_info["password"]), safe=""),
            connection_info["host"],
            connection_info["port"],
            connection_info["db"],
        )
    )

    # %(dttm)s is the psycopg2 named-parameter placeholder; the value is
    # supplied through ``params`` so it is escaped by the driver.
    query = """
        SELECT "review"."reviewId"
        , "review"."apps"
        , "review"."score"
        , "review"."at"
        , "review"."content"
        , "review"."repliedAt"
        , "sentiment"."clean_text"
        , "sentiment"."sentiment"
        FROM review
        LEFT JOIN sentiment
        ON ("review"."reviewId"="sentiment"."reviewId")
        WHERE "sentiment"."sentiment" is not null
        AND "review"."at" >= %(dttm)s
        ORDER BY "review"."at"
        ;
        """
    df = pd.read_sql(query, db_engine, params={"dttm": dttm})

    # Coerce both columns so an empty result or an all-null "repliedAt"
    # column (object dtype) still yields a timedelta series.
    elapsed = pd.to_datetime(df["repliedAt"]) - pd.to_datetime(df["at"])
    # total_seconds() covers the whole interval; Timedelta.seconds would
    # silently drop the days component of replies that took over 24 hours.
    df["repliedDurationHrs"] = (elapsed.dt.total_seconds() / 3600).round(2)

    return df


In [3]:
# Load every sentiment-scored review written on or after 2023-02-01.
df = get_data("2023-02-01")
# Drop missing texts before lower-casing: str.lower raises TypeError on
# None/NaN, so the null rows have to go first.
df = df["clean_text"].dropna().str.lower()
# Plain Python list of strings, the input format used by the encoders below.
text_data = df.to_list()

In [4]:
# Sanity check: number of review texts that will be embedded.
len(text_data)

9907

In [5]:
# Name of the pretrained checkpoint handed to SentenceTransformer to build
# the embedding model.
BERT_MODEL = "indolem/indobert-base-uncased"

In [6]:
from sentence_transformers import SentenceTransformer
# Wrap the BERT_MODEL checkpoint as a sentence encoder. Per the log output of
# this cell, the checkpoint is not a native sentence-transformers model, so
# the library builds one with MEAN pooling.
model = SentenceTransformer(BERT_MODEL)
# Encode every review text; the progress bar reports the batches processed.
embeddings = model.encode(text_data, show_progress_bar=True)

No sentence-transformers model found with name /home/hvzn/.cache/torch/sentence_transformers/indolem_indobert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/hvzn/.cache/torch/sentence_transformers/indolem_indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a B

Batches:   0%|          | 0/310 [00:00<?, ?it/s]

In [8]:
import umap

# Project the sentence embeddings down to 5 dimensions using cosine distance
# between the original vectors.
umap_embeddings = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    metric="cosine",
    # UMAP is stochastic; a fixed seed makes the projection (and every
    # clustering built on it) reproducible across kernel restarts.
    random_state=42,
).fit_transform(embeddings)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:

import hdbscan

# Clustering settings gathered in one place so they are easy to tweak.
hdbscan_params = {
    "min_cluster_size": 15,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
}
# Fit HDBSCAN on the UMAP-reduced embeddings; `cluster` is the fitted model.
cluster = hdbscan.HDBSCAN(**hdbscan_params).fit(umap_embeddings)

In [None]:
# Load the plain BERT encoder from the checkpoint named by BERT_MODEL, so the
# model id is defined in exactly one place.
bert_model = transformers.BertModel.from_pretrained(BERT_MODEL)

In [None]:
# Tokenize the text data
# Use the tokenizer that belongs to the BERT_MODEL checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained(BERT_MODEL)
tokenized_data = tokenizer.batch_encode_plus(
    text_data,
    return_attention_mask=False,
    return_tensors="pt",
    padding=True,
    # Cap every sequence at the number of positions the encoder supports;
    # without truncation a single long review makes the forward pass fail.
    truncation=True,
    max_length=bert_model.config.max_position_embeddings,
)


In [None]:
import torch

BATCH_SIZE = 32   # reviews per forward pass; bounds peak memory
KMEANS_SEED = 42  # fixed seed so the clustering is reproducible

input_ids = tokenized_data["input_ids"]
# 1 for real tokens, 0 for padding. Derived from the pad id because the
# tokenizer call does not return an attention mask.
attention_mask = (input_ids != tokenizer.pad_token_id).long()

bert_model.eval()
pooled_batches = []
# Inference only: no_grad avoids building the autograd graph, and batching
# avoids pushing the whole corpus through BERT in one call. The full
# per-token hidden states are therefore not kept around.
with torch.no_grad():
    for start in range(0, input_ids.shape[0], BATCH_SIZE):
        batch_ids = input_ids[start:start + BATCH_SIZE]
        batch_mask = attention_mask[start:start + BATCH_SIZE]
        hidden = bert_model(input_ids=batch_ids, attention_mask=batch_mask)[0]
        # Masked mean over the token axis (dim=1): padding positions get
        # zero weight instead of being averaged into the review vector.
        weights = batch_mask.unsqueeze(-1).to(hidden.dtype)
        pooled_batches.append(
            (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        )

sentence_embeddings = torch.cat(pooled_batches).numpy()

# NOTE(review): fit_transform returns each review's distance to the 5 cluster
# centres, not a cluster label. If labels were intended, fit_predict is the
# call to use -- confirm before relying on `topics`.
topics = KMeans(n_clusters=5, random_state=KMEANS_SEED).fit_transform(sentence_embeddings)

In [None]:
# Show the topics
# Preview only the first rows: `topics` holds one row per review, so printing
# every row would flood the notebook output.
topics[:10]