In [1]:
from obsei.source.playstore_scrapper import PlayStoreScrapperConfig, PlayStoreScrapperSource

By default `pip install obsei` will only install core dependencies.
To install all required dependencies use `pip install obsei[all]`.
Refer https://obsei.com/#install-obsei for more options.



In [20]:
# initialize play store source config
source_config = PlayStoreScrapperConfig(
   # Need two parameters package_name and country. 
   # `package_name` can be found at the end of the url of app in play store. 
   # For example - https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en&gl=US
   # `com.google.android.gm` is the package_name for xcode and `us` is country.
   countries=["in"],
   package_name="com.chess",
   max_count=10000, # Number of reviews to fetch
   # lookup_period="1h" # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
)

# initialize play store reviews retriever
source = PlayStoreScrapperSource()

In [21]:
responses = source.lookup(source_config)

In [19]:
len(responses)

100

In [12]:
responses[0].meta["content"]

'great experience'

## cluster responses

In [8]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
# from gensim.summarization import summarize

In [13]:
# Preprocess the data by tokenizing and removing stop words
texts = [[word for word in document.meta["content"].lower().split() if word.isalpha() and word not in gensim.parsing.preprocessing.STOPWORDS] for document in responses]

# Create a dictionary from the preprocessed texts
dictionary = corpora.Dictionary(texts)

# Convert the texts into a bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]

# Define the number of topics to extract
num_topics = 3

# Perform LDA topic modeling on the corpus
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

03/18/2023 16:59:29 - INFO - gensim.corpora.dictionary -   adding document #0 to Dictionary<0 unique tokens: []>
03/18/2023 16:59:29 - INFO - gensim.corpora.dictionary -   built Dictionary<244 unique tokens: ['experience', 'great', 'abandoned', 'ages', 'definitely']...> from 100 documents (total 433 corpus positions)
03/18/2023 16:59:29 - INFO - gensim.utils -   Dictionary lifecycle event {'msg': "built Dictionary<244 unique tokens: ['experience', 'great', 'abandoned', 'ages', 'definitely']...> from 100 documents (total 433 corpus positions)", 'datetime': '2023-03-18T16:59:29.065058', 'gensim': '4.3.1', 'python': '3.10.4 (main, Feb 27 2023, 16:55:46) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-x86_64-i386-64bit', 'event': 'created'}
03/18/2023 16:59:29 - INFO - gensim.models.ldamodel -   using symmetric alpha at 0.3333333333333333
03/18/2023 16:59:29 - INFO - gensim.models.ldamodel -   using symmetric eta at 0.3333333333333333
03/18/2023 16:59:29 - INFO - gensim.mode

In [14]:
# Print the topics and their top words
print("Topics and their top words:")
for idx, topic in lda_model.print_topics(num_topics=num_topics):
    print("Topic {}: {}".format(idx + 1, topic))

    # Extract the top sentence for each topic
    # top_sentence = ""
    # top_sentence_score = -1
    # for document in documents:
    #     document_score = lda_model[dictionary.doc2bow(document.lower().split())][idx][1]
    #     if document_score > top_sentence_score:
    #         top_sentence = summarize(document)
    #         top_sentence_score = document_score
    # print("Top sentence: {}\n".format(top_sentence))

03/18/2023 17:00:14 - INFO - gensim.models.ldamodel -   topic #0 (0.333): 0.036*"game" + 0.019*"frustrating" + 0.015*"chess" + 0.014*"fix" + 0.013*"network" + 0.013*"wifi" + 0.013*"stars" + 0.013*"happens" + 0.013*"losing" + 0.013*"flaw"
03/18/2023 17:00:14 - INFO - gensim.models.ldamodel -   topic #1 (0.333): 0.064*"game" + 0.061*"good" + 0.046*"chess" + 0.031*"play" + 0.031*"love" + 0.029*"app" + 0.019*"great" + 0.016*"like" + 0.016*"learn" + 0.010*"fun"
03/18/2023 17:00:14 - INFO - gensim.models.ldamodel -   topic #2 (0.333): 0.049*"best" + 0.036*"chess" + 0.028*"app" + 0.020*"membership" + 0.018*"amazing" + 0.015*"game" + 0.015*"great" + 0.014*"recommend" + 0.014*"player" + 0.014*"love"


Topics and their top words:
Topic 1: 0.036*"game" + 0.019*"frustrating" + 0.015*"chess" + 0.014*"fix" + 0.013*"network" + 0.013*"wifi" + 0.013*"stars" + 0.013*"happens" + 0.013*"losing" + 0.013*"flaw"
Topic 2: 0.064*"game" + 0.061*"good" + 0.046*"chess" + 0.031*"play" + 0.031*"love" + 0.029*"app" + 0.019*"great" + 0.016*"like" + 0.016*"learn" + 0.010*"fun"
Topic 3: 0.049*"best" + 0.036*"chess" + 0.028*"app" + 0.020*"membership" + 0.018*"amazing" + 0.015*"game" + 0.015*"great" + 0.014*"recommend" + 0.014*"player" + 0.014*"love"
