In [1]:
#import packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
#scrape reviews for the facebook android app
!pip install google_play_scraper
from google_play_scraper import Sort, reviews_all

reviews = reviews_all(
    'com.facebook.katana',
    lang='en', # defaults to 'en'
    country='us', # defaults to 'us'
    sort=Sort.MOST_RELEVANT, # defaults to Sort.MOST_RELEVANT
    count=10000, # defaults to 100
    filter_score_with=None # defaults to None(means all score)
)

print(len(reviews))
reviews = pd.DataFrame(reviews)#[['userName', 'content', 'score']]
reviews.to_csv('reviews.csv')

3184


In [3]:
n_features = 5000   # vocabulary cap: passed to CountVectorizer as max_features
n_components = 50  # number of LDA topics (LatentDirichletAllocation n_components)

In [4]:
# count vectorizer
# Bag-of-words term counts: English stop words are removed, terms present in
# more than 95% of the reviews or in fewer than 2 reviews are dropped, and the
# vocabulary is capped at n_features terms.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')

# Guard against missing review text: CountVectorizer raises on NaN/None
# documents. fillna('') keeps the row count and order unchanged, so row i of
# `tf` still corresponds to row i of `reviews`.
tf = tf_vectorizer.fit_transform(reviews['content'].fillna(''))

In [5]:
# lda
# Fit the topic model on the term-count matrix using the online learning
# method; random_state is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    max_iter=25,
    random_state=0,
)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=25,
                          mean_change_tol=0.001, n_components=50, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [6]:
# topic keywords

def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Args:
        model: object exposing ``components_``, iterated row by row; each row
            must support ``argsort()`` (one weight per vocabulary term).
        feature_names: sequence mapping a term index to its string.
        n_top_words: how many terms to show per topic, strongest first.

    A blank line is printed after the last topic.
    """
    for number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        keywords = " ".join(feature_names[term] for term in strongest)
        print(f"Topic #{number} Keywords: " + keywords)
    print()

# Vocabulary terms, indexed by column of the term-count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 10)


Topic #0 Keywords: account facebook login log password code help email phone number
Topic #1 Keywords: login app account enter problem times needed free deleted 14
Topic #2 Keywords: bugs problems worst app broken update makes fixing worse unusable
Topic #3 Keywords: pw requested fb till valid yesterday great later request solution
Topic #4 Keywords: come notification garbage ones blocking birthday spend randomly box shortcuts
Topic #5 Keywords: app don facebook use like good want just people platform
Topic #6 Keywords: actions billion circles preventing bringing ship headache wth track weeks
Topic #7 Keywords: play games playing game ups confirming rooms 2months role yahoo
Topic #8 Keywords: order respond users months late gives push offer experience complete
Topic #9 Keywords: seemingly relic greed permanently checked stolen mentions shutting complicated harmless
Topic #10 Keywords: notifications page pages load view blank lots close battery created
Topic #11 Keywords: facebook socia

In [7]:
# Document-topic distribution; shape [n_samples, n_components].
score = lda.transform(tf)

# Prominence of each topic across all documents, normalized to add up to 1.
topic_probs = np.sum(score, axis=0)
topic_probs /= np.sum(topic_probs)

# documents_in_topic[t] collects (doc_index, probability) for every document
# whose most probable topic is t. Each topic gets its OWN list: `[[]] * n`
# would alias a single list n times, which previously forced a copy-and-
# reassign on every document.
documents_in_topic = [[] for _ in range(n_components)]

for doc_index, document in enumerate(score):
    dominant_topic = np.argmax(document)
    documents_in_topic[dominant_topic].append((doc_index, document[dominant_topic]))

# Order each topic's documents by descending probability (second in tuple).
# Ascending sort followed by a reverse keeps the original ordering of ties.
for members in documents_in_topic:
    members.sort(key=lambda pair: pair[1])
    members.reverse()

In [8]:
# print most prominent topics and sample documents for those topics

def print_top_words_for_topic(model, feature_names, n_top_words, topic_idx, prominence):
    """Print a three-line summary (id, prominence, keywords) for one topic.

    Args:
        model: object exposing ``components_``; row ``topic_idx`` must support
            ``argsort()`` (one weight per vocabulary term).
        feature_names: sequence mapping a term index to its string.
        n_top_words: how many terms to list, strongest first.
        topic_idx: row of ``model.components_`` to describe.
        prominence: fraction shown as a percentage with two decimals.
    """
    weights = model.components_[topic_idx]
    header = f"Topic #{topic_idx}\nProminence: {(prominence*100):.2f}%\nKeywords: "
    # argsort is ascending, so reverse it and keep the leading entries.
    strongest = weights.argsort()[::-1][:n_top_words]
    keywords = " ".join(feature_names[term] for term in strongest)
    print(header + keywords)


n_top_topics = 10          # how many of the most prominent topics to report
n_top_words = 10           # keywords shown per topic
n_samples_per_topic = 10   # sample reviews shown per topic

# Pick the n_top_topics most prominent topics by slicing the ranking. Reading
# a threshold at position n_top_topics raised IndexError when n_top_topics was
# >= the number of topics, and a strict `>` comparison could drop topics tied
# with the threshold value.
top_topic_ids = np.argsort(topic_probs)[::-1][:n_top_topics]

# Report in ascending topic-index order.
for i in sorted(int(topic_id) for topic_id in top_topic_ids):
    print_top_words_for_topic(lda, tf_feature_names, n_top_words, i, topic_probs[i])
    print("Samples:")
    # Each topic's documents are already ordered by descending probability.
    for doc_idx, prob in documents_in_topic[i][:n_samples_per_topic]:
        # doc_idx is a row position in the document-topic matrix, so look the
        # review up by position rather than by index label.
        print(f"- {reviews['content'].iloc[doc_idx]}\tProb:{(prob*100):.2f}%")
    print("\n\n")

Topic #0
Prominence: 12.62%
Keywords: account facebook login log password code help email phone number
Samples:
- I don't know how many times times i try to make account but every time i make account it got disabled without any reason. Facebook demands my email for verification but I'm not received any code . Please take a look . I want my account back 😬	Prob:95.74%
- Someone have accessed my Facebook account without my permission. I am unable to log in my account because, they changed password, mail and phone number. I tried forgot password, all the otp's are sending to their email and phone number. I cannot access my personal account tiil now. I have mailed many times about my account recovery. No reply from Facebook community. Please take any action!	Prob:93.61%
- I cannot login, Any account still cannot log, It always said login error try to log again, Hope u can fix it	Prob:91.09%
- login problem alltime allways whenever am login my facebook account ❕ tell me how to fix the proble