In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
import numpy as np
import pandas as pd
from copy import deepcopy
from bertopic import BERTopic
from config import ORIGINAL_EXCEL_PATH, WHY_NOT_ETHICAL_CLEAN_WORD_COUNT_COLUMN, WHY_NOT_ETHICAL_CLEAN_TEXT_COLUMN, RISK_1_COLUMN
from data_preperation import read_excel_df, fix_df

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
%%time
topic_modeling_df = read_excel_df(ORIGINAL_EXCEL_PATH)
topic_modeling_df = fix_df(topic_modeling_df)

CPU times: user 4.7 s, sys: 0 ns, total: 4.7 s
Wall time: 4.7 s


In [9]:
# Narrow the frame to the two columns this notebook works with:
# the cleaned "why not ethical" text and the Risk 1 label.
analysis_columns = [WHY_NOT_ETHICAL_CLEAN_TEXT_COLUMN, RISK_1_COLUMN]
df = topic_modeling_df.loc[:, analysis_columns]

In [28]:
# Tally how often each Risk 1 label occurs and list them most frequent first.
risk_counts = Counter(df[RISK_1_COLUMN])
risk_counts.most_common()

[('dp&p uninformed consent', 1264),
 ('onesided terms', 1172),
 ('collection of personal data', 526),
 ('other limitation of liability', 477),
 ('third party data transfers', 457),
 ('limited readability', 417),
 ('limitation of liability', 259),
 ('unfair terms', 239),
 ('storage and retention of personal data', 207),
 ('user content ownership', 187),
 ('data security', 176),
 ('uninformed consent', 159),
 ('tracking user information', 143),
 ('no refunds', 143),
 ('restricting user legal action - general', 141),
 ('dispute resolution clause', 125),
 ('limiting class action', 102),
 ('burdens associated with using the app - general', 57),
 ('international data transfers', 57),
 ('financial risks - general', 52),
 ('legally non compliant', 49),
 ('privacy risks', 46),
 ('processing user information', 44),
 ('eliminating or limiting access to purchased app assets or coins', 38),
 ('in app payments', 33),
 ('advertisements and spam', 32),
 ('severability', 25),
 ('functionality', 21),
 (

In [29]:
# Keep only rows carrying one of the two most frequent Risk 1 labels
# (1264 and 1172 rows respectively in the recorded tally).
top_two_risks = ["dp&p uninformed consent", "onesided terms"]
is_top_risk = df[RISK_1_COLUMN].isin(top_two_risks)
most_common_clusters_df = df.loc[is_top_risk]

In [30]:
# Eyeball a handful of rows from the filtered frame.
# A fixed random_state makes the same seven rows come back on every run, so
# the displayed table stays reproducible after Restart & Run All.
most_common_clusters_df.sample(7, random_state=42)

Unnamed: 0,why.not.ethical_clean,Risk 1
9893,"They can change the policy at any time, and it...",onesided terms
23242,It should be shorter than 10 days to process o...,dp&p uninformed consent
5912,Email should not be the only contact option if...,dp&p uninformed consent
31123,"Zynga does not organise opting out, it is hand...",dp&p uninformed consent
18327,I don't think it should be able to get info fr...,dp&p uninformed consent
29145,The company should make sure that the websites...,dp&p uninformed consent
7445,There is no clause that states the consequence...,dp&p uninformed consent


In [32]:
# Build two parallel lists from the filtered frame: docs[k] is the text whose
# Risk 1 label is actual_risks[k]. Both are taken in the frame's row order.
docs = most_common_clusters_df[WHY_NOT_ETHICAL_CLEAN_TEXT_COLUMN].tolist()
actual_risks = most_common_clusters_df[RISK_1_COLUMN].tolist()
docs[:3], actual_risks[:3]

(["They can suspend or terminate your account if a charge can't be processed or is unpaid. If a payment does not go through, they should take away the thing that you tried to buy, rather than suspending or deleting your account.",
  'It feels as if there is some vagueness as to the protections offered to data being transferred out of the EEA. I would also rather see who the affiliates of Moon Active are.',
  'I think that updated privacy policy changes should always require an active consent from the user.'],
 ['onesided terms', 'dp&p uninformed consent', 'onesided terms'])

In [33]:
%%time
# Fit a BERTopic model on the selected documents. fit_transform returns one
# topic id and one probability per document (see the spot check and the
# topic_output_df cell that consume them).
# The recorded run took about 4 minutes of wall time (41 minutes of CPU time).
# NOTE(review): no random seed is fixed here. BERTopic's documentation describes
# its default dimensionality-reduction step as stochastic, so topic ids and
# counts may differ between runs — confirm and pin a seed if reproducibility matters.
model = BERTopic(language="english")
topics, probs = model.fit_transform(docs)

CPU times: user 41min 5s, sys: 6.98 s, total: 41min 12s
Wall time: 4min 18s


In [38]:
# Spot-check a single document: its probability, assigned topic id, and
# the Risk 1 label it actually carries.
i = 2
tuple(values[i] for values in (probs, topics, actual_risks))

(0.6879019982476003, 7, 'onesided terms')

In [41]:
# Line up the model output with the known label, one row per document:
# assigned topic id, its probability, and the document's Risk 1 label.
topic_output_df = pd.DataFrame({"topic": topics, "prob": probs, "actual_risk": actual_risks})
# A fixed random_state keeps the displayed preview rows identical across runs.
topic_output_df.sample(10, random_state=42)

Unnamed: 0,topic,prob,actual_risk
1365,17,0.623424,onesided terms
1334,-1,0.0,dp&p uninformed consent
1512,1,0.426493,onesided terms
1881,-1,0.0,onesided terms
783,0,1.0,dp&p uninformed consent
154,-1,0.0,dp&p uninformed consent
1773,8,1.0,dp&p uninformed consent
2057,6,1.0,onesided terms
687,21,0.856051,onesided terms
1513,1,0.372805,onesided terms


In [39]:
# Bar-chart view of the fitted topics.
# NOTE: the recorded run of this cell failed with
# "ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed";
# the kernel environment needs nbformat>=4.2.0 for the figure to display inline.
model.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [43]:
# Hierarchy view restricted to the top 2 topics.
# NOTE: the recorded run of this cell failed with
# "ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed";
# the kernel environment needs nbformat>=4.2.0 for the figure to display inline.
model.visualize_hierarchy(top_n_topics=2)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [45]:
# Heatmap view of the topics, requested at 1000x1000 with n_clusters=5.
# NOTE: the recorded run of this cell failed with
# "ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed";
# the kernel environment needs nbformat>=4.2.0 for the figure to display inline.
model.visualize_heatmap(n_clusters=5, width=1000, height=1000)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [46]:
# Summary table of the fitted topics: one row per topic with its id, document
# count and generated name. In the recorded output, topic -1 holds 490 documents.
# NOTE(review): BERTopic's docs describe topic -1 as the outlier bucket — confirm.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,515,0_data_information_personal_they
1,-1,490,-1_the_to_be_they
2,1,117,1_terms_conditions_change_changes
3,2,116,2_unethical_is_data_it
4,3,95,3_app_the_third_of
5,4,91,4_rights_unethical_waive_discretion
6,5,63,5_consent_withdraw_still_even
7,6,60,6_account_terminated_reason_terminate
8,7,51,7_policy_privacy_changes_the
9,8,51,8_opt_out_optout_option


In [47]:
# Top terms and their scores for topic 0 (the largest topic in the recorded run).
# The recorded list contains many stop words ("they", "to", "and", "it", "is").
model.get_topic(0)

[('data', 0.03477487459001497),
 ('information', 0.023420267528693636),
 ('personal', 0.02123344341948135),
 ('they', 0.018496673088147404),
 ('to', 0.01814906199866913),
 ('third', 0.018031922110124072),
 ('and', 0.017147913328007986),
 ('it', 0.0169835515650802),
 ('is', 0.016804295606467217),
 ('your', 0.016595368947614805)]

In [48]:
# Top terms and their scores for topic 1; in the recorded run these centre on
# "terms", "conditions", "change" and "changes".
model.get_topic(1)

[('terms', 0.08381328566085121),
 ('conditions', 0.03669396553599796),
 ('change', 0.031096653945836945),
 ('changes', 0.02824050856538263),
 ('and', 0.027712773505304468),
 ('the', 0.0266284422901492),
 ('to', 0.024607239298443138),
 ('them', 0.023930452307466286),
 ('new', 0.023086051046159708),
 ('any', 0.022330305952817344)]

In [49]:
# Document count per topic id, shown largest first in the recorded output.
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,0,515
1,-1,490
2,1,117
3,2,116
4,3,95
5,4,91
6,5,63
7,6,60
8,7,51
9,8,51


In [50]:
# Overview plot of the fitted topics.
# NOTE: the recorded run of this cell failed with
# "ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed";
# the kernel environment needs nbformat>=4.2.0 for the figure to display inline.
model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed