In [1]:
import pandas as pd
import tldextract
import numpy as np
pd.options.mode.chained_assignment = None

Read in the original dataset of 19K websites, as well as the 11K websites we retrieved product pages from.

In [2]:
# Load the full shopping-site list and the subset that was actually crawled.
web19k = pd.read_csv('../../data/final-list/shopping-english.csv')
web11k = pd.read_csv('../../data/final-list/shopping-english-11K.csv')

# Reduce every crawled URL to '<domain>.<suffix>'. The named fields of the
# tldextract result are used instead of a positional slice ([1:]) so the code
# does not depend on how many fields the result carries or on their order.
web11k['domain'] = web11k['url'].map(
    lambda url: '{0.domain}.{0.suffix}'.format(tldextract.extract(url.strip()))
)

Filter the original 19K dataset down to the 11K websites we retrieved product pages from; call the result "data".

In [3]:
# Keep only the web19k rows whose URL is one of the crawled domains. The
# explicit copy makes `data` an independent frame, so the columns assigned to
# it further down are not written to a slice of web19k.
data = web19k[web19k['url'].isin(web11k['domain'])].copy()

In [4]:
# (rows, columns) of the filtered website table.
data.shape

(11266, 8)

Next, read in the Dark Patterns:

In [5]:
# Load the dark patterns table from CSV.
dps = pd.read_csv('../../data/final-dark-patterns/dark-patterns.csv')

In [6]:
# (rows, columns) of the dark patterns table.
dps.shape

(1818, 7)

In [7]:
# List the column names available in the dark patterns table.
dps.columns

Index([u'Pattern String', u'Comment', u'Pattern Category', u'Pattern Type',
       u'Where in website?', u'Deceptive?', u'Website Page'],
      dtype='object')

Extract the domain from each product page URL (the "Website Page" column).

In [8]:
# Reduce every 'Website Page' URL to '<domain>.<suffix>'. The named fields of
# the tldextract result are used instead of a positional slice ([1:]) so the
# code does not depend on how many fields the result carries or on their order.
dps['domain'] = dps['Website Page'].map(
    lambda page: '{0.domain}.{0.suffix}'.format(tldextract.extract(page.strip()))
)

In [9]:
# Number of distinct domains on which a dark pattern was recorded.
dps['domain'].nunique()

1254

Group by "Pattern Category" and "Pattern Type" and examine all the combinations.

In [10]:
# Both selected columns are used as group keys, so count() has no value
# columns left to aggregate; reset_index() then turns the group keys back into
# ordinary columns, giving one row per (Pattern Category, Pattern Type) pair.
dps[['Pattern Category', 'Pattern Type']].groupby(['Pattern Category', 'Pattern Type']).count().reset_index()

Unnamed: 0,Pattern Category,Pattern Type
0,Forced Action,Forced Enrollment
1,Misdirection,Confirmshaming
2,Misdirection,Pressured Selling
3,Misdirection,Trick Questions
4,Misdirection,Visual Interference
5,Obstruction,Hard to Cancel
6,Scarcity,High-demand Message
7,Scarcity,Low-stock Message
8,Sneaking,Hidden Costs
9,Sneaking,Hidden Subscription


Looks clean. Next, count the pattern instances and the distinct domains for each Pattern Category.

In [11]:
# Per Pattern Category: number of rows ('count') and number of distinct
# domains ('nunique'). The dict passed to agg already restricts the result to
# the 'domain' column, so no column pre-selection is needed.
dps.groupby('Pattern Category').agg({'domain': ['count', 'nunique']})

Unnamed: 0_level_0,domain,domain
Unnamed: 0_level_1,count,nunique
Pattern Category,Unnamed: 1_level_2,Unnamed: 2_level_2
Forced Action,6,6
Misdirection,270,244
Obstruction,31,31
Scarcity,679,609
Sneaking,26,23
Social Proof,325,275
Urgency,481,437


Next, do the same for each Pattern Type.

In [12]:
# Per Pattern Type: number of rows ('count') and number of distinct domains
# ('nunique'). The dict passed to agg already restricts the result to the
# 'domain' column, so no column pre-selection is needed.
dps.groupby('Pattern Type').agg({'domain': ['count', 'nunique']})

Unnamed: 0_level_0,domain,domain
Unnamed: 0_level_1,count,nunique
Pattern Type,Unnamed: 1_level_2,Unnamed: 2_level_2
Activity Notification,313,264
Confirmshaming,169,164
Countdown Timer,393,361
Forced Enrollment,6,6
Hard to Cancel,31,31
Hidden Costs,5,5
Hidden Subscription,14,13
High-demand Message,47,43
Limited-time Message,88,84
Low-stock Message,632,581


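Add two columns to the website data: "dp" indicates whether any dark pattern was found on that website, and "deceptive" indicates whether any of those patterns was marked deceptive ("Yes" or "Depends").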

In [13]:
# Domains with at least one recorded dark pattern, and the subset of domains
# with at least one pattern labelled deceptive ('Yes' or 'Depends').
dpdomains = set(dps['domain'])
deceptivedomains = set(dps.loc[dps['Deceptive?'].isin(['Depends', 'Yes']), 'domain'])

# Flag each website by membership in those sets. Series.isin performs the
# lookup in one vectorised call instead of invoking a Python lambda per row.
data['dp'] = data['url'].isin(dpdomains)
data['deceptive'] = data['url'].isin(deceptivedomains)

In [14]:
# Number of websites with (True) and without (False) a recorded dark pattern.
data.dp.value_counts()

False    10017
True      1249
Name: dp, dtype: int64

In [15]:
# Persist the website table, including the 'dp' and 'deceptive' flag columns,
# without writing the row index.
data.to_csv('../../data/final-dark-patterns/dark-patterns-ranked.csv', index=False)

Bin the ranks for prevalence and plotting

In [16]:
# 41 evenly spaced edges from 0 to 400000, i.e. 40 rank bins of width 10000.
bins = np.linspace(0, 400000, 41)
bins

array([     0.,  10000.,  20000.,  30000.,  40000.,  50000.,  60000.,
        70000.,  80000.,  90000., 100000., 110000., 120000., 130000.,
       140000., 150000., 160000., 170000., 180000., 190000., 200000.,
       210000., 220000., 230000., 240000., 250000., 260000., 270000.,
       280000., 290000., 300000., 310000., 320000., 330000., 340000.,
       350000., 360000., 370000., 380000., 390000., 400000.])

In [17]:
# Label every website with the rank interval its global_rank falls into.
# pd.cut builds right-closed intervals by default, and ranks outside the bin
# edges are left as missing.
data['binned_rank'] = pd.cut(data['global_rank'], bins=bins)

In [18]:
def _percent_with_dp(group):
    """Return, per column, 100 * (non-null count among rows whose dp flag is
    True) / (non-null count among all rows of the group)."""
    flagged = group[group.dp]
    return flagged.count() * 100 / group.count()


# Percentage of websites with a dark pattern in each rank bin, as a plain list.
rank_groups = data[['binned_rank', 'dp']].groupby('binned_rank')
percs = rank_groups.apply(_percent_with_dp)['dp'].tolist()

Write to disk for plotting:

In [19]:
# Write one percentage per line, in the order they appear in percs.
with open('../../data/final-dark-patterns/freq_percentages.txt', 'w') as out:
    out.writelines("%s\n" % pct for pct in percs)

Stats about deceptive patterns. Collapse "Depends" into "Yes".

In [20]:
# Distribution of the 'Deceptive?' labels as loaded (Yes / No / Depends).
dps['Deceptive?'].value_counts()

No         1584
Yes         208
Depends      26
Name: Deceptive?, dtype: int64

In [21]:
# Fold the 'Depends' label into 'Yes'; every other label is left unchanged.
dps['Deceptive?'] = dps['Deceptive?'].replace({'Depends': 'Yes'})

In [22]:
# Re-check the label distribution of the 'Deceptive?' column.
dps['Deceptive?'].value_counts()

No     1584
Yes     234
Name: Deceptive?, dtype: int64

Deceptive patterns by domain:

In [23]:
# Per 'Deceptive?' label: number of rows ('count') and number of distinct
# domains ('nunique'). The dict passed to agg already restricts the result to
# the 'domain' column, so no column pre-selection is needed.
dps.groupby('Deceptive?').agg({'domain': ['count', 'nunique']})

Unnamed: 0_level_0,domain,domain
Unnamed: 0_level_1,count,nunique
Deceptive?,Unnamed: 1_level_2,Unnamed: 2_level_2
No,1584,1175
Yes,234,183


Deceptive patterns by Pattern Type and domain:

In [24]:
# Per (Pattern Type, 'Deceptive?' label) pair: number of rows ('count') and
# number of distinct domains ('nunique'). The dict passed to agg already
# restricts the result to the 'domain' column.
dps.groupby(['Pattern Type', 'Deceptive?']).agg({'domain': ['count', 'nunique']})

Unnamed: 0_level_0,Unnamed: 1_level_0,domain,domain
Unnamed: 0_level_1,Unnamed: 1_level_1,count,nunique
Pattern Type,Deceptive?,Unnamed: 2_level_2,Unnamed: 3_level_2
Activity Notification,No,284,248
Activity Notification,Yes,29,20
Confirmshaming,No,169,164
Countdown Timer,No,236,230
Countdown Timer,Yes,157,140
Forced Enrollment,No,6,6
Hard to Cancel,No,31,31
Hidden Costs,Yes,5,5
Hidden Subscription,Yes,14,13
High-demand Message,No,47,43
