# First POC for dataset anonymization

## Imports

In [1]:
import pandas as pd
import random
import datetime as dt
from faker import Faker

In [2]:
# Single Faker generator reused by the cells below for names, UUIDs and dates.
# No seed is set, so every run of the notebook produces different surrogate values.
fake = Faker()

In [3]:
# Smoke-test the generator: emit twenty sample names, one per line.
print(*(fake.name() for _ in range(20)), sep="\n")

Laura Patel
Devon Wise
Lynn Perry
Sandra Gibson
Melissa Smith
Abigail King
Jeanne Saunders
Chad Chavez
William Francis
Holly Watson
Caroline Elliott
William Davis
Kent Lopez
Heather Lewis
Brenda Small
Christopher Ramirez
Faith Barr
Lynn Gonzalez
Megan Smith
Timothy Riley


## Create and import a sample dataset

In [4]:
# Load the sample CSV; purchased_date is parsed into datetimes while reading.
df = pd.read_csv(
    '../datasets/test_dataset1.csv',
    parse_dates=['purchased_date'],
)

In [5]:
# Keep a copy of the loaded frame; `df` is overwritten column by column below,
# and this copy is displayed next to it in the final comparison.
df_original = df.copy()

In [6]:
# Display the frame as loaded, before any replacement.
df

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


## Replace names

In [7]:
# Distinct values of the name column, so each one is given exactly one surrogate.
unique_names = pd.unique(df['name'])

In [8]:
# Inspect the distinct names that will be replaced.
unique_names

array(['Donna Martin', 'Bobby Bradley', 'Austin Thomas',
       'Jonathan Medina', 'Brandon Beasley', 'Susan Morales',
       'Rachael Massey', 'Ross Strickland', 'Jennifer Cox',
       'Danny Lynn DDS', 'Ashley Nichols', 'Angela Oneill', 'Cody Moreno',
       'Douglas Atkinson', 'Ellen Mccarthy'], dtype=object)

In [9]:
def _fresh_name(taken):
    """Return a Faker name that is not in `taken`, and record it there.

    Redrawing on a clash guarantees that two different people never share
    one surrogate and that no surrogate equals a real name from the column.
    """
    candidate = fake.name()
    while candidate in taken:
        candidate = fake.name()
    taken.add(candidate)
    return candidate


# Seed the "taken" set with the real names so none of them can be handed out
# again as a surrogate; it then grows with every surrogate that is issued.
_taken_names = set(unique_names)

# original name -> unique surrogate name
name_dict = {name: _fresh_name(_taken_names) for name in unique_names}

In [10]:
# Inspect the original-name -> surrogate-name mapping.
name_dict

{'Donna Martin': 'Patrick Franklin',
 'Bobby Bradley': 'Craig James',
 'Austin Thomas': 'Kenneth Miller',
 'Jonathan Medina': 'Karen Hicks',
 'Brandon Beasley': 'Richard Johnson',
 'Susan Morales': 'Dr. Brandon Hicks',
 'Rachael Massey': 'Michelle Barrera',
 'Ross Strickland': 'Katherine Sanders',
 'Jennifer Cox': 'Victor Johnson',
 'Danny Lynn DDS': 'Jeremy Rose',
 'Ashley Nichols': 'Karen Shaffer',
 'Angela Oneill': 'James Gibbs',
 'Cody Moreno': 'Theresa Smith',
 'Douglas Atkinson': 'Austin Aguilar',
 'Ellen Mccarthy': 'Ashley Willis'}

In [11]:
# Overwrite the column with the surrogate names. Series.map with a dict gives
# NaN for any value that is not a key of name_dict.
# NOTE(review): re-running this cell on its own looks up the already-replaced
# values in name_dict, so it is only meant to run once per load.
df['name'] = df['name'].map(name_dict)

## Replace ids

### user_id

In [12]:
# Distinct user ids, so repeated purchases by one user share a single replacement.
unique_user_ids = pd.unique(df['user_id'])

In [13]:
# Pair every distinct user id with a freshly generated UUID4 string,
# walking the ids in the order they appear in unique_user_ids.
user_id_dict = dict(
    zip(unique_user_ids, (fake.uuid4() for _ in unique_user_ids))
)

In [14]:
# Inspect the user_id -> UUID mapping.
user_id_dict

{1004: '830870a6-a8cc-4b8c-a5bc-91917f9bea55',
 1003: '00064f68-9bbf-480c-b9b8-6d5f644d4fbf',
 1011: '22b66ffc-0a68-49b3-ace8-50f2594d84cb',
 1000: 'ef563cb5-ca99-47f9-b6b5-55065b246032',
 1019: '4c511fac-5caa-4bbf-99fa-e66570ebc1ea',
 1014: '11def67e-a57c-4dd7-a4cc-b9ab256c66f8',
 1001: 'e203dea7-f0b4-49ac-9a50-f033d5d76e87',
 1005: '2873ff92-b038-44d9-a1ba-e82be415aeb2',
 1020: '77f69293-073c-4e81-a158-ca1286b7cef9',
 1006: '81ae4265-85c9-43f9-ba85-fae4a992fc6e',
 1015: 'eee7dbb7-7eb0-4105-a274-397d4874de6a',
 1009: '177e3013-452b-4c65-b6ce-d1e1e489fcf3',
 1016: '66499eee-5eab-4d9e-b224-94fe3c3a9e0f'}

In [15]:
# Swap each user_id for its UUID from user_id_dict; ids missing from the dict
# would become NaN (Series.map behaviour for dict lookups).
df['user_id'] = df['user_id'].map(user_id_dict)

### category_id

In [16]:
# Distinct category ids; each one receives a single replacement below.
unique_category_ids = pd.unique(df['category_id'])

In [17]:
# category id -> freshly generated UUID4 string, one pair per distinct id.
category_id_dict = dict(
    (category_id, fake.uuid4()) for category_id in unique_category_ids
)

In [18]:
# Inspect the category_id -> UUID mapping.
category_id_dict

{14: '76b443eb-9fe6-4d56-a772-e6ccd943439e',
 15: '2b8eae6c-e8fa-401c-a966-0c980da2dc8b',
 44: 'c0c7cec4-2699-45e7-8c09-9d5040aa4f62',
 156: 'bdd84506-87fb-426a-a3ec-d2db46a99e9b',
 23: 'bb47ac89-1687-4ef9-9d10-a7dd8f8d192c',
 123: 'ba5d2313-8123-4377-a448-973b3d141e16',
 12: 'd847a532-e355-4684-8f56-9d1e3803030a',
 122: '4e2962fe-9863-485e-b37c-cd66a689b371',
 1212: 'fd05dac6-3082-4d67-95ff-9cf07df5c0ae',
 42: 'f0991b9d-fff9-42a1-999e-f5fefc676c9d',
 356: 'd85adc4e-78ef-43de-a5ba-3be9f10935c4'}

In [19]:
# Swap each category_id for its UUID from category_id_dict; ids missing from
# the dict would become NaN (Series.map behaviour for dict lookups).
df['category_id'] = df['category_id'].map(category_id_dict)

## Replace whole numbers

In [20]:
# Integer column to obfuscate: the purchased quantities.
whole_numbers = df['purchased_quantity']

In [21]:
# Inspect the quantities before rescaling.
whole_numbers

0     1
1     2
2     1
3     1
4     2
5     1
6     3
7     1
8     2
9     4
10    5
11    3
12    4
13    2
14    6
15    1
16    2
17    4
18    1
19    1
Name: purchased_quantity, dtype: int64

In [22]:
# Min-max normalise the quantities into [0, 1]. A constant column has zero
# spread; dividing by it would produce NaN and make the integer cast below
# raise, so in that case every value is pinned to 0.0 instead.
quantity_span = whole_numbers.max() - whole_numbers.min()
if quantity_span == 0:
    X_std = whole_numbers * 0.0
else:
    X_std = (whole_numbers - whole_numbers.min()) / quantity_span

# Stretch [0, 1] onto [1, 10] so that no rescaled value is zero.
X_scaled = (X_std * (10 - 1) + 1)

# Multiply the whole column by one random integer factor in 1..10 (inclusive),
# then truncate to integers so the result still looks like a quantity.
X_scaled_whole_randomized = (X_scaled * random.randint(1, 10)).astype(int)

In [23]:
# Inspect the rescaled, randomised quantities.
X_scaled_whole_randomized

0      8
1     22
2      8
3      8
4     22
5      8
6     36
7      8
8     22
9     51
10    65
11    36
12    51
13    22
14    80
15     8
16    22
17    51
18     8
19     8
Name: purchased_quantity, dtype: int64

In [24]:
# whole_number_dict = {
#     number: fake.random_int(
#         min=min(whole_numbers), max=max(whole_numbers)
#     )
#     for number in whole_numbers
# }

In [25]:
# whole_number_dict

In [26]:
# Replace the column with the rescaled integer values computed above.
df['purchased_quantity'] = X_scaled_whole_randomized

## Replace decimal numbers

In [27]:
# Decimal column to obfuscate: the purchase amounts.
decimal_numbers = df['amount']

In [28]:
# Min-max normalise the amounts into [0, 1]. A constant column has zero
# spread; dividing by it would silently fill the column with NaN, so in that
# case every value is pinned to 0.0 instead.
amount_span = decimal_numbers.max() - decimal_numbers.min()
if amount_span == 0:
    X_std = decimal_numbers * 0.0
else:
    X_std = (decimal_numbers - decimal_numbers.min()) / amount_span

# Stretch [0, 1] onto [1, 10] so that no rescaled value is zero.
X_scaled = (X_std * (10 - 1) + 1)

# Multiply the whole column by one random integer factor in 1..10 (inclusive).
# Unlike the quantity column, the result is left as floats.
X_scaled_decimal_randomized = (X_scaled * random.randint(1, 10))

In [29]:
# decimal_number_dict

In [30]:
# Replace the column with the rescaled amounts (floats; no integer cast is applied).
df['amount'] = X_scaled_decimal_randomized

## Categorical data

In [31]:
# Categorical column to obfuscate. Note the variable is called `category`
# but it holds the `item` column, not `category_id`.
category = df['item']

In [32]:
# Inspect the item labels before they are replaced.
category

0             TV
1         Vaccum
2            Cup
3          Plate
4         Lotion
5     Toothbrush
6      Sanitizer
7           Wine
8       Yoga mat
9           Beer
10        Coffee
11        Lotion
12        Coffee
13            TV
14        Vaccum
15     Sanitizer
16          Wine
17         Plate
18            TV
19        Vaccum
Name: item, dtype: object

In [33]:
# Distinct item labels; each one receives a single generic label below.
unique_category = pd.unique(category)

In [34]:
# One random number in 0..1000 (inclusive) is drawn once and shared by every label.
rand_category = random.randint(0, 1000)

# item label -> "Category_<rand> <position>", where position is the label's
# index within unique_category.
category_dict = dict(
    (item_label, f"Category_{rand_category} {position}")
    for position, item_label in enumerate(unique_category)
)

In [35]:
# Inspect the item -> generic-label mapping.
category_dict

{'TV': 'Category_511 0',
 'Vaccum': 'Category_511 1',
 'Cup': 'Category_511 2',
 'Plate': 'Category_511 3',
 'Lotion': 'Category_511 4',
 'Toothbrush': 'Category_511 5',
 'Sanitizer': 'Category_511 6',
 'Wine': 'Category_511 7',
 'Yoga mat': 'Category_511 8',
 'Beer': 'Category_511 9',
 'Coffee': 'Category_511 10'}

In [36]:
# Swap each item for its generic label from category_dict; items missing from
# the dict would become NaN (Series.map behaviour for dict lookups).
df['item'] = df['item'].map(category_dict)

## Replace dates

In [37]:
# Date column to obfuscate (parsed as datetimes when the CSV was read).
dates = df['purchased_date']

In [38]:
# Try a single draw: a random date between the earliest and latest purchase dates.
fake.date_between(start_date=dates.min(), end_date=dates.max())

datetime.date(2020, 5, 16)

In [39]:
# The bounds are the same for every key, so compute them once rather than
# re-scanning the column for each element.
earliest_date, latest_date = dates.min(), dates.max()

# original date -> random date (as a 'YYYY-MM-DD' string) within the observed
# range. Iterating the de-duplicated column draws exactly one replacement per
# distinct date instead of one per row whose result is then overwritten.
# Two different original dates may still receive the same replacement.
dates_dict = {
    date: fake.date_between(
        start_date=earliest_date, end_date=latest_date
    ).strftime('%Y-%m-%d')
    for date in dates.drop_duplicates()
}

In [40]:
# Inspect the original-date -> replacement-date mapping.
dates_dict

{Timestamp('2020-04-04 00:00:00'): '2020-04-10',
 Timestamp('2020-05-12 00:00:00'): '2020-04-25',
 Timestamp('2020-05-13 00:00:00'): '2020-04-28',
 Timestamp('2020-05-14 00:00:00'): '2020-04-12',
 Timestamp('2020-05-15 00:00:00'): '2020-05-05',
 Timestamp('2020-05-16 00:00:00'): '2020-05-05',
 Timestamp('2020-05-17 00:00:00'): '2020-04-18',
 Timestamp('2020-04-05 00:00:00'): '2020-05-07',
 Timestamp('2020-04-06 00:00:00'): '2020-04-24',
 Timestamp('2020-04-07 00:00:00'): '2020-05-08',
 Timestamp('2020-04-08 00:00:00'): '2020-05-02',
 Timestamp('2020-04-09 00:00:00'): '2020-05-02',
 Timestamp('2020-04-10 00:00:00'): '2020-05-11',
 Timestamp('2020-05-06 00:00:00'): '2020-04-07',
 Timestamp('2020-05-07 00:00:00'): '2020-05-10',
 Timestamp('2020-05-08 00:00:00'): '2020-04-22',
 Timestamp('2020-05-09 00:00:00'): '2020-04-20',
 Timestamp('2020-05-10 00:00:00'): '2020-04-21',
 Timestamp('2020-05-11 00:00:00'): '2020-04-22'}

In [41]:
# Swap each date for its replacement from dates_dict. The replacements are
# '%Y-%m-%d' strings, so the column no longer holds datetimes afterwards.
df['purchased_date'] = df['purchased_date'].map(dates_dict)

# Original vs Final

In [42]:
# The untouched copy taken right after loading.
df_original

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


In [43]:
# The working frame after every replacement step above.
df

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,830870a6-a8cc-4b8c-a5bc-91917f9bea55,Patrick Franklin,Category_511 0,80.0,76b443eb-9fe6-4d56-a772-e6ccd943439e,8,2020-04-10
1,830870a6-a8cc-4b8c-a5bc-91917f9bea55,Patrick Franklin,Category_511 1,41.826017,2b8eae6c-e8fa-401c-a966-0c980da2dc8b,22,2020-04-25
2,00064f68-9bbf-480c-b9b8-6d5f644d4fbf,Craig James,Category_511 2,11.834513,c0c7cec4-2699-45e7-8c09-9d5040aa4f62,8,2020-04-28
3,22b66ffc-0a68-49b3-ace8-50f2594d84cb,Kenneth Miller,Category_511 3,9.208982,bdd84506-87fb-426a-a3ec-d2db46a99e9b,8,2020-04-12
4,ef563cb5-ca99-47f9-b6b5-55065b246032,Karen Hicks,Category_511 4,8.804774,bb47ac89-1687-4ef9-9d10-a7dd8f8d192c,22,2020-05-05
5,ef563cb5-ca99-47f9-b6b5-55065b246032,Karen Hicks,Category_511 5,9.358284,ba5d2313-8123-4377-a448-973b3d141e16,8,2020-05-05
6,4c511fac-5caa-4bbf-99fa-e66570ebc1ea,Richard Johnson,Category_511 6,8.0,d847a532-e355-4684-8f56-9d1e3803030a,36,2020-04-18
7,22b66ffc-0a68-49b3-ace8-50f2594d84cb,Kenneth Miller,Category_511 7,12.894194,4e2962fe-9863-485e-b37c-cd66a689b371,8,2020-04-10
8,11def67e-a57c-4dd7-a4cc-b9ab256c66f8,Dr. Brandon Hicks,Category_511 8,23.767752,fd05dac6-3082-4d67-95ff-9cf07df5c0ae,22,2020-05-07
9,e203dea7-f0b4-49ac-9a50-f033d5d76e87,Michelle Barrera,Category_511 9,8.080113,f0991b9d-fff9-42a1-999e-f5fefc676c9d,51,2020-04-24
