# First POC for dataset anonymization

## Imports

In [1]:
import pandas as pd
import random
import datetime as dt
from faker import Faker

In [2]:
# Shared Faker generator; the cells below call it for names, UUIDs and dates.
# NOTE(review): no seed is set here, so each run produces different fake values.
fake = Faker()

In [3]:
# Smoke-test the generator: emit 20 fake names, one per line.
print("\n".join(fake.name() for _ in range(20)))

Tonya Ochoa
Eric Romero
Kimberly Preston
Andrew Nelson
Jason Grant
Gary Hood
Shawn Pacheco
Colton Henry
Cindy Miller
Cassie Miller
Ethan Hall
Jordan Martinez
Mark Olson
Sherry Sutton
Jonathan Bryan
Mark Guerra
Albert Blair
Cynthia Tucker
Timothy Henry
Susan Weber


## Create and import a sample dataset

In [4]:
# Load the sample dataset; 'purchased_date' is parsed into datetime values.
df = pd.read_csv('../datasets/test_dataset1.csv', parse_dates=['purchased_date'])

In [5]:
# Keep an untouched copy of the loaded data; it is displayed next to the
# anonymized frame in the final comparison section.
df_original = df.copy()

In [6]:
# Preview the loaded data before any column is replaced.
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


## Replace names

In [7]:
# Distinct values of the 'name' column; each one gets a single replacement below.
unique_names = df['name'].unique()

In [8]:
# Inspect the distinct names that will be replaced.
unique_names

array(['Donna Martin', 'Bobby Bradley', 'Austin Thomas',
       'Jonathan Medina', 'Brandon Beasley', 'Susan Morales',
       'Rachael Massey', 'Ross Strickland', 'Jennifer Cox',
       'Danny Lynn DDS', 'Ashley Nichols', 'Angela Oneill', 'Cody Moreno',
       'Douglas Atkinson', 'Ellen Mccarthy'], dtype=object)

In [9]:
def _pseudonymize(values, make_fake, max_tries=1000):
    """Map every item of ``values`` to a distinct fake value.

    Args:
        values: iterable of the real values to replace.
        make_fake: zero-argument callable returning one candidate replacement.
        max_tries: maximum number of candidates drawn per value.

    Returns:
        dict mapping each real value to its replacement.

    Raises:
        RuntimeError: if ``max_tries`` draws in a row are all rejected.
    """
    # Seeding the rejection set with the real values means a replacement can
    # never be another record's real value.
    taken = set(values)
    mapping = {}
    for value in values:
        for _ in range(max_tries):
            candidate = make_fake()
            if candidate not in taken:
                break
        else:
            # The real value is deliberately left out of the message.
            raise RuntimeError("could not generate an unused replacement value")
        # Recording the pick keeps two real values from sharing a replacement.
        taken.add(candidate)
        mapping[value] = candidate
    return mapping


# Real name -> fake name lookup with guaranteed-distinct fake names.
name_dict = _pseudonymize(unique_names, fake.name)

In [10]:
# Inspect the real -> fake name lookup.
# NOTE(review): this output pairs every real name with its replacement, so the
# saved notebook itself can be used to undo the anonymization.
name_dict

{'Donna Martin': 'Matthew Hendrix',
 'Bobby Bradley': 'Peter Douglas',
 'Austin Thomas': 'Beth Zavala',
 'Jonathan Medina': 'Jodi Baker',
 'Brandon Beasley': 'Katherine Herrera',
 'Susan Morales': 'James Campbell',
 'Rachael Massey': 'Matthew Fernandez',
 'Ross Strickland': 'Jessica Carlson',
 'Jennifer Cox': 'Karen Watson',
 'Danny Lynn DDS': 'Jason Taylor',
 'Ashley Nichols': 'Elaine Carr',
 'Angela Oneill': 'Maxwell Stewart',
 'Cody Moreno': 'Michael Johnson',
 'Douglas Atkinson': 'Susan Williams',
 'Ellen Mccarthy': 'Anthony Martinez'}

In [11]:
# Look the names up from the untouched snapshot instead of from `df` itself.
# Mapping `df['name']` in place is not re-runnable: on a second execution the
# column already holds fake names, which are not keys of name_dict, and
# Series.map would turn them into NaN.
df['name'] = df_original['name'].map(name_dict)

## Replace ids

### UserId

In [12]:
# Distinct values of the 'user_id' column.
unique_user_ids = df['user_id'].unique()

In [13]:
# Pair every distinct user id with a freshly generated UUID4 string.
user_id_dict = dict(
    (original_id, fake.uuid4())
    for original_id in unique_user_ids
)

In [14]:
# Inspect the user id -> UUID lookup.
# NOTE(review): this output pairs every real id with its replacement.
user_id_dict

{1004: '9dab54de-0ef4-4301-8a1b-14c403b92e8a',
 1003: '37fa0a4d-f2f2-4763-bdb3-744043bdffd6',
 1011: '99326c31-39b6-4c3e-ae3a-b4fa475745cf',
 1000: 'f69f3eb4-3fe6-4b23-8da1-6898ee0710f8',
 1019: 'f8403092-0be5-480c-b78d-d371b4b7d809',
 1014: '11c34221-8680-4f4c-b96d-1fc5b92dd0ee',
 1001: '8396e0d8-b645-48b9-9b2d-85af61b7d671',
 1005: 'aacdfd95-a28c-4353-a243-94f08c9b374f',
 1020: 'a63a2d7b-9fea-43a6-bbe7-79cae1324291',
 1006: '04a817ab-9529-432e-a7d1-9f7bb596fcf2',
 1015: 'd501181d-bdc7-4982-a56a-8483a1fad9e0',
 1009: 'f348e978-5e1b-4513-a5e4-65f09ca495e6',
 1016: 'f60fab4c-db43-455e-a9b4-b6e66d848ab2'}

In [15]:
# Look the ids up from the untouched snapshot so the cell can be re-run:
# mapping `df['user_id']` in place a second time would feed UUIDs (not keys of
# user_id_dict) into Series.map and fill the column with NaN.
df['user_id'] = df_original['user_id'].map(user_id_dict)

### Category_id

In [16]:
# Distinct values of the 'category_id' column.
unique_category_ids = df['category_id'].unique()

In [17]:
# Pair every distinct category id with a freshly generated UUID4 string.
category_id_dict = dict(
    (original_id, fake.uuid4())
    for original_id in unique_category_ids
)

In [18]:
# Inspect the category id -> UUID lookup.
category_id_dict

{14: '2ef0d0c5-7fee-4cdc-9a4d-fe102df9a56f',
 15: '2b753826-336d-49da-bf19-9eca812966a4',
 44: '3ad589e7-46d3-46e0-823f-555ab2b9f12f',
 156: '053e1647-803c-46aa-919e-3dd46fd09985',
 23: 'd3ac3ca8-4b20-4d64-95cc-57c296bc21c0',
 123: '76842d0f-d92f-4755-b788-4887d9bbe6be',
 12: '2af63a27-0ae5-4e0d-ae9e-d8dcf1a7db84',
 122: 'ab8902e3-7c5f-42e2-a923-ed88a292fffb',
 1212: '46521576-a94a-4e75-8472-5d1d941a6357',
 42: 'e9060c1c-9b0b-4dce-a120-f48c86a631ae',
 356: 'faea3c02-f074-4aad-b3c3-26fef4c77210'}

In [19]:
# Look the ids up from the untouched snapshot so the cell can be re-run:
# mapping `df['category_id']` in place a second time would feed UUIDs (not keys
# of category_id_dict) into Series.map and fill the column with NaN.
df['category_id'] = df_original['category_id'].map(category_id_dict)

## Replace whole numbers

In [20]:
# The 'quantity' column, used as the input of the rescaling step below.
whole_numbers = df['quantity']

In [21]:
# Inspect the quantities before they are rescaled.
whole_numbers

0     1
1     2
2     1
3     1
4     2
5     1
6     3
7     1
8     2
9     4
10    5
11    3
12    4
13    2
14    6
15    1
16    2
17    4
18    1
19    1
Name: quantity, dtype: int64

In [22]:
# Min-max rescale the quantities to the range [1, 10], multiply everything by
# one random integer factor in [1, 10], and truncate back to integers.
# NOTE(review): this is a single linear transform, so ordering and relative
# spacing of the original values are preserved.
quantity_min = whole_numbers.min()
quantity_span = whole_numbers.max() - quantity_min
if quantity_span == 0:
    # A constant column has zero span: dividing by it would give NaN and the
    # integer cast below would then raise. Place every value at the lower bound.
    X_std = whole_numbers * 0.0
else:
    X_std = (whole_numbers - quantity_min) / quantity_span
X_scaled = X_std * (10 - 1) + 1
X_scaled_whole_randomized = (X_scaled * random.randint(1, 10)).astype(int)

In [23]:
# Inspect the rescaled, randomized integer quantities.
X_scaled_whole_randomized

0      6
1     16
2      6
3      6
4     16
5      6
6     27
7      6
8     16
9     38
10    49
11    27
12    38
13    16
14    60
15     6
16    16
17    38
18     6
19     6
Name: quantity, dtype: int64

In [24]:
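# Disabled alternative: instead of rescaling, look each quantity up in a dict
# of random integers drawn from the column's own [min, max] range.
# whole_number_dict = {
#     number: fake.random_int(
#         min=min(whole_numbers), max=max(whole_numbers)
#     )
#     for number in whole_numbers
# }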

In [25]:
# whole_number_dict

In [26]:
# Replace the 'quantity' column with the rescaled, randomized values.
df['quantity'] = X_scaled_whole_randomized

## Replace decimal numbers

In [27]:
# The 'price' column, used as the input of the rescaling step below.
decimal_numbers = df['price']

In [28]:
# Min-max rescale the prices to the range [1, 10] and multiply everything by
# one random integer factor in [1, 10]; the result stays floating point.
# NOTE(review): this is a single linear transform, so ordering and relative
# spacing of the original values are preserved.
price_min = decimal_numbers.min()
price_span = decimal_numbers.max() - price_min
if price_span == 0:
    # A constant column has zero span: dividing by it would silently turn the
    # whole price column into NaN. Place every value at the lower bound.
    X_std = decimal_numbers * 0.0
else:
    X_std = (decimal_numbers - price_min) / price_span
X_scaled = X_std * (10 - 1) + 1
X_scaled_decimal_randomized = X_scaled * random.randint(1, 10)

In [29]:
# decimal_number_dict

In [30]:
# Replace the 'price' column with the rescaled, randomized values.
df['price'] = X_scaled_decimal_randomized

## Categorical data

In [31]:
# The 'item_name' column, treated here as the categorical field to relabel.
category = df['item_name']

In [32]:
# Inspect the item names before they are relabeled.
category

0             TV
1         Vaccum
2            Cup
3          Plate
4         Lotion
5     Toothbrush
6      Sanitizer
7           Wine
8       Yoga mat
9           Beer
10        Coffee
11        Lotion
12        Coffee
13            TV
14        Vaccum
15     Sanitizer
16          Wine
17         Plate
18            TV
19        Vaccum
Name: item_name, dtype: object

In [33]:
# Distinct item names; each one gets its own generic label below.
unique_category = category.unique()

In [34]:
# One random number is shared by every label; the position of each distinct
# item name in unique_category is appended to tell the labels apart.
rand_category = random.randint(0, 1000)
category_dict = {
    item_label: f"Category_{rand_category} {position}"
    for position, item_label in enumerate(unique_category)
}

In [35]:
# Inspect the item name -> generic label lookup.
category_dict

{'TV': 'Category_744 0',
 'Vaccum': 'Category_744 1',
 'Cup': 'Category_744 2',
 'Plate': 'Category_744 3',
 'Lotion': 'Category_744 4',
 'Toothbrush': 'Category_744 5',
 'Sanitizer': 'Category_744 6',
 'Wine': 'Category_744 7',
 'Yoga mat': 'Category_744 8',
 'Beer': 'Category_744 9',
 'Coffee': 'Category_744 10'}

In [36]:
# Look the item names up from the untouched snapshot so the cell can be re-run:
# mapping `df['item_name']` in place a second time would feed generic labels
# (not keys of category_dict) into Series.map and fill the column with NaN.
df['item_name'] = df_original['item_name'].map(category_dict)

## Dates

In [37]:
# The 'purchased_date' column (parsed as datetimes when the CSV was read).
dates = df['purchased_date']

In [38]:
# Try a single draw: one random date between the earliest and latest purchase date.
fake.date_between(start_date=dates.min(), end_date=dates.max())

datetime.date(2020, 4, 20)

In [39]:
# The bounds are the same for every draw, so compute them once instead of
# rescanning the whole column on each iteration.
first_date, last_date = dates.min(), dates.max()

# Map each distinct purchase date to a random date inside the observed range,
# formatted as a 'YYYY-MM-DD' string. Iterating over the de-duplicated column
# avoids drawing replacements that a repeated date would only overwrite.
# NOTE(review): draws are independent, so two different real dates can receive
# the same replacement and chronological order is not preserved.
dates_dict = {
    date: fake.date_between(start_date=first_date, end_date=last_date).strftime('%Y-%m-%d')
    for date in dates.drop_duplicates()
}

In [40]:
# Inspect the real date -> replacement date lookup.
dates_dict

{Timestamp('2020-04-04 00:00:00'): '2020-05-13',
 Timestamp('2020-05-12 00:00:00'): '2020-04-04',
 Timestamp('2020-05-13 00:00:00'): '2020-04-06',
 Timestamp('2020-05-14 00:00:00'): '2020-04-25',
 Timestamp('2020-05-15 00:00:00'): '2020-05-04',
 Timestamp('2020-05-16 00:00:00'): '2020-05-13',
 Timestamp('2020-05-17 00:00:00'): '2020-04-08',
 Timestamp('2020-04-05 00:00:00'): '2020-04-25',
 Timestamp('2020-04-06 00:00:00'): '2020-04-10',
 Timestamp('2020-04-07 00:00:00'): '2020-04-04',
 Timestamp('2020-04-08 00:00:00'): '2020-05-01',
 Timestamp('2020-04-09 00:00:00'): '2020-05-03',
 Timestamp('2020-04-10 00:00:00'): '2020-04-17',
 Timestamp('2020-05-06 00:00:00'): '2020-04-17',
 Timestamp('2020-05-07 00:00:00'): '2020-04-10',
 Timestamp('2020-05-08 00:00:00'): '2020-05-11',
 Timestamp('2020-05-09 00:00:00'): '2020-05-08',
 Timestamp('2020-05-10 00:00:00'): '2020-04-15',
 Timestamp('2020-05-11 00:00:00'): '2020-05-06'}

In [41]:
# Look the dates up from the untouched snapshot so the cell can be re-run:
# after the first execution `df['purchased_date']` holds date strings, which
# are not keys of dates_dict, and Series.map would turn them into NaN.
df['purchased_date'] = df_original['purchased_date'].map(dates_dict)

## Original vs final

In [42]:
# The data as loaded, before any replacement.
# NOTE(review): this output contains the original names and ids.
df_original

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


In [43]:
# The same rows after every column has been replaced.
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,9dab54de-0ef4-4301-8a1b-14c403b92e8a,Matthew Hendrix,Category_744 0,60.0,2ef0d0c5-7fee-4cdc-9a4d-fe102df9a56f,6,2020-05-13
1,9dab54de-0ef4-4301-8a1b-14c403b92e8a,Matthew Hendrix,Category_744 1,31.369512,2b753826-336d-49da-bf19-9eca812966a4,16,2020-04-04
2,37fa0a4d-f2f2-4763-bdb3-744043bdffd6,Peter Douglas,Category_744 2,8.875885,3ad589e7-46d3-46e0-823f-555ab2b9f12f,6,2020-04-06
3,99326c31-39b6-4c3e-ae3a-b4fa475745cf,Beth Zavala,Category_744 3,6.906737,053e1647-803c-46aa-919e-3dd46fd09985,6,2020-04-25
4,f69f3eb4-3fe6-4b23-8da1-6898ee0710f8,Jodi Baker,Category_744 4,6.603581,d3ac3ca8-4b20-4d64-95cc-57c296bc21c0,16,2020-05-04
5,f69f3eb4-3fe6-4b23-8da1-6898ee0710f8,Jodi Baker,Category_744 5,7.018713,76842d0f-d92f-4755-b788-4887d9bbe6be,6,2020-05-13
6,f8403092-0be5-480c-b78d-d371b4b7d809,Katherine Herrera,Category_744 6,6.0,2af63a27-0ae5-4e0d-ae9e-d8dcf1a7db84,27,2020-04-08
7,99326c31-39b6-4c3e-ae3a-b4fa475745cf,Beth Zavala,Category_744 7,9.670645,ab8902e3-7c5f-42e2-a923-ed88a292fffb,6,2020-05-13
8,11c34221-8680-4f4c-b96d-1fc5b92dd0ee,James Campbell,Category_744 8,17.825814,46521576-a94a-4e75-8472-5d1d941a6357,16,2020-04-25
9,8396e0d8-b645-48b9-9b2d-85af61b7d671,Matthew Fernandez,Category_744 9,6.060085,e9060c1c-9b0b-4dce-a120-f48c86a631ae,38,2020-04-10
