# First POC for dataset anonymization

## Imports

In [1]:
import pandas as pd
import random
import datetime as dt
from faker import Faker

In [2]:
# Shared Faker generator used by the cells below to produce replacement values.
# NOTE(review): no seed is set in the visible cells, so every run yields
# different fake values — confirm that this is intended.
fake = Faker()

In [3]:
# Smoke-test the generator: emit twenty sample names, one per line.
sample_names = [fake.name() for _ in range(20)]
print('\n'.join(sample_names))

Christopher Mcmillan
James Smith
Andrea Ruiz
Stephanie Hardy
Christina Lopez
Tammy Graham
Matthew Rodriguez
Brian Wagner
Chelsea Haley
Leslie Fisher
Tiffany Bell
Christopher Callahan
Amanda Ford
Christian Green
William Payne
Sarah Arias
Scott Benitez
Joseph Miller
Daniel Mack
Karen Hartman


## Create and import a sample dataset

In [4]:
# Load the sample purchases; `purchased_date` is parsed into datetimes on read.
df = pd.read_csv(
    '../datasets/test_dataset1.csv',
    parse_dates=['purchased_date'],
)

In [5]:
# Keep an independent snapshot of the raw frame; `df` itself is overwritten
# column by column in the cells that follow.
df_original = df.copy(deep=True)

In [6]:
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date,email
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04,donna.martin@gmail.com
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12,donna.martin@gmail.com
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13,bobby.bradley@gmail.com
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14,austin.thomas@gmail.com
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15,jonathan.medina@gmail.com
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16,jonathan.medina@gmail.com
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17,brandon.beasley@gmail.com
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04,austin.thomas@gmail.com
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05,susan.morales@gmail.com
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06,rachael.massey@gmail.com


## Replace names

In [7]:
# Distinct values of the `name` column, in order of first appearance.
unique_names = pd.unique(df['name'])

In [8]:
unique_names

array(['Donna Martin', 'Bobby Bradley', 'Austin Thomas',
       'Jonathan Medina', 'Brandon Beasley', 'Susan Morales',
       'Rachael Massey', 'Ross Strickland', 'Jennifer Cox',
       'Danny Lynn DDS', 'Ashley Nichols', 'Angela Oneill', 'Cody Moreno',
       'Douglas Atkinson', 'Ellen Mccarthy'], dtype=object)

In [9]:
# Build the real-name -> pseudonym lookup.
# `fake.unique` guarantees no pseudonym is handed out twice. Plain
# `fake.name()` can repeat, which would merge two different people into a
# single identity once the column is mapped. Faker raises
# UniquenessException if it cannot find a fresh value.
name_dict = {name: fake.unique.name() for name in unique_names}

In [10]:
name_dict

{'Donna Martin': 'Robert Harper',
 'Bobby Bradley': 'Morgan Hawkins',
 'Austin Thomas': 'Daniel Jones',
 'Jonathan Medina': 'Leah Harrell',
 'Brandon Beasley': 'Paul Cox',
 'Susan Morales': 'Vincent White',
 'Rachael Massey': 'William Young',
 'Ross Strickland': 'Rachel Barnes',
 'Jennifer Cox': 'Troy Graham',
 'Danny Lynn DDS': 'Nancy Woodard',
 'Ashley Nichols': 'James Jones',
 'Angela Oneill': 'Rachael Wilson',
 'Cody Moreno': 'Daniel Estrada',
 'Douglas Atkinson': 'Christopher Hall',
 'Ellen Mccarthy': 'Robert Mccoy'}

In [11]:
# Swap every value in `name` for its entry in `name_dict`.
anonymized_names = df['name'].map(name_dict)
df['name'] = anonymized_names

## Replace ids

### UserId

In [12]:
# Distinct values of the `user_id` column, in order of first appearance.
unique_user_ids = pd.unique(df['user_id'])

In [13]:
# Assign one freshly generated UUID4 string to each distinct user id.
user_id_dict = {}
for original_id in unique_user_ids:
    user_id_dict[original_id] = fake.uuid4()

In [14]:
user_id_dict

{1004: '171115a4-59d3-403e-ba3c-af3735960be6',
 1003: 'd279af76-46b7-4c69-b819-d44999233806',
 1011: '85dbeaf9-5a67-4aaf-9926-8e8dd99ecc4b',
 1000: '22dc32e4-9170-4ae8-aa69-f647cf519a5d',
 1019: 'e38a7c8e-8df0-448b-a1e1-24151bd303db',
 1014: '9d623d03-a821-47fb-b8b1-0888f0241a73',
 1001: '3525444f-b98c-454a-9851-78df523144a0',
 1005: '7d544d38-5db1-4334-b504-f9b2f52dfd00',
 1020: '62b73084-7737-49c7-a022-87c31fab3b6c',
 1006: 'dce70cd1-b30b-491c-b87d-c28b1cec0959',
 1015: '9b6b7a20-599c-4687-b122-a6fe7d3402b5',
 1009: 'bf430a53-5bd5-48d3-adbb-165d8efc5bcf',
 1016: 'b2d7c866-dc88-4188-8e13-0a58b9ff35fb'}

In [15]:
# Swap every value in `user_id` for its entry in `user_id_dict`.
anonymized_user_ids = df['user_id'].map(user_id_dict)
df['user_id'] = anonymized_user_ids

### Category_id

In [16]:
# Distinct values of the `category_id` column, in order of first appearance.
unique_category_ids = pd.unique(df['category_id'])

In [17]:
# Pair each distinct category id with its own freshly generated UUID4 string.
category_id_dict = dict(
    zip(unique_category_ids, (fake.uuid4() for _ in unique_category_ids))
)

In [18]:
category_id_dict

{14: '4e5fe3a3-18b2-4f11-99a6-ebb06909be24',
 15: '8625cfd1-5be4-40b6-8382-c64fdac21edc',
 44: '9208afa1-602a-4c15-a025-8d5a8fc3649c',
 156: 'bd4a8b96-8d65-4551-8cc9-e003fa2a0d4a',
 23: 'a42c9f20-7595-4f43-adbb-e19ec22a8899',
 123: '78fd542f-f88c-4fde-a1be-4c810b25ae5d',
 12: 'e56ce237-da8a-4970-b1ff-0e42ec9404d2',
 122: '11214f06-a2c2-48bc-8cea-e13db1ab05ae',
 1212: '082fa5f6-a10b-404e-b431-72492d680751',
 42: '1ae34ca8-146e-4af6-ad50-74a249e5cb28',
 356: 'aadd09f0-8c5b-494a-aa7b-ca2eed71fbc1'}

In [19]:
# Swap every value in `category_id` for its entry in `category_id_dict`.
anonymized_category_ids = df['category_id'].map(category_id_dict)
df['category_id'] = anonymized_category_ids

## Replace whole numbers

In [20]:
# Working reference to the `quantity` column (whole-number values).
whole_numbers = df['quantity']

In [21]:
whole_numbers

0     1
1     2
2     1
3     1
4     2
5     1
6     3
7     1
8     2
9     4
10    5
11    3
12    4
13    2
14    6
15    1
16    2
17    4
18    1
19    1
Name: quantity, dtype: int64

In [22]:
# Min-max scale the quantities into [1, 10], multiply the whole column by a
# single random integer factor in [1, 10], then truncate back to integers.
value_range = whole_numbers.max() - whole_numbers.min()
if value_range == 0:
    # Constant column: the min-max formula would compute 0/0 -> NaN and the
    # astype(int) below would raise. Treat every value as the range minimum.
    X_std = pd.Series(0.0, index=whole_numbers.index, name=whole_numbers.name)
else:
    X_std = (whole_numbers - whole_numbers.min()) / value_range
X_scaled = (X_std * (10 - 1) + 1)
# astype(int) truncates towards zero rather than rounding.
X_scaled_whole_randomized = (X_scaled * random.randint(1, 10)).astype(int)

In [23]:
X_scaled_whole_randomized

0      1
1      2
2      1
3      1
4      2
5      1
6      4
7      1
8      2
9      6
10     8
11     4
12     6
13     2
14    10
15     1
16     2
17     6
18     1
19     1
Name: quantity, dtype: int64

In [24]:
# whole_number_dict = {
#     number: fake.random_int(
#         min=min(whole_numbers), max=max(whole_numbers)
#     )
#     for number in whole_numbers
# }

In [25]:
# whole_number_dict

In [26]:
# Overwrite the `quantity` column with the rescaled, randomized integers.
df['quantity'] = X_scaled_whole_randomized

## Replace decimal numbers

In [27]:
# Working reference to the `price` column (decimal values).
decimal_numbers = df['price']

In [28]:
# Min-max scale the prices into [1, 10], then multiply the whole column by a
# single random integer factor in [1, 10]. The result stays floating point.
value_range = decimal_numbers.max() - decimal_numbers.min()
if value_range == 0:
    # Constant column: the min-max formula would compute 0/0 and silently
    # turn every price into NaN. Treat every value as the range minimum.
    X_std = pd.Series(0.0, index=decimal_numbers.index, name=decimal_numbers.name)
else:
    X_std = (decimal_numbers - decimal_numbers.min()) / value_range
X_scaled = (X_std * (10 - 1) + 1)
X_scaled_decimal_randomized = (X_scaled * random.randint(1, 10))

In [29]:
# decimal_number_dict

In [30]:
# Overwrite the `price` column with the rescaled, randomized values.
df['price'] = X_scaled_decimal_randomized

## Categorical data

In [31]:
# Working reference to the `item_name` column, which this section treats as
# categorical data.
category = df['item_name']

In [32]:
category

0             TV
1         Vaccum
2            Cup
3          Plate
4         Lotion
5     Toothbrush
6      Sanitizer
7           Wine
8       Yoga mat
9           Beer
10        Coffee
11        Lotion
12        Coffee
13            TV
14        Vaccum
15     Sanitizer
16          Wine
17         Plate
18            TV
19        Vaccum
Name: item_name, dtype: object

In [33]:
# Distinct item names, in order of first appearance.
unique_category = pd.unique(category)

In [34]:
# One random number is shared by every label; the running position keeps the
# labels distinct from each other.
rand_category = random.randint(0, 1000)
category_dict = {
    item: f"Category_{rand_category} {position}"
    for position, item in enumerate(unique_category)
}

In [35]:
category_dict

{'TV': 'Category_695 0',
 'Vaccum': 'Category_695 1',
 'Cup': 'Category_695 2',
 'Plate': 'Category_695 3',
 'Lotion': 'Category_695 4',
 'Toothbrush': 'Category_695 5',
 'Sanitizer': 'Category_695 6',
 'Wine': 'Category_695 7',
 'Yoga mat': 'Category_695 8',
 'Beer': 'Category_695 9',
 'Coffee': 'Category_695 10'}

In [36]:
# Swap every value in `item_name` for its generated label in `category_dict`.
anonymized_items = df['item_name'].map(category_dict)
df['item_name'] = anonymized_items

## Dates

In [37]:
# Working reference to the `purchased_date` column.
dates = df['purchased_date']

In [38]:
# Quick check: draw one random date between the earliest and latest purchase.
fake.date_between(
    start_date=dates.min(),
    end_date=dates.max(),
)

datetime.date(2020, 4, 26)

In [39]:
# Map each distinct purchase date to a random date drawn from the observed
# range, formatted as a 'YYYY-MM-DD' string.
# The range bounds are computed once instead of once per row, and
# dict.fromkeys() de-duplicates the column while keeping first-appearance
# order, so one replacement is generated per distinct date.
earliest, latest = dates.min(), dates.max()
dates_dict = {
    date: fake.date_between(start_date=earliest, end_date=latest).strftime('%Y-%m-%d')
    for date in dict.fromkeys(dates)
}

In [40]:
dates_dict

{Timestamp('2020-04-04 00:00:00'): '2020-04-23',
 Timestamp('2020-05-12 00:00:00'): '2020-05-04',
 Timestamp('2020-05-13 00:00:00'): '2020-05-01',
 Timestamp('2020-05-14 00:00:00'): '2020-04-19',
 Timestamp('2020-05-15 00:00:00'): '2020-04-04',
 Timestamp('2020-05-16 00:00:00'): '2020-04-16',
 Timestamp('2020-05-17 00:00:00'): '2020-04-16',
 Timestamp('2020-04-05 00:00:00'): '2020-04-08',
 Timestamp('2020-04-06 00:00:00'): '2020-05-14',
 Timestamp('2020-04-07 00:00:00'): '2020-05-07',
 Timestamp('2020-04-08 00:00:00'): '2020-04-19',
 Timestamp('2020-04-09 00:00:00'): '2020-04-14',
 Timestamp('2020-04-10 00:00:00'): '2020-05-15',
 Timestamp('2020-05-06 00:00:00'): '2020-04-15',
 Timestamp('2020-05-07 00:00:00'): '2020-04-08',
 Timestamp('2020-05-08 00:00:00'): '2020-04-24',
 Timestamp('2020-05-09 00:00:00'): '2020-04-24',
 Timestamp('2020-05-10 00:00:00'): '2020-04-13',
 Timestamp('2020-05-11 00:00:00'): '2020-05-13'}

In [41]:
# Swap every value in `purchased_date` for its entry in `dates_dict`.
anonymized_dates = df['purchased_date'].map(dates_dict)
df['purchased_date'] = anonymized_dates

## Email

### If the name column exists and was passed for anonymization, create the email from the name; otherwise create new emails

In [44]:
# Distinct values of the `email` column, in order of first appearance.
unique_email = pd.unique(df['email'])

In [46]:
# Build the real-email -> fake-email lookup.
# `fake.unique` guarantees no address is handed out twice. Plain
# `fake.email()` can repeat, which would give two different people the same
# replacement address. Faker raises UniquenessException if it cannot find a
# fresh value.
email_dict = {email: fake.unique.email() for email in unique_email}

In [47]:
email_dict

{'donna.martin@gmail.com': 'haley13@hotmail.com',
 'bobby.bradley@gmail.com': 'rodriguezshannon@yahoo.com',
 'austin.thomas@gmail.com': 'yblair@bennett-gardner.net',
 'jonathan.medina@gmail.com': 'jennifer80@smith.biz',
 'brandon.beasley@gmail.com': 'alison16@yahoo.com',
 'susan.morales@gmail.com': 'jessicaklein@romero-ryan.com',
 'rachael.massey@gmail.com': 'ericsmith@rose.info',
 'ross.strickland@gmail.com': 'aaron46@cohen.com',
 'jennifer.cox@gmail.com': 'johnfleming@yahoo.com',
 'danny.lynn.dds@gmail.com': 'psharp@gmail.com',
 'ashley.nichols@gmail.com': 'curtis35@bishop.com',
 'angela.oneill@gmail.com': 'veronicarodriguez@yahoo.com',
 'cody.moreno@gmail.com': 'ibrewer@krueger.com',
 'douglas.atkinson@gmail.com': 'barbara07@hotmail.com',
 'ellen.mccarthy@gmail.com': 'john91@greene-ortiz.net'}

In [48]:
# Store the generated addresses in a separate `email_new` column; the
# `email` column is left as it is by this cell.
email_replacements = df['email'].map(email_dict)
df['email_new'] = email_replacements

### Email from name

In [52]:
# Derive the address from the `name` column: whitespace runs become dots, a
# fixed fake domain is appended and everything is lower-cased.
# The pattern is a raw string and `regex=True` is passed explicitly: pandas
# >= 2.0 treats the pattern as a literal by default, which would leave the
# spaces in place. `df['name']` is used instead of attribute access so the
# column lookup cannot be confused with a DataFrame attribute.
# NOTE(review): expects `name` to already hold the pseudonyms at this point.
name_slug = df['name'].str.replace(r'\s+', '.', regex=True)
df['email'] = (name_slug + '@fakeemail.com').str.lower()

# Original vs Final

In [53]:
df_original

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date,email
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04,donna.martin@gmail.com
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12,donna.martin@gmail.com
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13,bobby.bradley@gmail.com
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14,austin.thomas@gmail.com
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15,jonathan.medina@gmail.com
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16,jonathan.medina@gmail.com
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17,brandon.beasley@gmail.com
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04,austin.thomas@gmail.com
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05,susan.morales@gmail.com
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06,rachael.massey@gmail.com


In [54]:
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date,email,email_new
0,171115a4-59d3-403e-ba3c-af3735960be6,Robert Harper,Category_695 0,50.0,4e5fe3a3-18b2-4f11-99a6-ebb06909be24,1,2020-04-23,robert.harper@fakeemail.com,haley13@hotmail.com
1,171115a4-59d3-403e-ba3c-af3735960be6,Robert Harper,Category_695 1,26.14126,8625cfd1-5be4-40b6-8382-c64fdac21edc,2,2020-05-04,robert.harper@fakeemail.com,haley13@hotmail.com
2,d279af76-46b7-4c69-b819-d44999233806,Morgan Hawkins,Category_695 2,7.396571,9208afa1-602a-4c15-a025-8d5a8fc3649c,1,2020-05-01,morgan.hawkins@fakeemail.com,rodriguezshannon@yahoo.com
3,85dbeaf9-5a67-4aaf-9926-8e8dd99ecc4b,Daniel Jones,Category_695 3,5.755614,bd4a8b96-8d65-4551-8cc9-e003fa2a0d4a,1,2020-04-19,daniel.jones@fakeemail.com,yblair@bennett-gardner.net
4,22dc32e4-9170-4ae8-aa69-f647cf519a5d,Leah Harrell,Category_695 4,5.502984,a42c9f20-7595-4f43-adbb-e19ec22a8899,2,2020-04-04,leah.harrell@fakeemail.com,jennifer80@smith.biz
5,22dc32e4-9170-4ae8-aa69-f647cf519a5d,Leah Harrell,Category_695 5,5.848928,78fd542f-f88c-4fde-a1be-4c810b25ae5d,1,2020-04-16,leah.harrell@fakeemail.com,jennifer80@smith.biz
6,e38a7c8e-8df0-448b-a1e1-24151bd303db,Paul Cox,Category_695 6,5.0,e56ce237-da8a-4970-b1ff-0e42ec9404d2,4,2020-04-16,paul.cox@fakeemail.com,alison16@yahoo.com
7,85dbeaf9-5a67-4aaf-9926-8e8dd99ecc4b,Daniel Jones,Category_695 7,8.058871,11214f06-a2c2-48bc-8cea-e13db1ab05ae,1,2020-04-23,daniel.jones@fakeemail.com,yblair@bennett-gardner.net
8,9d623d03-a821-47fb-b8b1-0888f0241a73,Vincent White,Category_695 8,14.854845,082fa5f6-a10b-404e-b431-72492d680751,2,2020-04-08,vincent.white@fakeemail.com,jessicaklein@romero-ryan.com
9,3525444f-b98c-454a-9851-78df523144a0,William Young,Category_695 9,5.050071,1ae34ca8-146e-4af6-ad50-74a249e5cb28,6,2020-05-14,william.young@fakeemail.com,ericsmith@rose.info
