# First POC for dataset anonymization

## Imports

In [1]:
import pandas as pd
import random
import datetime as dt
from faker import Faker

In [2]:
# Single shared Faker generator used for every replacement value below.
# No seed is set in the cells shown here, so generated values differ per run.
fake = Faker()

In [3]:
# Smoke test: show 20 throwaway names, one per line, to see what Faker produces.
print("\n".join(fake.name() for _ in range(20)))

Mitchell Scott
Danielle Young
Ryan Lopez
Matthew Morris
William Hall
Jonathan Francis
John Henry
Sarah Rodriguez
John Lyons
David Dennis
Martha Lamb
James Camacho
Cynthia Sims
Julia Gonzalez
Jonathan Martinez
Jack Benitez
Anthony Williams
Jacqueline Richards
Jessica Perez
Jeffrey Zuniga


## Create and import a sample dataset

In [4]:
# Load the sample purchases; the path is relative to the working directory.
# purchased_date is parsed as a date column so min()/max() can be taken on it
# later when drawing replacement dates.
df = pd.read_csv('../datasets/test_dataset1.csv', parse_dates=['purchased_date'])

In [5]:
# Keep an untouched snapshot: `df` is overwritten column by column below, and
# the snapshot is displayed next to the result at the end of the notebook.
df_original = df.copy()

In [6]:
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


## Replace names

In [7]:
unique_names = df['name'].unique()

In [8]:
unique_names

array(['Donna Martin', 'Bobby Bradley', 'Austin Thomas',
       'Jonathan Medina', 'Brandon Beasley', 'Susan Morales',
       'Rachael Massey', 'Ross Strickland', 'Jennifer Cox',
       'Danny Lynn DDS', 'Ashley Nichols', 'Angela Oneill', 'Cody Moreno',
       'Douglas Atkinson', 'Ellen Mccarthy'], dtype=object)

In [9]:
# Give every distinct real name its own pseudonym. `fake.unique` remembers the
# values it has already returned, so two different people can never collapse
# into the same fake name (plain `fake.name()` may repeat). Faker raises
# UniquenessException if it cannot find an unused value.
name_dict = {name: fake.unique.name() for name in unique_names}

In [10]:
name_dict

{'Donna Martin': 'Eddie Bowman',
 'Bobby Bradley': 'Harry Ochoa',
 'Austin Thomas': 'Mary Davis',
 'Jonathan Medina': 'Michael Hicks',
 'Brandon Beasley': 'Dustin Perez',
 'Susan Morales': 'David Guerra',
 'Rachael Massey': 'Dylan Chavez',
 'Ross Strickland': 'Robin Fox',
 'Jennifer Cox': 'Thomas Byrd',
 'Danny Lynn DDS': 'Robert Smith',
 'Ashley Nichols': 'Ronald Landry',
 'Angela Oneill': 'Catherine Meyer',
 'Cody Moreno': 'Jose Jones',
 'Douglas Atkinson': 'Miranda Dodson',
 'Ellen Mccarthy': 'Andrew Yang'}

In [11]:
# Overwrite the real names with their pseudonyms. Series.map yields NaN for
# any value that is not a key of name_dict, so re-running this cell on the
# already-replaced column turns pseudonyms into NaN (unless one happens to
# equal an original name).
df['name'] = df['name'].map(name_dict)

## Replace ids

In [12]:
unique_ids = df['user_id'].unique()

In [13]:
# Pair each distinct user id with a freshly generated random UUID4 string.
id_dict = {user_id: fake.uuid4() for user_id in unique_ids}

In [14]:
id_dict

{1004: 'ff32e534-1157-4eb8-8113-0aefd7c29ed6',
 1003: 'c950535f-bb76-4ec6-b999-0e10a287832e',
 1011: 'd21ed990-1320-441c-aced-971aa2949d1b',
 1000: 'e89df8e2-c426-4802-b4a7-96840bb9d41e',
 1019: 'f38b9915-b871-43ed-ac52-69d8f9cb0d18',
 1014: 'd5db4112-0ce3-49c7-81eb-a7a5fc5eff7e',
 1001: '01865500-de47-4c20-99f9-57e9fb7c7927',
 1005: '787a3ab8-be37-4987-afde-d668c9289f0a',
 1020: 'a0ee4f13-f8cf-44ad-afb9-b254c10c3ed6',
 1006: '4ff3b5bd-11ca-42e8-a450-0ea1bb364eb7',
 1015: '9571bbd2-c35c-4d8e-b11c-1d6c1ef8c1b8',
 1009: '18587227-51dc-490e-b582-e31437f10406',
 1016: '9fd11e7c-10b5-431f-b8fb-2ae79b517713'}

In [15]:
# Swap the numeric ids for their UUID strings. After this the column holds
# strings, none of which is a key of id_dict, so re-running this cell would
# map every row to NaN.
df['user_id'] = df['user_id'].map(id_dict)

## Replace whole numbers

In [16]:
whole_numbers = df['quantity']

In [17]:
whole_numbers

0     1
1     2
2     1
3     1
4     2
5     1
6     3
7     1
8     2
9     4
10    5
11    3
12    4
13    2
14    6
15    1
16    2
17    4
18    1
19    1
Name: quantity, dtype: int64

In [18]:
# Min-max normalise the quantities into [0, 1]. A column whose values are all
# equal has a zero range: the division would yield NaN (0 / 0) and the integer
# cast at the end would raise, so that case is mapped to 0.0 explicitly.
quantity_min = whole_numbers.min()
quantity_range = whole_numbers.max() - quantity_min
if quantity_range == 0:
    X_std = whole_numbers * 0.0
else:
    X_std = (whole_numbers - quantity_min) / quantity_range
# Stretch the normalised values onto the interval [1, 10].
X_scaled = X_std * (10 - 1) + 1
# Multiply every row by the same random integer factor (1-10), then truncate
# to int so the column keeps an integer dtype.
X_scaled_whole_randomized = (X_scaled * random.randint(1, 10)).astype(int)

In [19]:
X_scaled_whole_randomized

0      1
1      2
2      1
3      1
4      2
5      1
6      4
7      1
8      2
9      6
10     8
11     4
12     6
13     2
14    10
15     1
16     2
17     6
18     1
19     1
Name: quantity, dtype: int64

In [20]:
# whole_number_dict = {
#     number: fake.random_int(
#         min=min(whole_numbers), max=max(whole_numbers)
#     )
#     for number in whole_numbers
# }

In [21]:
# whole_number_dict

In [22]:
df['quantity'] = X_scaled_whole_randomized

## Replace decimal numbers

In [23]:
decimal_numbers = df['price']

In [24]:
# Min-max normalise the prices into [0, 1]. If every price is identical the
# range is zero and the division would silently fill the column with NaN
# (0 / 0), so that case is mapped to 0.0 explicitly.
price_min = decimal_numbers.min()
price_range = decimal_numbers.max() - price_min
if price_range == 0:
    X_std = decimal_numbers * 0.0
else:
    X_std = (decimal_numbers - price_min) / price_range
# Stretch the normalised values onto the interval [1, 10].
X_scaled = X_std * (10 - 1) + 1
# Multiply every row by the same random integer factor (1-10); prices stay
# floating point, so no cast is applied.
X_scaled_decimal_randomized = X_scaled * random.randint(1, 10)

In [25]:
# decimal_number_dict

In [26]:
df['price'] = X_scaled_decimal_randomized

## Categorical data

In [27]:
category = df['item_name']

In [28]:
category

0             TV
1         Vaccum
2            Cup
3          Plate
4         Lotion
5     Toothbrush
6      Sanitizer
7           Wine
8       Yoga mat
9           Beer
10        Coffee
11        Lotion
12        Coffee
13            TV
14        Vaccum
15     Sanitizer
16          Wine
17         Plate
18            TV
19        Vaccum
Name: item_name, dtype: object

In [29]:
unique_category = category.unique()

In [30]:
# One random tag (0-1000) is drawn once and shared by every label; the
# position of each distinct item in unique_category makes the label unique.
rand_category = random.randint(0, 1000)
category_dict = dict(
    (item, "Category_" + str(rand_category) + " " + str(position))
    for position, item in enumerate(unique_category)
)

In [31]:
category_dict

{'TV': 'Category_625 0',
 'Vaccum': 'Category_625 1',
 'Cup': 'Category_625 2',
 'Plate': 'Category_625 3',
 'Lotion': 'Category_625 4',
 'Toothbrush': 'Category_625 5',
 'Sanitizer': 'Category_625 6',
 'Wine': 'Category_625 7',
 'Yoga mat': 'Category_625 8',
 'Beer': 'Category_625 9',
 'Coffee': 'Category_625 10'}

In [32]:
# Replace each item name with its opaque label. The labels are not keys of
# category_dict, so re-running this cell on the already-replaced column maps
# every row to NaN.
df['item_name'] = df['item_name'].map(category_dict)

## Replace dates

In [33]:
dates = df['purchased_date']

In [34]:
fake.date_between(start_date=dates.min(), end_date=dates.max())

datetime.date(2020, 4, 30)

In [35]:
# Draw one random replacement date (formatted 'YYYY-MM-DD') per distinct
# purchase date, taken from the same overall date range as the originals.
# The range bounds do not change inside the loop, so they are computed once
# rather than once per row, and duplicates are dropped up front so a repeated
# date is not redrawn only to be overwritten. Iterating the de-duplicated
# Series keeps the keys as Timestamps in first-occurrence order.
first_date, last_date = dates.min(), dates.max()
dates_dict = {
    date: fake.date_between(start_date=first_date, end_date=last_date).strftime('%Y-%m-%d')
    for date in dates.drop_duplicates()
}

In [36]:
dates_dict

{Timestamp('2020-04-04 00:00:00'): '2020-05-11',
 Timestamp('2020-05-12 00:00:00'): '2020-05-14',
 Timestamp('2020-05-13 00:00:00'): '2020-04-19',
 Timestamp('2020-05-14 00:00:00'): '2020-04-20',
 Timestamp('2020-05-15 00:00:00'): '2020-04-04',
 Timestamp('2020-05-16 00:00:00'): '2020-05-07',
 Timestamp('2020-05-17 00:00:00'): '2020-04-14',
 Timestamp('2020-04-05 00:00:00'): '2020-05-02',
 Timestamp('2020-04-06 00:00:00'): '2020-04-23',
 Timestamp('2020-04-07 00:00:00'): '2020-04-23',
 Timestamp('2020-04-08 00:00:00'): '2020-05-03',
 Timestamp('2020-04-09 00:00:00'): '2020-04-19',
 Timestamp('2020-04-10 00:00:00'): '2020-04-21',
 Timestamp('2020-05-06 00:00:00'): '2020-04-06',
 Timestamp('2020-05-07 00:00:00'): '2020-04-25',
 Timestamp('2020-05-08 00:00:00'): '2020-05-04',
 Timestamp('2020-05-09 00:00:00'): '2020-04-10',
 Timestamp('2020-05-10 00:00:00'): '2020-04-23',
 Timestamp('2020-05-11 00:00:00'): '2020-04-22'}

In [37]:
# Replace each purchase date with its random substitute. The substitutes are
# 'YYYY-MM-DD' strings, so the column no longer holds datetimes afterwards;
# re-running this cell would look those strings up among the Timestamp keys
# of dates_dict and map every row to NaN.
df['purchased_date'] = df['purchased_date'].map(dates_dict)

# Original vs Final

In [38]:
df_original

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


In [39]:
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,ff32e534-1157-4eb8-8113-0aefd7c29ed6,Eddie Bowman,Category_625 0,40.0,14,1,2020-05-11
1,ff32e534-1157-4eb8-8113-0aefd7c29ed6,Eddie Bowman,Category_625 1,20.913008,15,2,2020-05-14
2,c950535f-bb76-4ec6-b999-0e10a287832e,Harry Ochoa,Category_625 2,5.917257,44,1,2020-04-19
3,d21ed990-1320-441c-aced-971aa2949d1b,Mary Davis,Category_625 3,4.604491,156,1,2020-04-20
4,e89df8e2-c426-4802-b4a7-96840bb9d41e,Michael Hicks,Category_625 4,4.402387,23,2,2020-04-04
5,e89df8e2-c426-4802-b4a7-96840bb9d41e,Michael Hicks,Category_625 5,4.679142,123,1,2020-05-07
6,f38b9915-b871-43ed-ac52-69d8f9cb0d18,Dustin Perez,Category_625 6,4.0,12,4,2020-04-14
7,d21ed990-1320-441c-aced-971aa2949d1b,Mary Davis,Category_625 7,6.447097,122,1,2020-05-11
8,d5db4112-0ce3-49c7-81eb-a7a5fc5eff7e,David Guerra,Category_625 8,11.883876,1212,2,2020-05-02
9,01865500-de47-4c20-99f9-57e9fb7c7927,Dylan Chavez,Category_625 9,4.040057,42,6,2020-04-23
