# First POC for dataset anonymization

## Imports

In [1]:
import pandas as pd
import random
import datetime as dt
from faker import Faker

In [2]:
# Seed every random source used in this notebook so that Restart & Run All
# reproduces the same fake values.
# NOTE(review): a fixed seed is for POC reproducibility only. It makes the generated
# values predictable, so remove it (or keep it secret) before anonymizing real data.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
fake = Faker()

In [3]:
# Smoke-test the generator: emit twenty fake names, one per line.
print(*(fake.name() for _ in range(20)), sep="\n")

Chad Stafford
Melissa Randolph
Victoria Webb
Laurie Brady
Angela Bates
Craig Gomez
Elizabeth Taylor
Shawn Garcia
Jeffrey Williams
Ruben Duran
Lynn Blackwell
George Thornton
Alyssa Williams
Miss Meghan Mejia
Claudia Rogers
Mandy White
Kenneth Andrews
Veronica Armstrong
Jeffrey Moore
Joseph Houston


## Create and import a sample dataset

In [4]:
# Load the sample purchases; 'purchased_date' is parsed into datetimes on read.
df = pd.read_csv(
    filepath_or_buffer='../datasets/test_dataset1.csv',
    parse_dates=['purchased_date'],
)

In [5]:
# Keep an independent (deep) copy of the loaded frame; df itself is overwritten
# column by column in the sections below.
df_original = df.copy(deep=True)

In [6]:
df

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


## Replace names

In [7]:
# Read from the untouched copy: once the replacement cell has run, df['name'] holds
# fake names, and re-running this cell on df would collect those instead of the originals.
unique_names = df_original['name'].unique()

In [8]:
unique_names

array(['Donna Martin', 'Bobby Bradley', 'Austin Thomas',
       'Jonathan Medina', 'Brandon Beasley', 'Susan Morales',
       'Rachael Massey', 'Ross Strickland', 'Jennifer Cox',
       'Danny Lynn DDS', 'Ashley Nichols', 'Angela Oneill', 'Cody Moreno',
       'Douglas Atkinson', 'Ellen Mccarthy'], dtype=object)

In [9]:
# One pseudonym per distinct original name.
# fake.unique never hands out the same value twice, so two different people cannot
# be merged into a single pseudonym (plain fake.name() may repeat a name).
name_dict = {name: fake.unique.name() for name in unique_names}

In [10]:
name_dict

{'Donna Martin': 'Michael Coleman',
 'Bobby Bradley': 'David Vazquez',
 'Austin Thomas': 'Luis Smith',
 'Jonathan Medina': 'Sarah Calderon',
 'Brandon Beasley': 'Kristine Smith',
 'Susan Morales': 'Kyle Farmer',
 'Rachael Massey': 'Dr. Ashley Berry',
 'Ross Strickland': 'Anna Kennedy',
 'Jennifer Cox': 'Rebecca Guerra',
 'Danny Lynn DDS': 'Carmen Adkins',
 'Ashley Nichols': 'Kathy Mclaughlin',
 'Angela Oneill': 'Ricky Chang',
 'Cody Moreno': 'Alicia Benton',
 'Douglas Atkinson': 'Meagan Davidson',
 'Ellen Mccarthy': 'Michael Johnson'}

In [11]:
# Map from the untouched copy so this cell is idempotent: mapping the already-replaced
# column would find none of its values among name_dict's keys and turn every name into NaN.
df['name'] = df_original['name'].map(name_dict)

## Replace ids

In [12]:
# Read from the untouched copy: once the replacement cell has run, df['user_id'] holds
# UUID strings, and re-running this cell on df would collect those instead of the original ids.
unique_ids = df_original['user_id'].unique()

In [13]:
# Pair every distinct user id with a freshly generated UUID string.
id_dict = dict((original_id, fake.uuid4()) for original_id in unique_ids)

In [14]:
id_dict

{1004: 'a97f8a93-46bf-4017-93bf-abb92515a7c2',
 1003: 'b972d5b3-3bfa-4c6f-915a-3fc0d3143f65',
 1011: '7049726f-0dc9-4de3-b67a-9a55f012aed1',
 1000: '51628526-b638-495e-ba1c-24e93ae1d9ca',
 1019: 'bdb2fa65-9b80-4331-851b-e1c9450c92d7',
 1014: 'e4d2fa90-181c-42bf-87be-2259ff852ed1',
 1001: 'cac61fda-204e-4024-a7db-016552871425',
 1005: '488cf080-4b4e-42df-a934-89709bb40c64',
 1020: '0e313785-3b48-48d0-852f-a327553426f0',
 1006: 'add96325-ed96-4b9f-b8c2-ccdf21629e52',
 1015: '224c4c78-4787-45b0-b289-3a464e0c97f4',
 1009: 'd8ed66ee-fa3b-4843-a567-dc32babeb944',
 1016: '13f3a001-f60f-40c3-9885-7d147909c585'}

In [15]:
# Map from the untouched copy so this cell is idempotent: mapping the already-replaced
# column would find none of its values among id_dict's keys and turn every id into NaN.
df['user_id'] = df_original['user_id'].map(id_dict)

## Replace whole numbers

In [16]:
# Take the quantities from the untouched copy so that re-running this section rescales
# the original values rather than the already-rescaled ones written back into df.
whole_numbers = df_original['purchased_quantity']

In [17]:
whole_numbers

0     1
1     2
2     1
3     1
4     2
5     1
6     3
7     1
8     2
9     4
10    5
11    3
12    4
13    2
14    6
15    1
16    2
17    4
18    1
19    1
Name: purchased_quantity, dtype: int64

In [52]:
# Min-max normalize the quantities to [0, 1], stretch them to [1, 10], multiply the
# whole column by a single random integer factor in [1, 10], then truncate to int.
# Guard: for a constant column max == min, so the unguarded division is 0 / 0 = NaN and
# astype(int) raises. Dividing by 1 instead leaves X_std at 0 for every row.
# NOTE(review): this is a monotonic linear rescale, so ordering and relative spacing of
# the original values survive it — confirm that is acceptable for the anonymization goal.
value_span = whole_numbers.max() - whole_numbers.min()
X_std = (whole_numbers - whole_numbers.min()) / (value_span if value_span != 0 else 1)
X_scaled = X_std * (10 - 1) + 1
X_scaled_whole_randomized = (X_scaled * random.randint(1, 10)).astype(int)

In [53]:
X_scaled_whole_randomized

0      7
1     19
2      7
3      7
4     19
5      7
6     32
7      7
8     19
9     44
10    57
11    32
12    44
13    19
14    70
15     7
16    19
17    44
18     7
19     7
Name: purchased_quantity, dtype: int64

In [91]:
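# Alternative approach, not used: map each quantity to a random integer drawn
# from the column's [min, max] range. The notebook assigns the rescaled series
# (X_scaled_whole_randomized) to the column instead.
# whole_number_dict = {
#     number: fake.random_int(
#         min=min(whole_numbers), max=max(whole_numbers)
#     )
#     for number in whole_numbers
# }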

In [54]:
# whole_number_dict

In [56]:
# Overwrite the quantity column with the rescaled, randomized integers
# (assignment aligns the series to df on the index).
df['purchased_quantity'] = X_scaled_whole_randomized

## Replace decimal numbers

In [58]:
# Take the amounts from the untouched copy so that re-running this section rescales
# the original values rather than the already-rescaled ones written back into df.
decimal_numbers = df_original['amount']

In [59]:
# Min-max normalize the amounts to [0, 1], stretch them to [1, 10], then multiply the
# whole column by a single random integer factor in [1, 10] (values stay floats).
# Guard: for a constant column max == min, so the unguarded division is 0 / 0 and every
# amount silently becomes NaN. Dividing by 1 instead leaves X_std at 0 for every row.
# NOTE(review): this is a monotonic linear rescale, so ordering and relative spacing of
# the original values survive it — confirm that is acceptable for the anonymization goal.
value_span = decimal_numbers.max() - decimal_numbers.min()
X_std = (decimal_numbers - decimal_numbers.min()) / (value_span if value_span != 0 else 1)
X_scaled = X_std * (10 - 1) + 1
X_scaled_decimal_randomized = X_scaled * random.randint(1, 10)

In [60]:
# decimal_number_dict

In [61]:
# Overwrite the amount column with the rescaled, randomized floats
# (assignment aligns the series to df on the index).
df['amount'] = X_scaled_decimal_randomized

## Categorical data

In [62]:
# Read from the untouched copy: once the replacement cell has run, df['item'] holds the
# generated "Category_..." labels, and re-running this section on df would relabel those.
category = df_original['item']

In [63]:
category

0             TV
1         Vaccum
2            Cup
3          Plate
4         Lotion
5     Toothbrush
6      Sanitizer
7           Wine
8       Yoga mat
9           Beer
10        Coffee
11        Lotion
12        Coffee
13            TV
14        Vaccum
15     Sanitizer
16          Wine
17         Plate
18            TV
19        Vaccum
Name: item, dtype: object

In [64]:
# Distinct item labels, in order of first appearance.
unique_category = pd.unique(category)

In [65]:
# A single random number is shared by every label; the position of each item among
# the distinct values is appended to tell the labels apart.
rand_category = random.randint(0, 1000)
category_dict = dict(
    (item_label, "Category_" + str(rand_category) + " " + str(position))
    for position, item_label in enumerate(unique_category)
)

In [66]:
category_dict

{'TV': 'Category_722 0',
 'Vaccum': 'Category_722 1',
 'Cup': 'Category_722 2',
 'Plate': 'Category_722 3',
 'Lotion': 'Category_722 4',
 'Toothbrush': 'Category_722 5',
 'Sanitizer': 'Category_722 6',
 'Wine': 'Category_722 7',
 'Yoga mat': 'Category_722 8',
 'Beer': 'Category_722 9',
 'Coffee': 'Category_722 10'}

In [67]:
# Map from the untouched copy so this cell is idempotent: mapping the already-replaced
# column would find none of its values among category_dict's keys and turn every item into NaN.
df['item'] = df_original['item'].map(category_dict)

## Dates

In [68]:
# Read from the untouched copy: once the replacement cell has run, df['purchased_date']
# holds formatted strings rather than datetimes, so re-running this section on df would
# compute its min/max bounds from the replaced values.
dates = df_original['purchased_date']

In [69]:
# Probe call: draw one random date between the column's earliest and latest value
# to inspect what date_between returns before building the full mapping.
fake.date_between(start_date=dates.min(), end_date=dates.max())

datetime.date(2020, 4, 9)

In [70]:
# Map every distinct purchase date to a random date, formatted as 'YYYY-MM-DD', drawn
# from the column's own [min, max] range.
# The bounds are computed once rather than once per row, and each distinct date is
# visited once rather than once per occurrence; the resulting set of keys is unchanged.
# NOTE(review): two different original dates may receive the same replacement, and the
# values are strings, so the mapped column no longer has a datetime dtype — confirm
# both are acceptable.
earliest_date, latest_date = dates.min(), dates.max()
dates_dict = {
    original_date: fake.date_between(
        start_date=earliest_date, end_date=latest_date
    ).strftime('%Y-%m-%d')
    for original_date in dates.drop_duplicates()
}

In [71]:
dates_dict

{Timestamp('2020-04-04 00:00:00'): '2020-05-02',
 Timestamp('2020-05-12 00:00:00'): '2020-05-09',
 Timestamp('2020-05-13 00:00:00'): '2020-05-10',
 Timestamp('2020-05-14 00:00:00'): '2020-04-22',
 Timestamp('2020-05-15 00:00:00'): '2020-05-14',
 Timestamp('2020-05-16 00:00:00'): '2020-05-05',
 Timestamp('2020-05-17 00:00:00'): '2020-05-01',
 Timestamp('2020-04-05 00:00:00'): '2020-04-26',
 Timestamp('2020-04-06 00:00:00'): '2020-04-27',
 Timestamp('2020-04-07 00:00:00'): '2020-04-07',
 Timestamp('2020-04-08 00:00:00'): '2020-04-18',
 Timestamp('2020-04-09 00:00:00'): '2020-04-17',
 Timestamp('2020-04-10 00:00:00'): '2020-04-23',
 Timestamp('2020-05-06 00:00:00'): '2020-05-12',
 Timestamp('2020-05-07 00:00:00'): '2020-04-05',
 Timestamp('2020-05-08 00:00:00'): '2020-04-07',
 Timestamp('2020-05-09 00:00:00'): '2020-04-28',
 Timestamp('2020-05-10 00:00:00'): '2020-04-26',
 Timestamp('2020-05-11 00:00:00'): '2020-05-11'}

In [72]:
# Map from the untouched copy so this cell is idempotent: the already-replaced column
# holds strings, none of which are keys of dates_dict, so mapping it again would turn
# every date into NaN.
df['purchased_date'] = df_original['purchased_date'].map(dates_dict)

## Original vs Final

In [73]:
df_original

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


In [74]:
df

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,a97f8a93-46bf-4017-93bf-abb92515a7c2,Michael Coleman,Category_722 0,70.0,14,7,2020-05-02
1,a97f8a93-46bf-4017-93bf-abb92515a7c2,Michael Coleman,Category_722 1,36.597765,15,19,2020-05-09
2,b972d5b3-3bfa-4c6f-915a-3fc0d3143f65,David Vazquez,Category_722 2,10.355199,44,7,2020-05-10
3,7049726f-0dc9-4de3-b67a-9a55f012aed1,Luis Smith,Category_722 3,8.05786,156,7,2020-04-22
4,51628526-b638-495e-ba1c-24e93ae1d9ca,Sarah Calderon,Category_722 4,7.704178,23,19,2020-05-14
5,51628526-b638-495e-ba1c-24e93ae1d9ca,Sarah Calderon,Category_722 5,8.188499,123,7,2020-05-05
6,bdb2fa65-9b80-4331-851b-e1c9450c92d7,Kristine Smith,Category_722 6,7.0,12,32,2020-05-01
7,7049726f-0dc9-4de3-b67a-9a55f012aed1,Luis Smith,Category_722 7,11.28242,122,7,2020-05-02
8,e4d2fa90-181c-42bf-87be-2259ff852ed1,Kyle Farmer,Category_722 8,20.796783,1212,19,2020-04-26
9,cac61fda-204e-4024-a7db-016552871425,Dr. Ashley Berry,Category_722 9,7.070099,42,44,2020-04-27
