# First POC for dataset anonymization

## Imports

In [72]:
import pandas as pd
import random
import datetime as dt
from faker import Faker

In [73]:
# Faker instance used below to generate replacement names, ids, whole numbers and dates.
# NOTE(review): no seed is set in the visible cells, so every run yields different fake values.
fake = Faker()

In [74]:
# Quick look at the generator: 20 fake names, one per line.
print(*(fake.name() for _ in range(20)), sep='\n')

William Hobbs
Lauren Thompson
Kevin Williams
Nicole Rojas
Dr. Patrick Cruz PhD
Tami Sullivan
Steven Rogers
Gabriella Brown
Mallory Watson
Julie Adams
Glen Park
Gary Lee
Kirsten Archer
Yesenia Tate
Mark Ward
Susan Lopez
Catherine Abbott
Holly Ballard
William Horn
Jay Nguyen


## Load a sample dataset

In [75]:
# Load the sample purchases; 'purchased_date' is parsed into datetimes while reading.
# NOTE(review): the frame displayed below carries an 'Unnamed: 0' column — presumably the
# index that was saved with the CSV; confirm, and consider index_col=0 if it is not needed.
df = pd.read_csv('../datasets/test_dataset1.csv', parse_dates=['purchased_date'])

In [76]:
# Keep an untouched copy so the original can be shown next to the modified frame at the end.
df_original = df.copy()

In [77]:
df

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


## Replace names

In [78]:
# Distinct names in the 'name' column; each one gets a single replacement below,
# so repeated rows of the same person stay consistent.
unique_names = df['name'].unique()

In [79]:
unique_names

array(['Donna Martin', 'Bobby Bradley', 'Austin Thomas',
       'Jonathan Medina', 'Brandon Beasley', 'Susan Morales',
       'Rachael Massey', 'Ross Strickland', 'Jennifer Cox',
       'Danny Lynn DDS', 'Ashley Nichols', 'Angela Oneill', 'Cody Moreno',
       'Douglas Atkinson', 'Ellen Mccarthy'], dtype=object)

In [80]:
def _distinct_fake_names(real_names):
    """Return a ``{real name: fake name}`` lookup with no shared replacements.

    A candidate from ``fake.name()`` is redrawn when it was already handed out
    (otherwise two different people would collapse into one after mapping) or
    when it equals one of the real names (which would leave a real name in the
    anonymized output).
    """
    taken = set(real_names)
    lookup = {}
    for real_name in real_names:
        candidate = fake.name()
        while candidate in taken:
            candidate = fake.name()
        taken.add(candidate)
        lookup[real_name] = candidate
    return lookup


name_dict = _distinct_fake_names(unique_names)

In [81]:
name_dict

{'Donna Martin': 'Kristin Gonzalez',
 'Bobby Bradley': 'Katie Mccormick',
 'Austin Thomas': 'Robert Miller',
 'Jonathan Medina': 'Taylor Smith',
 'Brandon Beasley': 'William Rosales',
 'Susan Morales': 'Angela Martinez',
 'Rachael Massey': 'Jason Carter',
 'Ross Strickland': 'Nancy Harris',
 'Jennifer Cox': 'Andrew Mcintosh',
 'Danny Lynn DDS': 'Tina Mitchell',
 'Ashley Nichols': 'Mark Garcia',
 'Angela Oneill': 'Whitney Brock',
 'Cody Moreno': 'Amanda Martin MD',
 'Douglas Atkinson': 'Mr. Ivan Salazar MD',
 'Ellen Mccarthy': 'Sheena Atkinson'}

In [82]:
# Swap every name for its fake counterpart.
# Series.map yields NaN for values that are not keys of name_dict, so re-running only this
# cell on already-replaced names would generally turn them into NaN; rebuild
# unique_names / name_dict first when re-running.
df['name'] = df['name'].map(name_dict)

## Replace ids

In [83]:
# Distinct user ids; one replacement id is generated per entry below.
unique_ids = df['user_id'].unique()

In [84]:
# Lookup from each distinct user id to a freshly generated UUID4.
id_dict = dict((user_id, fake.uuid4()) for user_id in unique_ids)

In [85]:
id_dict

{1004: 'cb57a54b-4ee0-4e3f-9ecc-d917b368bcfd',
 1003: 'f4d555f4-f715-4f54-9e17-18dde69ad6f4',
 1011: '581d2436-9a71-4329-84b6-8f648a6af777',
 1000: '172cc469-9271-4650-926b-455b9b337411',
 1019: 'dec64b5f-8a3e-48b4-8143-78cefa4a6283',
 1014: '5c2041ba-c4fb-48ce-859a-8c0bfc0652c0',
 1001: 'f26141f9-a74c-442e-8545-5ce9c0d6b9d2',
 1005: '3fbf0820-4f3a-4566-aced-a635542df823',
 1020: 'a4b75364-804c-4f95-b927-f31f95fdc48a',
 1006: '2c80313d-3b0b-464e-9071-21a7ddbce865',
 1015: '1e7db155-07b7-4f03-9e8f-a3d5d9ae23d4',
 1009: 'bc01178b-de82-48f1-8a11-fb83745f04eb',
 1016: '1ac52da8-9810-460d-9c9e-3c439a46e5c7'}

In [86]:
# Replace the numeric ids with their UUID strings; rows of the same user keep a shared id.
df['user_id'] = df['user_id'].map(id_dict)

## Replace whole numbers

In [90]:
# Column holding the whole-number quantities to replace.
whole_numbers = df['purchased_quantity']

In [91]:
# Map each distinct quantity to a random integer from the observed [min, max] range.
# The bounds are computed once and one value is drawn per distinct quantity, instead of
# rescanning the whole column and redrawing for every row.
# NOTE(review): draws are independent, so two quantities can share a replacement and a
# quantity can map to itself.
quantity_min, quantity_max = min(whole_numbers), max(whole_numbers)
whole_number_dict = {
    number: fake.random_int(min=quantity_min, max=quantity_max)
    for number in whole_numbers.drop_duplicates()
}

In [92]:
whole_number_dict

{1: 6, 2: 2, 3: 5, 4: 1, 5: 3, 6: 2}

In [93]:
# Apply the quantity lookup built from this same column.
# Not safe to re-run on its own: a replacement value that is not itself a key of
# whole_number_dict would be mapped to NaN.
df['purchased_quantity'] = df['purchased_quantity'].map(whole_number_dict)

## Replace decimal numbers

In [94]:
# Column holding the decimal amounts to replace.
decimal_numbers = df['amount']

In [95]:
# Map each distinct amount to a random value from the observed [min, max] range,
# rounded to 2 decimals. The bounds are computed once and one value is drawn per distinct
# amount, instead of rescanning the whole column and redrawing for every row.
amount_min, amount_max = min(decimal_numbers), max(decimal_numbers)
decimal_number_dict = {
    number: round(random.uniform(amount_min, amount_max), 2)
    for number in decimal_numbers.drop_duplicates()
}

In [96]:
decimal_number_dict

{199.95: 97.98,
 95.12: 10.5,
 12.76: 39.9,
 5.55: 44.76,
 4.44: 130.12,
 5.96: 71.62,
 2.23: 164.47,
 15.67: 132.14,
 45.53: 76.27,
 2.45: 78.86,
 8.45: 142.59}

In [29]:
# Apply the amount lookup.
# NOTE(review): the saved execution count of this cell (29) is older than the cells
# around it (94-97), and the final `df` shown at the end of the notebook still contains
# the original amounts — this cell was apparently not re-run after decimal_number_dict
# was rebuilt. Re-run it (Restart & Run All) so the amounts are actually replaced.
df['amount'] = df['amount'].map(decimal_number_dict)

## Replace categorical data

In [97]:
# The 'item' column is the categorical column whose labels get replaced.
category = df['item']

In [98]:
category

0             TV
1         Vaccum
2            Cup
3          Plate
4         Lotion
5     Toothbrush
6      Sanitizer
7           Wine
8       Yoga mat
9           Beer
10        Coffee
11        Lotion
12        Coffee
13            TV
14        Vaccum
15     Sanitizer
16          Wine
17         Plate
18            TV
19        Vaccum
Name: item, dtype: object

In [99]:
# Distinct item labels; each one gets a generic replacement label below.
unique_category = category.unique()

In [100]:
# Every replacement label shares one random number; the running index after the space
# is what tells the labels apart.
rand_category = random.randint(0, 1000)
category_dict = {
    label: f"Category_{rand_category} {position}"
    for position, label in enumerate(unique_category)
}

In [101]:
category_dict

{'TV': 'Category_179 0',
 'Vaccum': 'Category_179 1',
 'Cup': 'Category_179 2',
 'Plate': 'Category_179 3',
 'Lotion': 'Category_179 4',
 'Toothbrush': 'Category_179 5',
 'Sanitizer': 'Category_179 6',
 'Wine': 'Category_179 7',
 'Yoga mat': 'Category_179 8',
 'Beer': 'Category_179 9',
 'Coffee': 'Category_179 10'}

In [102]:
# Replace each item label with its generic "Category_<n> <i>" label.
df['item'] = df['item'].map(category_dict)

## Replace dates

In [104]:
# Purchase dates (parsed as datetimes when the CSV was read) to replace.
dates = df['purchased_date']

In [105]:
# Try a single draw: a random date between the earliest and the latest purchase date.
fake.date_between(start_date=dates.min(), end_date=dates.max())

datetime.date(2020, 5, 16)

In [106]:
# Map each distinct purchase date to a random date inside the observed range.
# The range bounds are computed once and one date is drawn per distinct value, instead of
# recomputing min/max and redrawing for every row.
# Replacements are 'YYYY-MM-DD' strings, so the mapped column holds strings rather than
# datetimes. Draws are independent, so two dates can share a replacement.
first_date, last_date = dates.min(), dates.max()
dates_dict = {
    date: fake.date_between(start_date=first_date, end_date=last_date).strftime('%Y-%m-%d')
    for date in dates.drop_duplicates()
}

In [107]:
dates_dict

{Timestamp('2020-04-04 00:00:00'): '2020-04-09',
 Timestamp('2020-05-12 00:00:00'): '2020-05-10',
 Timestamp('2020-05-13 00:00:00'): '2020-05-04',
 Timestamp('2020-05-14 00:00:00'): '2020-05-16',
 Timestamp('2020-05-15 00:00:00'): '2020-05-06',
 Timestamp('2020-05-16 00:00:00'): '2020-04-25',
 Timestamp('2020-05-17 00:00:00'): '2020-04-30',
 Timestamp('2020-04-05 00:00:00'): '2020-05-09',
 Timestamp('2020-04-06 00:00:00'): '2020-04-16',
 Timestamp('2020-04-07 00:00:00'): '2020-04-27',
 Timestamp('2020-04-08 00:00:00'): '2020-04-10',
 Timestamp('2020-04-09 00:00:00'): '2020-04-14',
 Timestamp('2020-04-10 00:00:00'): '2020-04-26',
 Timestamp('2020-05-06 00:00:00'): '2020-04-12',
 Timestamp('2020-05-07 00:00:00'): '2020-04-28',
 Timestamp('2020-05-08 00:00:00'): '2020-04-28',
 Timestamp('2020-05-09 00:00:00'): '2020-04-17',
 Timestamp('2020-05-10 00:00:00'): '2020-04-11',
 Timestamp('2020-05-11 00:00:00'): '2020-04-22'}

In [108]:
# Replace each purchase date with its random counterpart; the column now holds the
# 'YYYY-MM-DD' strings stored in dates_dict.
df['purchased_date'] = df['purchased_date'].map(dates_dict)

# Original vs Final

In [110]:
df_original

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


In [111]:
df

Unnamed: 0,user_id,name,item,amount,category_id,purchased_quantity,purchased_date
0,cb57a54b-4ee0-4e3f-9ecc-d917b368bcfd,Kristin Gonzalez,Category_179 0,199.95,14,6,2020-04-09
1,cb57a54b-4ee0-4e3f-9ecc-d917b368bcfd,Kristin Gonzalez,Category_179 1,95.12,15,2,2020-05-10
2,f4d555f4-f715-4f54-9e17-18dde69ad6f4,Katie Mccormick,Category_179 2,12.76,44,6,2020-05-04
3,581d2436-9a71-4329-84b6-8f648a6af777,Robert Miller,Category_179 3,5.55,156,6,2020-05-16
4,172cc469-9271-4650-926b-455b9b337411,Taylor Smith,Category_179 4,4.44,23,2,2020-05-06
5,172cc469-9271-4650-926b-455b9b337411,Taylor Smith,Category_179 5,5.96,123,6,2020-04-25
6,dec64b5f-8a3e-48b4-8143-78cefa4a6283,William Rosales,Category_179 6,2.23,12,5,2020-04-30
7,581d2436-9a71-4329-84b6-8f648a6af777,Robert Miller,Category_179 7,15.67,122,6,2020-04-09
8,5c2041ba-c4fb-48ce-859a-8c0bfc0652c0,Angela Martinez,Category_179 8,45.53,1212,2,2020-05-09
9,f26141f9-a74c-442e-8545-5ce9c0d6b9d2,Jason Carter,Category_179 9,2.45,42,1,2020-04-16
