# First POC for dataset anonymization

## Imports

In [1]:
import pandas as pd
import random
import datetime as dt
from faker import Faker

In [2]:
# Faker generator reused by the cells below to produce replacement values.
# NOTE(review): no seed is set in this cell, so generated values differ on every run.
fake = Faker()

In [3]:
# Smoke test: emit 20 generated names, one per line, to eyeball Faker's output.
print(*(fake.name() for _ in range(20)), sep="\n")

Madison Barnett
Debra Andrade
Tara Reynolds
Eric Aguilar
Bradley Walker
Michael Mejia
Cassandra Bates
Sarah Mitchell
Dr. Thomas Powell
Lisa Garner
Thomas Torres
Julia Ryan
Donna Martinez
David Hanson
Robert Greer
Stephen Dawson
Lisa Campbell
Thomas Morgan
Charlene Lewis
Travis Mathews


## Create and import a sample dataset

In [4]:
# Load the sample purchases file; purchased_date is parsed into datetimes on read.
df = pd.read_csv(
    '../datasets/test_dataset1.csv',
    parse_dates=['purchased_date'],
)

In [5]:
# Keep an independent (deep) snapshot of the untouched data; the cells below
# overwrite columns of `df` in place and this copy is displayed at the end
# for comparison.
df_original = df.copy(deep=True)

In [6]:
# Display the frame as loaded, before any column is replaced.
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


## Replace names

In [7]:
# Distinct values of the name column, in order of first appearance.
unique_names = df['name'].unique()

In [8]:
# Inspect the distinct names that will each receive a replacement.
unique_names

array(['Donna Martin', 'Bobby Bradley', 'Austin Thomas',
       'Jonathan Medina', 'Brandon Beasley', 'Susan Morales',
       'Rachael Massey', 'Ross Strickland', 'Jennifer Cox',
       'Danny Lynn DDS', 'Ashley Nichols', 'Angela Oneill', 'Cody Moreno',
       'Douglas Atkinson', 'Ellen Mccarthy'], dtype=object)

In [9]:
# Build the original-name -> pseudonym lookup.
# Faker can return the same name twice, which would merge two different
# people into one pseudonym, and it can also return a name that really
# occurs in the dataset. Redraw until the pseudonym is neither already
# handed out nor one of the original names, so the mapping stays one-to-one.
taken_names = set(unique_names)
name_dict = {}
for name in unique_names:
    pseudonym = fake.name()
    while pseudonym in taken_names:
        pseudonym = fake.name()
    taken_names.add(pseudonym)
    name_dict[name] = pseudonym

In [10]:
# Inspect the original -> replacement name mapping.
# NOTE(review): this output pairs original names with their replacements;
# consider clearing it before sharing the notebook.
name_dict

{'Donna Martin': 'Roger Mays',
 'Bobby Bradley': 'Brian Wagner',
 'Austin Thomas': 'Natalie Foster',
 'Jonathan Medina': 'Deborah White',
 'Brandon Beasley': 'Daniel Shaw',
 'Susan Morales': 'Ellen Ward',
 'Rachael Massey': 'Robin Yang',
 'Ross Strickland': 'Samantha Smith',
 'Jennifer Cox': 'John Glover',
 'Danny Lynn DDS': 'Michael Johnson',
 'Ashley Nichols': 'Jean Williams',
 'Angela Oneill': 'Maria Mitchell',
 'Cody Moreno': 'David Lawrence',
 'Douglas Atkinson': 'Richard Brady',
 'Ellen Mccarthy': 'Maria Miller'}

In [11]:
# Overwrite the name column in place with the replacements from name_dict.
# Series.map yields NaN for any value that is not a key of the dict, so
# re-running only this cell after it has already executed would blank out
# names that are no longer keys.
df['name'] = df['name'].map(name_dict)

## Replace ids

In [12]:
# Distinct values of the user_id column, in order of first appearance.
unique_ids = df['user_id'].unique()

In [13]:
# Pair every distinct user id with a freshly generated UUID4 string.
id_dict = dict(
    (original_id, fake.uuid4())
    for original_id in unique_ids
)

In [14]:
# Inspect the original id -> UUID mapping.
# NOTE(review): this output pairs original ids with their replacements;
# consider clearing it before sharing the notebook.
id_dict

{1004: '6d5e681b-d159-4c8b-9f3e-5d59831e7c17',
 1003: '421cc376-3dbd-4f3e-b518-d6d952907290',
 1011: '334d6575-ad67-4259-950f-e3292f74ef4f',
 1000: '489c2597-63d8-47ad-9709-9605746437ca',
 1019: '49fee7f4-997f-4ed7-a939-6eda703c3b28',
 1014: '6596ee2d-408c-4d3a-bcd8-d5c2d3ee8726',
 1001: '98674a13-1c1b-4188-b1f4-826dbe35871a',
 1005: '1a09f20d-accf-4339-858f-73373ed25df2',
 1020: '02ee4f7f-8ab5-4ee1-9fdc-ddf77bd0f80e',
 1006: 'd271a72d-e7c9-4d92-9af8-128da3f00e89',
 1015: 'dc51580e-1756-4221-8138-985825da4c98',
 1009: '9473e8d3-6dba-446c-b333-3bfdc923e775',
 1016: '9954b820-705e-4a19-b2b0-bfb67e98524c'}

In [15]:
# Overwrite user_id in place with the UUID strings from id_dict.
# Values that are not keys of id_dict become NaN (Series.map behavior).
df['user_id'] = df['user_id'].map(id_dict)

## Replace whole numbers

In [17]:
# The quantity column is used as the sample whole-number column.
whole_numbers = df['quantity']

In [18]:
# Draw one random replacement per distinct quantity, staying inside the
# observed [min, max] range of the column.
# The bounds are computed once up front instead of on every iteration (they
# do not change while the mapping is built), and only distinct values are
# visited because repeated keys would merely overwrite each other.
# NOTE: replacements are independent draws, so two different quantities may
# receive the same replacement and a quantity may map to itself.
quantity_min = int(whole_numbers.min())
quantity_max = int(whole_numbers.max())
whole_number_dict = {
    number: fake.random_int(min=quantity_min, max=quantity_max)
    for number in whole_numbers.unique().tolist()
}

In [19]:
# Inspect the original quantity -> replacement quantity mapping.
whole_number_dict

{1: 6, 2: 3, 3: 2, 4: 5, 5: 2, 6: 6}

In [21]:
# Overwrite quantity in place with the replacements from whole_number_dict.
df['quantity'] = df['quantity'].map(whole_number_dict)

## Replace decimal numbers

In [23]:
# The price column is used as the sample decimal-number column.
decimal_numbers = df['price']

In [24]:
# Draw one random replacement per distinct price, uniformly inside the
# observed [min, max] range and rounded to 2 decimals.
# The bounds are computed once (Series.min/max, which skip NaN) instead of
# rescanning the whole column for every row, and only distinct prices are
# visited because repeated keys would merely overwrite each other.
# NOTE: replacements are independent draws, so distinct prices are not
# guaranteed to receive distinct replacements.
price_min = float(decimal_numbers.min())
price_max = float(decimal_numbers.max())
decimal_number_dict = {
    number: round(random.uniform(price_min, price_max), 2)
    for number in decimal_numbers.unique().tolist()
}

In [25]:
# Inspect the original price -> replacement price mapping.
decimal_number_dict

{199.95: 40.9,
 95.12: 21.84,
 12.76: 43.75,
 5.55: 119.72,
 4.44: 82.54,
 5.96: 53.85,
 2.23: 148.61,
 15.67: 16.35,
 45.53: 19.98,
 2.45: 131.72,
 8.45: 11.62}

In [26]:
# Overwrite price in place with the replacements from decimal_number_dict.
df['price'] = df['price'].map(decimal_number_dict)

## Categorical data

In [28]:
# The item_name column is used as the sample categorical column.
category = df['item_name']

In [29]:
# Inspect the raw item names before they are replaced.
category

0             TV
1         Vaccum
2            Cup
3          Plate
4         Lotion
5     Toothbrush
6      Sanitizer
7           Wine
8       Yoga mat
9           Beer
10        Coffee
11        Lotion
12        Coffee
13            TV
14        Vaccum
15     Sanitizer
16          Wine
17         Plate
18            TV
19        Vaccum
Name: item_name, dtype: object

In [30]:
# Distinct item names, in order of first appearance.
unique_category = category.unique()

In [31]:
# A single random tag (0-1000 inclusive) is drawn once and shared by every
# label; the enumerate position appended after it keeps the labels distinct.
rand_category = random.randint(0, 1000)
category_dict = {
    item: f"Category_{rand_category} {position}"
    for position, item in enumerate(unique_category)
}

In [32]:
# Inspect the original item name -> generated label mapping.
category_dict

{'TV': 'Category_547 0',
 'Vaccum': 'Category_547 1',
 'Cup': 'Category_547 2',
 'Plate': 'Category_547 3',
 'Lotion': 'Category_547 4',
 'Toothbrush': 'Category_547 5',
 'Sanitizer': 'Category_547 6',
 'Wine': 'Category_547 7',
 'Yoga mat': 'Category_547 8',
 'Beer': 'Category_547 9',
 'Coffee': 'Category_547 10'}

In [34]:
# Overwrite item_name in place with the generated labels from category_dict.
df['item_name'] = df['item_name'].map(category_dict)

## Replace dates

In [35]:
# The purchased_date column (parsed as dates when the CSV was read) is the sample date column.
dates = df['purchased_date']

In [36]:
# Try a single draw bounded by the earliest and latest purchase dates.
fake.date_between(
    start_date=dates.min(),
    end_date=dates.max(),
)

datetime.date(2020, 4, 17)

In [37]:
# Draw one random replacement per distinct purchase date, bounded by the
# earliest and latest dates in the column.
# The bounds are computed once instead of rescanning the column for every
# row, and only distinct dates are visited because repeated keys would
# merely overwrite each other. drop_duplicates() keeps the keys as the same
# Timestamp objects that iterating the column yields.
# NOTE: replacements are formatted as 'YYYY-MM-DD' strings, not datetimes.
date_min = dates.min()
date_max = dates.max()
dates_dict = {
    date: fake.date_between(start_date=date_min, end_date=date_max).strftime('%Y-%m-%d')
    for date in dates.drop_duplicates()
}

In [38]:
# Inspect the original date -> replacement date-string mapping.
dates_dict

{Timestamp('2020-04-04 00:00:00'): '2020-04-28',
 Timestamp('2020-05-12 00:00:00'): '2020-05-05',
 Timestamp('2020-05-13 00:00:00'): '2020-04-21',
 Timestamp('2020-05-14 00:00:00'): '2020-04-24',
 Timestamp('2020-05-15 00:00:00'): '2020-05-13',
 Timestamp('2020-05-16 00:00:00'): '2020-04-26',
 Timestamp('2020-05-17 00:00:00'): '2020-04-12',
 Timestamp('2020-04-05 00:00:00'): '2020-05-11',
 Timestamp('2020-04-06 00:00:00'): '2020-05-04',
 Timestamp('2020-04-07 00:00:00'): '2020-05-08',
 Timestamp('2020-04-08 00:00:00'): '2020-05-13',
 Timestamp('2020-04-09 00:00:00'): '2020-04-19',
 Timestamp('2020-04-10 00:00:00'): '2020-04-10',
 Timestamp('2020-05-06 00:00:00'): '2020-05-02',
 Timestamp('2020-05-07 00:00:00'): '2020-04-08',
 Timestamp('2020-05-08 00:00:00'): '2020-04-16',
 Timestamp('2020-05-09 00:00:00'): '2020-05-15',
 Timestamp('2020-05-10 00:00:00'): '2020-04-20',
 Timestamp('2020-05-11 00:00:00'): '2020-04-17'}

In [39]:
# Overwrite purchased_date in place with the replacement strings from dates_dict.
df['purchased_date'] = df['purchased_date'].map(dates_dict)

## Original vs. final

In [40]:
# Display the snapshot copied right after loading, for side-by-side comparison.
df_original

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,1004,Donna Martin,TV,199.95,14,1,2020-04-04
1,1004,Donna Martin,Vaccum,95.12,15,2,2020-05-12
2,1003,Bobby Bradley,Cup,12.76,44,1,2020-05-13
3,1011,Austin Thomas,Plate,5.55,156,1,2020-05-14
4,1000,Jonathan Medina,Lotion,4.44,23,2,2020-05-15
5,1000,Jonathan Medina,Toothbrush,5.96,123,1,2020-05-16
6,1019,Brandon Beasley,Sanitizer,2.23,12,3,2020-05-17
7,1011,Austin Thomas,Wine,15.67,122,1,2020-04-04
8,1014,Susan Morales,Yoga mat,45.53,1212,2,2020-04-05
9,1001,Rachael Massey,Beer,2.45,42,4,2020-04-06


In [41]:
# Display the frame after the name, user_id, quantity, price, item_name and
# purchased_date columns have been replaced.
df

Unnamed: 0,user_id,name,item_name,price,category_id,quantity,purchased_date
0,6d5e681b-d159-4c8b-9f3e-5d59831e7c17,Roger Mays,Category_547 0,40.9,14,6,2020-04-28
1,6d5e681b-d159-4c8b-9f3e-5d59831e7c17,Roger Mays,Category_547 1,21.84,15,3,2020-05-05
2,421cc376-3dbd-4f3e-b518-d6d952907290,Brian Wagner,Category_547 2,43.75,44,6,2020-04-21
3,334d6575-ad67-4259-950f-e3292f74ef4f,Natalie Foster,Category_547 3,119.72,156,6,2020-04-24
4,489c2597-63d8-47ad-9709-9605746437ca,Deborah White,Category_547 4,82.54,23,3,2020-05-13
5,489c2597-63d8-47ad-9709-9605746437ca,Deborah White,Category_547 5,53.85,123,6,2020-04-26
6,49fee7f4-997f-4ed7-a939-6eda703c3b28,Daniel Shaw,Category_547 6,148.61,12,2,2020-04-12
7,334d6575-ad67-4259-950f-e3292f74ef4f,Natalie Foster,Category_547 7,16.35,122,6,2020-04-28
8,6596ee2d-408c-4d3a-bcd8-d5c2d3ee8726,Ellen Ward,Category_547 8,19.98,1212,3,2020-05-11
9,98674a13-1c1b-4188-b1f4-826dbe35871a,Robin Yang,Category_547 9,131.72,42,5,2020-05-04
