# Anonymise data
The objective is to remove nominative data from a DataFrame and replace it with numbers. Each number is the index of the original value in a sorted list of the unique values found in its column.

## Process
### Import the required library and load a sample dataset

In [1]:
import pandas as pd

In [2]:
# Read the sample dataset; sep=';' because the file is semicolon-delimited.
df = pd.read_csv('/kaggle/input/anonymous-input-data/anon_input.csv', sep=';')

In [3]:
df

Unnamed: 0,Name,Order
0,Thomas,chien
1,Paul,chat
2,Marie,souris
3,Pierre,souris
4,Sophie,aligator
5,Pierre,chat
6,Thomas,chat
7,Marie,singe
8,Marie,chien


### Extract sorted unique names and orders

In [4]:
# Distinct values of each column, in ascending order; a value's position
# in its list is the number that will stand in for it.
names = sorted(df['Name'].unique())
orders = sorted(df['Order'].unique())
print("names:", names)
print("orders:", orders)

names: ['Marie', 'Paul', 'Pierre', 'Sophie', 'Thomas']
orders: ['aligator', 'chat', 'chien', 'singe', 'souris']


In [5]:
orders.index('singe')

3

### Replace names and orders by anonymised values

In [6]:
# Substitute every cell with the position of its value in the sorted list
# built for that column.
df['Name'] = df['Name'].apply(names.index)
df['Order'] = df['Order'].apply(orders.index)

In [7]:
df

Unnamed: 0,Name,Order
0,4,2
1,1,1
2,0,4
3,2,4
4,3,0
5,2,1
6,4,1
7,0,3
8,0,2


## Corresponding function

In [8]:
def anonymise(df):
    """Replace the 'Name' and 'Order' values of *df* with integer codes.

    For each of the two columns, the unique values are sorted and every
    cell is replaced by the position of its value in that sorted list.

    The DataFrame is modified in place and the same object is returned.

    Args:
        df: DataFrame containing a 'Name' and an 'Order' column.

    Returns:
        The same DataFrame, with both columns replaced by their codes.

    Raises:
        KeyError: if 'Name' or 'Order' is not a column of *df*.
    """
    for column in ('Name', 'Order'):
        # Codes are derived from this DataFrame only, never from a list
        # left over in the enclosing (notebook) namespace.
        sorted_values = sorted(df[column].unique())
        # A dict gives constant-time lookup per row instead of a linear
        # list.index() scan.
        codes = {value: code for code, value in enumerate(sorted_values)}
        df[column] = df[column].map(codes)

    return df

### _(tests section)_

In [9]:
df = pd.read_csv('/kaggle/input/anonymous-input-data/anon_input.csv', sep=';')

In [10]:
# anonymise() writes the codes into df itself and returns that same object,
# so ano_df and df refer to one DataFrame.
ano_df = anonymise(df)
ano_df

Unnamed: 0,Name,Order
0,4,2
1,1,1
2,0,4
3,2,4
4,3,0
5,2,1
6,4,1
7,0,3
8,0,2


In [11]:
print('done')

done
