In [None]:
import pandas as pd

In [None]:
# Location of the raw customer export, kept in one name so it is easy to retarget.
CUSTOMERS_CSV = 'Customers.csv'

# Load the raw customer table; the following cells explore and transform `df`.
df = pd.read_csv(CUSTOMERS_CSV)

In [None]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the frame's numeric columns.
df.describe()

In [None]:
# Distribution of customer ages, bucketed into 20 bins.
df['Age'].hist(bins=20)

In [None]:
# Distribution of annual income, bucketed into 15 bins (uses the raw header, before renaming).
df['Annual Income ($)'].hist(bins=15)

In [None]:
# List the raw column names so each one can be classified by sensitivity.
df.columns

### PII
- CustomerID

### SPII / Sensitive
- Annual Income ($)
- Spending Score (1-100)
- Family Size

### Safe
- Gender
- Age
- Work Experience (could be SPII)

In [None]:
import re


def _clean_column_name(name):
    """Drop parenthesised qualifiers and dollar signs from a header, then trim it."""
    without_parens = re.sub(r"\(.*?\)", "", name)
    without_dollar = re.sub(r"\$", "", without_parens)
    return without_dollar.strip()


# e.g. 'Annual Income ($)' -> 'Annual Income'
df.columns = [_clean_column_name(col) for col in df.columns]

In [None]:
# Confirm the headers after the units/qualifiers were stripped.
df.columns

In [None]:
# Suppression: remove the direct identifier column; `drop` returns a new frame,
# so `df` itself keeps the column.
df_suupressed = df.drop('CustomerID', axis='columns')
df_suupressed.head()

In [None]:
# Keep only the first occurrence of each column name (renaming could have
# collapsed two headers onto the same label).
is_first_occurrence = ~df_suupressed.columns.duplicated()
df_suupressed = df_suupressed.loc[:, is_first_occurrence]

df_suupressed.dtypes


In [None]:
# Box plot of age to spot outliers before choosing generalisation bands.
df_suupressed['Age'].plot.box()

In [None]:
# Box plot of annual income to spot outliers before masking or banding it.
df_suupressed['Annual Income'].plot.box()

In [None]:
def mask_all_but_last(value, visible=3):
    """Replace every character of ``value`` with '*' except the trailing ``visible`` ones.

    The result always has the same length as ``str(value)``. Values that are
    ``visible`` characters long or shorter are returned unmasked.
    """
    text = str(value)
    # Clamp at zero so short values never produce a negative repeat count.
    hidden = max(len(text) - visible, 0)
    return '*' * hidden + text[hidden:]


df_masked = df_suupressed.copy()

# Masking: the star count is (length - visible), so the masked value keeps the
# original width instead of gaining a spurious extra character.
df_masked['Annual Income'] = df_masked['Annual Income'].astype(str).apply(mask_all_but_last)
df_masked['Annual Income'].head()

In [None]:
!pip install faker

In [None]:
from faker import Faker

# Seed the shared generator so the synthetic values are the same on every
# Restart & Run All.
Faker.seed(42)
fake = Faker()

df_faker = df_masked.copy()

# Add one synthetic e-mail address per row.
df_faker['Email'] = [fake.email() for _ in range(len(df_faker))]
df_faker.head()

In [None]:
# Overwrite the real profession with a generated job title, one per row.
synthetic_jobs = [fake.job() for _ in df_faker.index]
df_faker['Profession'] = synthetic_jobs
df_faker['Profession'].head()

In [None]:
# Generalisation: replace exact ages with coarse bands.
# Intervals are right-closed: [0, 20], (20, 40], (40, 60], (60, 80], (80, inf).
bins = [0, 20, 40, 60, 80, float('inf')]
labels = ["<=20", "20-40", "40-60", "60-80", "80+"]
df_gen = df.copy()
# include_lowest=True keeps an age of exactly 0 in the first band instead of
# turning it into NaN; the open-ended last edge makes "80+" truly unbounded.
df_gen['Age Group'] = pd.cut(df_gen['Age'], bins=bins, labels=labels, include_lowest=True)

df_gen[['Age', 'Age Group']].head(10)

In [None]:
# Cast income to an integer dtype before it is bucketed into bands.
df_gen = df_gen.astype({'Annual Income': int})


In [None]:
# Income bands (right-closed): Low [0, 50k], Medium (50k, 100k], High (100k, inf).
bins_income = [0, 50000, 100000, float('inf')]
labels_income = ["Low", "Medium", "High"]

# include_lowest=True keeps a zero income in "Low", and the open-ended top edge
# keeps incomes above 200,000 in "High"; with closed-off edges both would fall
# outside every bin and come back as NaN.
df_gen['Income Group'] = pd.cut(
    df_gen['Annual Income'],
    bins=bins_income,
    labels=labels_income,
    include_lowest=True,
)

df_gen[['Annual Income', 'Income Group']]


In [None]:
def top_bottom_code_age(age):
    """Collapse the tails of the age range: under 20 -> "<20", 70 and over -> "70+".

    Ages in between are returned unchanged, so the resulting column holds a
    mix of strings (the two tails) and the original numeric values.
    """
    if age < 20:
        return "<20"
    # >= so that an age of exactly 70 lands in the bucket its "70+" label names.
    if age >= 70:
        return "70+"
    return age


df_tbc = df.copy()

# Top/bottom coding: hide the rare extreme ages that are easiest to re-identify.
df_tbc['Age TBC'] = df_tbc['Age'].apply(top_bottom_code_age)
df_tbc[['Age', 'Age TBC']]


In [None]:
# Enumerate the distinct profession values so they can be mapped to broader groups.
df['Profession'].unique()

In [None]:
# Broader category -> the professions it absorbs.
_profession_groups = {
    'Medical': ('Healthcare', 'Doctor'),
    'Technical': ('Engineer',),
    'Management': ('Executive',),
    'Legal': ('Lawyer',),
    'Creative': ('Artist', 'Entertainment'),
    'Business': ('Marketing',),
    'Other': ('Homemaker',),
}

# Invert into the profession -> category lookup used by Series.map.
profession_map = {
    profession: group
    for group, members in _profession_groups.items()
    for profession in members
}

df_categorical = df_tbc.copy()
# Anything not covered by the lookup (including missing values) falls into 'Other'.
mapped_groups = df_categorical['Profession'].map(profession_map)
df_categorical['Profession Group'] = mapped_groups.fillna('Other')

df_categorical[['Profession', 'Profession Group']].head(10)


In [None]:
# Check which columns are available before choosing the quasi-identifiers.
df_categorical.columns

In [None]:
# Quasi-identifiers: columns whose combination could single out a person.
qi_cols = ['Age TBC', 'Gender', 'Profession Group']

# Each distinct combination is one equivalence class; its size is that class's k.
equivalence_classes = df_categorical.groupby(qi_cols)
k_values = equivalence_classes.size()

# The dataset is only as anonymous as its smallest class.
min_k = k_values.min()
min_k


In [None]:
# Equivalence classes with fewer members than this threshold are flagged.
K_THRESHOLD = 3
violations = k_values.loc[k_values.lt(K_THRESHOLD)]
violations.index

In [None]:
# Quasi-identifier combinations that are too rare to publish.
rare_groups = violations.index

# Index the rows by their quasi-identifier tuple so they can be matched
# against the rare combinations, then keep only the rows that do not match.
indexed_by_qi = df_categorical.set_index(qi_cols)
keep_mask = ~indexed_by_qi.index.isin(rare_groups)
df_k = indexed_by_qi[keep_mask]

df_k.index

In [None]:
# Recompute equivalence-class sizes on the filtered frame. The quasi-identifiers
# are index levels of `df_k` at this point; groupby resolves the names against them.
k_values_new = df_k.groupby(qi_cols).size()

# Smallest remaining class size after the rare groups were removed.
min_k_new = k_values_new.min()
print(min_k_new)


In [None]:
# Row count before vs. after dropping the rare groups (the cost of suppression).
print(f"{len(df_categorical)} | {len(df_k)}")

In [None]:
# Move the quasi-identifiers back into columns and show the last ten rows.
df_k.reset_index().loc[:, qi_cols].tail(10)

In [None]:
# Reload the untouched CSV so the following demonstrations start from the raw
# columns (earlier cells renamed the headers on the previous `df`).
df_original = pd.read_csv("Customers.csv")
# Work on a copy so `df_original` stays pristine.
df = df_original.copy()

df.columns

In [None]:
# Inspect the identifier column that the following cells hash, encrypt and encode.
df['CustomerID']

In [None]:
import hashlib
import hmac
import secrets

# Secret key for the keyed hash. Identifiers such as customer numbers usually
# come from a small, guessable space, so a plain SHA-256 digest can be reversed
# by hashing every candidate; HMAC with a secret key prevents that.
# A fresh key is drawn per session — persist it securely if the pseudonyms
# must stay stable across runs.
HASH_KEY = secrets.token_bytes(32)

# Pseudonymise each ID as the hex HMAC-SHA256 of its string form.
df['Hashed_CustomerID'] = df['CustomerID'].astype(str).apply(
    lambda x: hmac.new(HASH_KEY, x.encode(), hashlib.sha256).hexdigest()
)

df[['CustomerID', 'Hashed_CustomerID']].head(5)


In [None]:
from cryptography.fernet import Fernet

# A new symmetric key is generated each time this cell runs; tokens made with
# one key cannot be decrypted with another.
key = Fernet.generate_key()
cipher = Fernet(key)


def _encrypt_text(text):
    """Encrypt one string with the session cipher and return the token bytes."""
    return cipher.encrypt(text.encode())


# Reversible protection: each ID (as text) becomes a Fernet token.
customer_ids_as_text = df['CustomerID'].astype(str)
df['Encrypted_CustomerID'] = customer_ids_as_text.apply(_encrypt_text)

df['Encrypted_CustomerID']


In [None]:
# Take one stored token (still encrypted) and decrypt it to show the round trip.
encrypted_token = df['Encrypted_CustomerID'].iloc[2]
cipher.decrypt(encrypted_token).decode()

In [None]:
import base64


def _to_base64(text):
    """Return the Base64 text form of ``text``."""
    return base64.b64encode(text.encode()).decode()


# Encoding is trivially reversible (see the decode cell): it obscures the value
# but does not protect it.
df['Encoded_CustomerID'] = df['CustomerID'].astype(str).apply(_to_base64)

df['Encoded_CustomerID']

In [None]:
# Reverse the encoding for the first row — no key is needed.
first_encoded = df['Encoded_CustomerID'].iloc[0]
decoded = base64.b64decode(first_encoded).decode()
decoded

In [None]:
# Final column list: the raw fields plus the hashed, encrypted and encoded ID variants.
df.columns