In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load dataset
df = pd.read_csv("adult.csv")

# Select numerical columns to perturb
numerical_columns = ["age", "hours_per_week", "capital_gain", "capital_loss"]

# Display original dataset before perturbation (first 5 rows)
print("Original Dataset (First 5 rows):")
print(df[df.columns.tolist()].head(100))

Original Dataset (First 5 rows):
    age         workclass  fnlwgt     education  education_num  \
0    39         State-gov   77516     Bachelors             13   
1    50  Self-emp-not-inc   83311     Bachelors             13   
2    38           Private  215646       HS-grad              9   
3    53           Private  234721          11th              7   
4    28           Private  338409     Bachelors             13   
..  ...               ...     ...           ...            ...   
95   29         Local-gov  115585  Some-college             10   
96   48  Self-emp-not-inc  191277     Doctorate             16   
97   37           Private  202683  Some-college             10   
98   48           Private  171095    Assoc-acdm             12   
99   32       Federal-gov  249409       HS-grad              9   

        marital_status         occupation   relationship   race     sex  \
0        Never-married       Adm-clerical  Not-in-family  White    Male   
1   Married-civ-spouse  

In [3]:
# Define noise parameters (Standard deviation for each attribute)
noise_params = {
    "age": 5,
    "hours_per_week": 5,
    "capital_gain": 1000,  # Higher noise for financial data
    "capital_loss": 500
}

In [4]:
# Apply noise to each selected column
for col in numerical_columns:
    df[col + "_perturbed"] = df[col] + np.random.normal(0, noise_params[col], df.shape[0])

    # Ensure values remain within reasonable bounds (e.g., no negative ages or financial values)
    df[col + "_perturbed"] = df[col + "_perturbed"].clip(lower=0)

# Extract perturbed columns and rename them to match original column names
df_perturbed = df[[col + "_perturbed" for col in numerical_columns]].copy()
df_perturbed.columns = numerical_columns  # Rename to match original column names

In [5]:
# Identify affected rows where at least one value has changed
df["affected"] = (df[numerical_columns] != df_perturbed).any(axis=1)

# Extract only the affected rows
affected_rows = df[df["affected"]]

In [6]:
# Display the first few affected rows with changes
print(affected_rows[numerical_columns + [col + "_perturbed" for col in numerical_columns]].head(100))

    age  hours_per_week  capital_gain  capital_loss  age_perturbed  \
0    39              40          2174             0      39.823239   
1    50              13             0             0      61.816262   
2    38              40             0             0      38.422463   
3    53              40             0             0      56.128772   
4    28              40             0             0      29.043660   
..  ...             ...           ...           ...            ...   
95   29              50             0             0      35.301448   
96   48              60             0          1902      43.657491   
97   37              48             0             0      32.461603   
98   48              40             0             0      49.576797   
99   32              40             0             0      24.967738   

    hours_per_week_perturbed  capital_gain_perturbed  capital_loss_perturbed  
0                  46.650190             1719.361055                0.000000  
1

In [7]:
# Save the perturbed dataset
df.to_csv("adult_noisy.csv", index=False)

print("\n Additive Noise Perturbation Applied. File saved as 'adult_noisy.csv'.")


 Additive Noise Perturbation Applied. File saved as 'adult_noisy.csv'.
