In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load dataset
df = pd.read_csv("adult.csv")

# Normalize column names (fixes dashes, spaces, and case issues)
df.columns = df.columns.str.strip().str.lower().str.replace("-", "_")

In [3]:
# Print available columns to verify correct names
print("Available columns in dataset:", df.columns.tolist())

Available columns in dataset: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']


In [4]:
# Display original dataset before perturbation (first 5 rows)
print("Original Dataset (First 5 rows):")
print(df[df.columns.tolist()].head(100))

Original Dataset (First 5 rows):
    age         workclass  fnlwgt     education  education_num  \
0    39         State-gov   77516     Bachelors             13   
1    50  Self-emp-not-inc   83311     Bachelors             13   
2    38           Private  215646       HS-grad              9   
3    53           Private  234721          11th              7   
4    28           Private  338409     Bachelors             13   
..  ...               ...     ...           ...            ...   
95   29         Local-gov  115585  Some-college             10   
96   48  Self-emp-not-inc  191277     Doctorate             16   
97   37           Private  202683  Some-college             10   
98   48           Private  171095    Assoc-acdm             12   
99   32       Federal-gov  249409       HS-grad              9   

        marital_status         occupation   relationship   race     sex  \
0        Never-married       Adm-clerical  Not-in-family  White    Male   
1   Married-civ-spouse  

In [5]:
# Choose categorical columns for swapping
categorical_columns = ["workclass", "education", "marital_status", "occupation"]

# Probability of swapping a value
swap_prob = 0.2  # 20% chance of swapping each categorical value

In [6]:
# Function to perform randomized response (data swapping)
def swap_values(series, swap_prob):
    n = len(series)
    swap_mask = np.random.rand(n) < swap_prob  # Create a mask for swapping
    shuffled_series = series.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle column values

    # Apply swapping based on the mask
    swapped_series = series.copy()
    swapped_series.loc[swap_mask] = shuffled_series.loc[swap_mask]
    
    return swapped_series

In [7]:
# Apply swapping to selected categorical columns
for col in categorical_columns:
    df[col + "_swapped"] = swap_values(df[col], swap_prob)

#  Ensure index alignment for comparison
df = df.reset_index(drop=True)

In [8]:
#  Create affected column correctly
df["affected"] = False
for col in categorical_columns:
    df["affected"] |= df[col] != df[col + "_swapped"]
# Extract affected rows for analysis
affected_rows = df[df["affected"]]

In [9]:
# Display first few affected rows
print("\n Affected Rows (First 10 rows where values changed):")
print(affected_rows[categorical_columns + [col + "_swapped" for col in categorical_columns]].head(10))


 Affected Rows (First 10 rows where values changed):
           workclass     education      marital_status         occupation  \
2            Private       HS-grad            Divorced  Handlers-cleaners   
3            Private          11th  Married-civ-spouse  Handlers-cleaners   
4            Private     Bachelors  Married-civ-spouse     Prof-specialty   
5            Private       Masters  Married-civ-spouse    Exec-managerial   
7   Self-emp-not-inc       HS-grad  Married-civ-spouse    Exec-managerial   
10           Private  Some-college  Married-civ-spouse    Exec-managerial   
13           Private    Assoc-acdm       Never-married              Sales   
16  Self-emp-not-inc       HS-grad       Never-married    Farming-fishing   
17           Private       HS-grad       Never-married  Machine-op-inspct   
18           Private          11th  Married-civ-spouse              Sales   

   workclass_swapped education_swapped marital_status_swapped  \
2            Private           HS

In [10]:
# Save the swapped dataset
df.to_csv("adult_swapped.csv", index=False)

print("\n Randomized Response (Data Swapping) Applied. File saved as 'adult_swapped.csv'.")


 Randomized Response (Data Swapping) Applied. File saved as 'adult_swapped.csv'.
