In [7]:
import pandas as pd

In [8]:
# Column labels for the header-less data files, listed in file order
# (taken from the attribute description in 'adult.names').
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "salary",
]

# Neither file has a header row, so the labels above are supplied explicitly.
# " ?" (with its leading space) is read as a missing value.
adult_data = pd.read_csv(
    'adult.data',
    names=column_names,
    header=None,
    na_values=" ?",
)

# Same layout as the training file, except that the first line of
# 'adult.test' is not a data record and is skipped.
adult_test = pd.read_csv(
    'adult.test',
    names=column_names,
    header=None,
    skiprows=1,
    na_values=" ?",
)


In [9]:
# Preview the first rows of both frames; sep="\n" puts each frame on its own
# lines, exactly as two consecutive print calls would.
print(adult_data.head(), adult_test.head(), sep="\n")


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  salary  
0          2174             0              40   United-States   <=50

In [10]:
# Read the whole 'adult.names' description file into a string for reference.
with open('adult.names', 'r') as names_file:
    names_content = names_file.read()

# Uncomment the next line to print the full description text.
# print(names_content)

In [11]:
# Show the 'salary' column of the training frame (values, length and dtype).
print(adult_data['salary'])

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: salary, Length: 32561, dtype: object


In [12]:
# List ages from highest to lowest to see the range the age hierarchy has to cover.
adult_data['age'].sort_values(ascending=False)

24043    90
31696    90
18277    90
12451    90
5406     90
         ..
12678    17
22107    17
12838    17
3605     17
3618     17
Name: age, Length: 32561, dtype: int64

Generalize the age based on the hierarchy in hierarchy.txt

In [13]:
def generalize_age(age, level):
    """Generalize an age to the requested level of the age hierarchy.

    Parameters
    ----------
    age : number
        The exact age.
    level : int
        1 returns the age unchanged, 2 uses narrow ranges, 3 uses medium
        ranges, 4 uses named broad categories, 5 returns 'All Ages'.

    Returns
    -------
    The age itself for level 1, otherwise a string label. 'All Ages' is
    returned for level 5, for any other level, and whenever the age does not
    fall inside one of the inclusive ranges of the chosen level (for example
    ages below 17).
    """
    if level == 1:
        # Level 1 is the raw value: no generalization at all.
        return age

    # Each entry is (lowest age, highest age, label); both bounds are
    # inclusive and a highest age of None means the range is open-ended.
    ranges_by_level = {
        2: (
            (17, 19, '17-19'),
            (20, 29, '20-29'),
            (30, 39, '30-39'),
            (40, 49, '40-49'),
            (50, 59, '50-59'),
            (60, 69, '60-69'),
            (70, 79, '70-79'),
            (80, 89, '80-89'),
            (90, None, '90+'),
        ),
        3: (
            (17, 29, '17-29'),
            (30, 49, '30-49'),
            (50, 69, '50-69'),
            (70, None, '70+'),
        ),
        4: (
            (17, 29, 'Young Adult (17-29)'),
            (30, 49, 'Middle Age (30-49)'),
            (50, 69, 'Senior (50-69)'),
            (70, None, 'Elderly (70+)'),
        ),
    }

    for lowest, highest, label in ranges_by_level.get(level, ()):
        if lowest <= age and (highest is None or age <= highest):
            return label

    # Level 5, an unknown level, or an age outside every range.
    return 'All Ages'

# Spot-check level 2 with an age above 90 to exercise the open-ended top range.
age = 91
generalized_age = generalize_age(age, 2)
print(f'Generalized age at level 2: {generalized_age}')

# Spot-check level 3 with an age in the middle of a range.
age = 35
generalized_age = generalize_age(age, 3)
print(f'Generalized age at level 3: {generalized_age}')


Generalized age at level 2: 90+
Generalized age at level 3: 30-49


In [14]:
# List the distinct education values that the education hierarchy has to cover.
adult_data['education'].unique()


array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

Education Generalization

In [15]:
def generalize_education(education, level):
    """Generalize an education value to the requested hierarchy level.

    Parameters
    ----------
    education : str
        Education value; surrounding whitespace is stripped before matching.
    level : int
        1 returns the stripped value, 2 groups by school type, 3 groups into
        'Compulsory School' / 'Basic Degree' / 'Advanced Degree', 4 groups
        into 'Compulsory School' / 'Advanced School', 5 returns 'Education'.

    Returns
    -------
    str
        The generalized label. 'Education' is returned for level 5, for any
        other level, and for values that are not recognised at the chosen
        level.
    """
    education = education.strip()

    if level == 1:
        return education

    if level == 2:
        if education in ['Preschool', '1st-4th', '5th-6th']:
            return 'Primary School'
        elif education in ['7th-8th', '9th']:
            return 'Middle School'
        elif education in ['10th', '11th', '12th', 'HS-grad']:
            return 'High School'
        elif education in ['Assoc-voc', 'Prof-school', 'Some-college']:
            return 'Vocational School'
        elif education in ['Assoc-acdm', 'Bachelors']:
            return 'Undergraduate School'
        elif education in ['Masters', 'Doctorate']:
            return 'Graduate School'

    elif level == 3 or level == 4:
        # Level 4 is defined on top of the level-3 categories, so the raw
        # value is first mapped to its level-3 category for both levels.
        if education in ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad']:
            category = 'Compulsory School'
        elif education in ['Assoc-voc', 'Prof-school', 'Some-college', 'Assoc-acdm', 'Bachelors']:
            category = 'Basic Degree'
        elif education in ['Masters', 'Doctorate']:
            category = 'Advanced Degree'
        else:
            category = None

        if level == 3:
            if category is not None:
                return category
        else:
            # Previously level 4 compared the raw value against the level-3
            # labels directly, so real dataset values never matched and every
            # row collapsed to 'Education'. Values that already are level-3
            # labels are still accepted as before.
            if category is None:
                category = education
            if category == 'Compulsory School':
                return 'Compulsory School'
            elif category in ['Basic Degree', 'Advanced Degree']:
                return 'Advanced School'

    # Level 5, an unknown level, or an unrecognised value.
    return 'Education'

# Spot-check the level-3 mapping with one education value.
education = 'Bachelors'
generalized_education = generalize_education(education, 3)
print(f'Generalized education at level 3: {generalized_education}')


Generalized education at level 3: Basic Degree


Marital Status Generalization

In [16]:
def generalize_marital_status(status, level):
    """Generalize a marital status to the requested hierarchy level.

    Parameters
    ----------
    status : str
        Marital status; surrounding whitespace is stripped before matching.
    level : int
        1 returns the stripped value, 2 groups into 'Separated-Married' /
        'Single' / 'Married-Together', 3 groups into 'Married' /
        'Not-Married', 4 returns '*'.

    Returns
    -------
    str
        The generalized label. '*' is returned for level 4, for any other
        level, and for values that are not recognised at the chosen level.
    """
    status = status.strip()

    if level == 1:
        return status

    # One lookup table per intermediate level: raw status -> label.
    labels_by_level = {
        2: {
            'Separated': 'Separated-Married',
            'Married-spouse-absent': 'Separated-Married',
            'Never-married': 'Single',
            'Widowed': 'Single',
            'Divorced': 'Single',
            'Married-civ-spouse': 'Married-Together',
            'Married-AF-spouse': 'Married-Together',
        },
        3: {
            'Separated': 'Married',
            'Married-spouse-absent': 'Married',
            'Married-civ-spouse': 'Married',
            'Married-AF-spouse': 'Married',
            'Never-married': 'Not-Married',
            'Widowed': 'Not-Married',
            'Divorced': 'Not-Married',
        },
    }

    # Anything without a table entry is fully suppressed.
    return labels_by_level.get(level, {}).get(status, '*')

# Spot-check the level-3 mapping with one marital status value.
marital_status = 'Married-civ-spouse'
generalized_status = generalize_marital_status(marital_status, 3)
print(f'Generalized marital status at level 3: {generalized_status}')


Generalized marital status at level 3: Married


Race Generalization

In [17]:
def generalize_race(race, level):
    """Generalize a race value to the requested hierarchy level.

    Parameters
    ----------
    race : str
        Race value; surrounding whitespace is stripped before matching.
    level : int
        1 returns the stripped value, 2 renames each value to its level-2
        label, 3 groups into 'Western Origin' / 'Eastern Origin' / 'Other',
        4 groups into 'Western Origin' / 'Non-Western Origin', 5 returns
        'Race'.

    Returns
    -------
    str
        The generalized label. 'Race' is returned for level 5, for any other
        level, and for values that are not recognised at level 2 or 3. At
        level 4 every value other than 'White' and 'Black' maps to
        'Non-Western Origin'.
    """
    race = race.strip()

    if level == 1:
        return race

    elif level == 2:
        if race == 'White':
            return 'Caucasian'
        elif race == 'Asian-Pac-Islander':
            return 'Asian'
        elif race == 'Amer-Indian-Eskimo':
            return 'Indigenous'
        elif race == 'Black':
            return 'African Descent'
        elif race == 'Other':
            return 'Other'

    elif level == 3:
        if race in ['White', 'Black']:
            return 'Western Origin'
        elif race in ['Asian-Pac-Islander', 'Amer-Indian-Eskimo']:
            return 'Eastern Origin'
        elif race == 'Other':
            return 'Other'

    elif level == 4:
        if race in ['White', 'Black']:
            return 'Western Origin'
        else:
            return 'Non-Western Origin'

    # Level 5, an unknown level, or an unrecognised value at level 2 or 3.
    # Without this fallback the function returned None in those cases, unlike
    # the other generalization functions, which always return their
    # top-level label.
    return 'Race'

# Spot-check the level-3 mapping with one race value.
race = 'Asian-Pac-Islander'
generalized_race = generalize_race(race, 3)
print(f'Generalized race at level 3: {generalized_race}')

Generalized race at level 3: Eastern Origin


In [18]:
# Create a different dataset (a copy of adult_data) to hold the generalized data
generalized_data = adult_data.copy()
# Bare last expression: the notebook displays the copied frame.
generalized_data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
