In [1]:
import pandas as pd

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, header=None, names=[
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'salary'
])

# Function to perform the required analysis
def analyze_demographic_data(df):
    # How many people of each race are represented in this dataset?
    race_count = df['race'].value_counts()

    # What is the average age of men?
    average_age_men = round(df[df['sex'] == 'Male']['age'].mean(), 1)

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = round((df['education'] == 'Bachelors').mean() * 100, 1)

    # What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
    advanced_education = df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
    percentage_advanced_education_gt_50k = round((df[advanced_education & (df['salary'] == '>50K')].shape[0] / df[advanced_education].shape[0]) * 100, 1)

    # What percentage of people without advanced education make more than 50K?
    non_advanced_education = ~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
    percentage_non_advanced_education_gt_50k = round((df[non_advanced_education & (df['salary'] == '>50K')].shape[0] / df[non_advanced_education].shape[0]) * 100, 1)

    # What is the minimum number of hours a person works per week?
    min_work_hours = df['hours-per-week'].min()

    # What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
    num_min_workers = df[df['hours-per-week'] == min_work_hours].shape[0]
    num_min_workers_gt_50k = df[(df['hours-per-week'] == min_work_hours) & (df['salary'] == '>50K')].shape[0]
    percentage_min_workers_gt_50k = round((num_min_workers_gt_50k / num_min_workers) * 100, 1)

    # What country has the highest percentage of people that earn >50K and what is that percentage?
    countries = df[df['salary'] == '>50K']['native-country'].value_counts()
    country_percentage = (countries / df['native-country'].value_counts()) * 100
    highest_percentage_country = country_percentage.idxmax()
    highest_percentage = round(country_percentage.max(), 1)

    # Identify the most popular occupation for those who earn >50K in India.
    india_df = df[(df['native-country'] == 'India') & (df['salary'] == '>50K')]
    top_in_occupation_india_gt_50k = india_df['occupation'].value_counts().idxmax()

    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'percentage_advanced_education_gt_50k': percentage_advanced_education_gt_50k,
        'percentage_non_advanced_education_gt_50k': percentage_non_advanced_education_gt_50k,
        'min_work_hours': min_work_hours,
        'percentage_min_workers_gt_50k': percentage_min_workers_gt_50k,
        'highest_percentage_country': highest_percentage_country,
        'highest_percentage': highest_percentage,
        'top_in_occupation_india_gt_50k': top_in_occupation_india_gt_50k
    }

# Run the analysis
results = analyze_demographic_data(df)

# Print the results
for key, value in results.items():
    print(f'{key}: {value}\n')


ZeroDivisionError: division by zero

In [2]:
import pandas as pd

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, header=None, names=[
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'salary'
])

# Display the first few rows of the dataset to check the data
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
def analyze_demographic_data(df):
    # How many people of each race are represented in this dataset?
    race_count = df['race'].value_counts()

    # What is the average age of men?
    average_age_men = round(df[df['sex'] == 'Male']['age'].mean(), 1)

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = round((df['education'] == 'Bachelors').mean() * 100, 1)

    # What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
    advanced_education = df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
    if advanced_education.sum() == 0:
        percentage_advanced_education_gt_50k = 0.0
    else:
        percentage_advanced_education_gt_50k = round(
            (df[advanced_education & (df['salary'] == '>50K')].shape[0] / df[advanced_education].shape[0]) * 100, 1
        )

    # What percentage of people without advanced education make more than 50K?
    non_advanced_education = ~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
    if non_advanced_education.sum() == 0:
        percentage_non_advanced_education_gt_50k = 0.0
    else:
        percentage_non_advanced_education_gt_50k = round(
            (df[non_advanced_education & (df['salary'] == '>50K')].shape[0] / df[non_advanced_education].shape[0]) * 100, 1
        )

    # What is the minimum number of hours a person works per week?
    min_work_hours = df['hours-per-week'].min()

    # What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
    min_hours_workers = df[df['hours-per-week'] == min_work_hours]
    if min_hours_workers.shape[0] == 0:
        percentage_min_workers_gt_50k = 0.0
    else:
        percentage_min_workers_gt_50k = round(
            (min_hours_workers[min_hours_workers['salary'] == '>50K'].shape[0] / min_hours_workers.shape[0]) * 100, 1
        )

    # What country has the highest percentage of people that earn >50K and what is that percentage?
    country_salary_percentage = (
        df[df['salary'] == '>50K']['native-country'].value_counts() / df['native-country'].value_counts()
    ) * 100
    if country_salary_percentage.empty:
        highest_percentage_country = 'None'
        highest_percentage = 0.0
    else:
        highest_percentage_country = country_salary_percentage.idxmax()
        highest_percentage = round(country_salary_percentage.max(), 1)

    # Identify the most popular occupation for those who earn >50K in India.
    india_df = df[(df['native-country'] == 'India') & (df['salary'] == '>50K')]
    if india_df.empty:
        top_in_occupation_india_gt_50k = 'None'
    else:
        top_in_occupation_india_gt_50k = india_df['occupation'].value_counts().idxmax()

    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'percentage_advanced_education_gt_50k': percentage_advanced_education_gt_50k,
        'percentage_non_advanced_education_gt_50k': percentage_non_advanced_education_gt_50k,
        'min_work_hours': min_work_hours,
        'percentage_min_workers_gt_50k': percentage_min_workers_gt_50k,
        'highest_percentage_country': highest_percentage_country,
        'highest_percentage': highest_percentage,
        'top_in_occupation_india_gt_50k': top_in_occupation_india_gt_50k
    }


In [4]:
# Run the analysis
results = analyze_demographic_data(df)

# Print the results
for key, value in results.items():
    print(f'{key}: {value}\n')


race_count: race
 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: count, dtype: int64

average_age_men: nan

percentage_bachelors: 0.0

percentage_advanced_education_gt_50k: 0.0

percentage_non_advanced_education_gt_50k: 0.0

min_work_hours: 1

percentage_min_workers_gt_50k: 0.0

highest_percentage_country: nan

highest_percentage: nan

top_in_occupation_india_gt_50k: None



  highest_percentage_country = country_salary_percentage.idxmax()
