In [1]:
import pandas as pd

# Define the data
data = {
    'age': [39, 50, 38, 53, 28],
    'workclass': ['State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private'],
    'fnlwgt': [77516, 83311, 215646, 234721, 338409],
    'education': ['Bachelors', 'Bachelors', 'HS-grad', '11th', 'Bachelors'],
    'education-num': [13, 13, 9, 7, 13],
    'marital-status': ['Never-married', 'Married-civ-spouse', 'Divorced', 'Married-civ-spouse', 'Married-civ-spouse'],
    'occupation': ['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners', 'Handlers-cleaners', 'Prof-specialty'],
    'relationship': ['Not-in-family', 'Husband', 'Not-in-family', 'Husband', 'Wife'],
    'race': ['White', 'White', 'White', 'Black', 'Black'],
    'sex': ['Male', 'Male', 'Male', 'Male', 'Female'],
    'capital-gain': [2174, 0, 0, 0, 0],
    'capital-loss': [0, 0, 0, 0, 0],
    'hours-per-week': [40, 13, 40, 40, 40],
    'native-country': ['United-States', 'United-States', 'United-States', 'United-States', 'Cuba'],
    'salary': ['<=50K', '<=50K', '<=50K', '<=50K', '<=50K']
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country salary  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [2]:
df.to_csv('dataset.csv', index=False)


In [32]:
import pandas as pd

def demographic_data_analyzer(file_path):
    data = pd.read_csv(file_path)

    # 1. How many people of each race are represented in this dataset?
    race_counts = data['race'].value_counts()

    # 2. What is the average age of men?
    avg_men_age = data[data['sex'] == 'Male']['age'].mean()

    # 3. What is the percentage of people who have a Bachelor's degree?
    total_people = data.shape [0]
    bachelors_count = data[data['education'] == 'Bachelors'].shape[0]
    percentage_bachelors = (bachelors_count / total_people) * 100

    # 4. What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
    advanced_education = data[data['education'].isin(['Bachelors', 'Masters', 'Doctorate'])]
    advanced_education_rich = advanced_education[advanced_education['salary'] == '<=50K']
    percentage_advanced_education_rich = (advanced_education_rich.shape[0] / advanced_education.shape[0]) * 100

    # 5. What percentage of people without advanced education make more than 50K?
    non_advanced_education = data[~data['education'].isin(['Bachelors', 'Masters', 'Doctorate'])]
    non_advanced_education_rich = non_advanced_education[non_advanced_education['salary'] == '<=50K']
    percentage_non_advanced_education_rich = (non_advanced_education_rich.shape[0] / non_advanced_education.shape[0]) * 100

    # 6. What is the minimum number of hours a person works per week?
    min_work_hours = data['hours-per-week'].min()

    # 7. What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
    min_work_people = data[data['hours-per-week'] == min_work_hours]
    rich_min_work_people = min_work_people[min_work_people['salary'] == '<=50k']
    percentage_rich_min_work_people = (rich_min_work_people.shape[0] / min_work_people.shape[0]) * 100

    # 8. What country has the highest percentage of people that earn <=50K and what is that percentage?
    country = data[data['salary'] == '<=50k']['native-country'].value_counts()
    total_country = data['native-country'].value_counts()
    highest_percentage_country = (country / total_country * 100)
    highest_percentage = (country / total_country * 100).max()

    # 9. Identify the most popular occupation for those who earn >50K in India.
    india_rich = data[(data['native-country'] == 'India') & (data['salary'] == '>50K')]
    top_occupation_india = india_rich['occupation'].value_counts()

    # Return results
    return {
        'Race Counts': race_counts,
        'Average age of men': round(avg_men_age),
        'Percentage of Bachelors': round(percentage_bachelors),
        'Percentage of Upper Graduated with more Salary': round(percentage_advanced_education_rich),
        'Percentage of Lower Graduated with more Salary': round(percentage_non_advanced_education_rich),
        'Minimum Hours per week': min_work_hours,
        'Percentage of Rich min work People': round(percentage_rich_min_work_people),
        'Highest Percentage country': highest_percentage_country,
        'Highest Percentage': round(highest_percentage, 1),
        'Top Occupation India': top_occupation_india
    }

file_path = '/Users/anks/Jupyter Files/dataset.csv'
result = demographic_data_analyzer(file_path)
for key, value in result.items():
    print(f'{key}: {value}')

Race Counts: race
White    3
Black    2
Name: count, dtype: int64
Average age of men: 45
Percentage of Bachelors: 60
Percentage of Upper Graduated with more Salary: 100
Percentage of Lower Graduated with more Salary: 100
Minimum Hours per week: 13
Percentage of Rich min work People: 0
Highest Percentage country: native-country
Cuba            NaN
United-States   NaN
Name: count, dtype: float64
Highest Percentage: nan
Top Occupation India: Series([], Name: count, dtype: int64)
