# Demographic Data Analyzer

In [1]:
import pandas as pd

In [2]:
#Read the data from the CSV file
df = pd.read_csv('adult.data.csv')

In [3]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#Â 1. Number of each race represented in the dataset
race_count = df['race'].value_counts()
print("Number of each race:\n", race_count)

Number of each race:
 race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64


In [5]:
# 2. Average age of men
average_age_men = round(df[df['sex'] == 'Male']['age'].mean(), 1)
print(f"Average age of men: ", average_age_men, "years")

Average age of men:  39.4 years


In [6]:
# 3. Percentage with Bachelors degree
percentage_bachelors = df[df['education'] == 'Bachelors'].shape[0] / df.shape[0] * 100
print(f"Percentage of people with a Bachelor's degree: {percentage_bachelors:.2f}%")



Percentage of people with a Bachelor's degree: 16.45%


In [7]:
percentage_bachelors_1 = round(
    (df['education'] == 'Bachelors').mean() * 100, 1
)
print(f"Percentage of people with a Bachelor's degree: {percentage_bachelors_1:.2f}%")

Percentage of people with a Bachelor's degree: 16.40%


In [8]:
# 3. Higher education vs lower education
higher_education = df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
lower_education = ~higher_education

print("Higher education: ", higher_education.sum())
print("Lower education: ", lower_education.sum())


Higher education:  7491
Lower education:  25070


In [9]:
# 5. Percentage with salary >50K
higher_education_rich = round(
    (df[higher_education]['salary'] == '>50K').mean() * 100, 1
)
lower_education_rich = round(
    (df[lower_education]['salary'] == '>50K').mean() * 100, 1
)
print(f"Percentage of higher education with salary >50K: {higher_education_rich:.2f}%")
print(f"Percentage of lower education with salary >50K: {lower_education_rich:.2f}%")

Percentage of higher education with salary >50K: 46.50%
Percentage of lower education with salary >50K: 17.40%


In [14]:
rich = df['salary'] == '>50K'
higher_education_rich = round(
    (df[higher_education & rich].shape[0] / higher_education.sum()) * 100, 1
)
lower_education_rich = round(
    (df[lower_education & rich].shape[0] / lower_education.sum()) * 100, 1
)
print(f"Percentage of higher education with salary >50K: {higher_education_rich:.2f}%")
print(f"Percentage of lower education with salary >50K: {lower_education_rich:.2f}%")

Percentage of higher education with salary >50K: 46.50%
Percentage of lower education with salary >50K: 17.40%


In [15]:
# 6. Minimum work hours
min_work_hours = df['hours-per-week'].min()
print(f"Minimum work hours: {min_work_hours} hours/week")

Minimum work hours: 1 hours/week


In [16]:
 # 7. Percentage of rich among those who work min hours
min_workers = df[df['hours-per-week'] == min_work_hours]
rich_percentage = round(
    (min_workers['salary'] == '>50K').mean() * 100, 1
)
print(f"Percentage of rich among those who work {min_work_hours} hours/week: {rich_percentage:.2f}%")   

Percentage of rich among those who work 1 hours/week: 10.00%


In [17]:
# What percentage of the people who work the minimum number of hours per week have a salary of >50K?
num_min_workers = df[df['hours-per-week'] == min_work_hours]

rich_percentage_1 = round(
    (num_min_workers[num_min_workers['salary'] == '>50K'].shape[0] / num_min_workers.shape[0]) * 100, 1
)
print(f"Percentage of rich among those who work {min_work_hours} hours/week: {rich_percentage_1:.2f}%")

Percentage of rich among those who work 1 hours/week: 10.00%


In [None]:
 # 8. Country with highest percentage of people earning >50K
country_rich = (
    df[df['salary'] == '>50K']['native-country']
    .value_counts() / df['native-country'].value_counts()
) * 100
highest_earning_country = country_rich.idxmax()
highest_earning_country_percentage = round(country_rich.max(), 1)
print(f"Country with highest percentage of people earning >50K: {highest_earning_country} ({highest_earning_country_percentage:.2f}%)")

Country with highest percentage of people earning >50K: Iran (41.90%)


In [19]:
print(country_rich)

native-country
?                             25.042882
Cambodia                      36.842105
Canada                        32.231405
China                         26.666667
Columbia                       3.389831
Cuba                          26.315789
Dominican-Republic             2.857143
Ecuador                       14.285714
El-Salvador                    8.490566
England                       33.333333
France                        41.379310
Germany                       32.116788
Greece                        27.586207
Guatemala                      4.687500
Haiti                          9.090909
Holand-Netherlands                  NaN
Honduras                       7.692308
Hong                          30.000000
Hungary                       23.076923
India                         40.000000
Iran                          41.860465
Ireland                       20.833333
Italy                         34.246575
Jamaica                       12.345679
Japan                    

In [22]:
# 9. Most popular occupation for >50K earners in India
top_IN_occupation = (
    df[(df['native-country'] == 'India') & (df['salary'] == '>50K')]
    ['occupation']
    .value_counts()
    .idxmax()
)
print(f"Most popular occupation for >50K earners in India: {top_IN_occupation}")

Most popular occupation for >50K earners in India: Prof-specialty
