In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure how pandas prints DataFrames for the rest of the notebook:
# None removes the cap on the number of columns shown, and a width of 1000
# characters keeps wide rows on a single line instead of wrapping them.
pd.options.display.max_columns = None
pd.options.display.width = 1000

In [2]:
# Read adult.csv into a DataFrame.
# The path is relative: '..' steps up from the 'notebooks' directory to the
# 'salary_prediction_project' root, and 'data' is the folder holding the CSV.
df = pd.read_csv(filepath_or_buffer="../data/adult.csv")

# Reached only if read_csv did not raise, so this confirms the load worked.
print('Dataset loaded successfully!')

Dataset loaded successfully!


In [3]:
# Report the dimensions of the DataFrame as a (rows, columns) tuple
print("Dataset Shape:", df.shape)

# Quick look at the data: a heading followed by the first 5 rows.
# sep="\n" puts the table on its own line, exactly as two separate prints would.
print("\nFirst 5 rows of the dataset:", df.head(n=5), sep="\n")

# Optional: the end of the dataset can be inspected the same way
# print("\nLast 5 rows of the dataset:")
# print(df.tail())

Dataset Shape: (48842, 15)

First 5 rows of the dataset:
   age  workclass  fnlwgt     education  educational-num      marital-status         occupation relationship   race  gender  capital-gain  capital-loss  hours-per-week native-country income
0   25    Private  226802          11th                7       Never-married  Machine-op-inspct    Own-child  Black    Male             0             0              40  United-States  <=50K
1   38    Private   89814       HS-grad                9  Married-civ-spouse    Farming-fishing      Husband  White    Male             0             0              50  United-States  <=50K
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse    Protective-serv      Husband  White    Male             0             0              40  United-States   >50K
3   44    Private  160323  Some-college               10  Married-civ-spouse  Machine-op-inspct      Husband  Black    Male          7688             0              40  United-States 

In [4]:
# Print a heading, then the DataFrame summary (column names, non-null counts,
# dtypes and memory usage).
print('\nDataset Information:')
# info() writes its report to stdout itself and returns None, so it is called
# directly rather than being wrapped in print().
df.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; describe() skips non-numeric columns by default.
print(
    "\nDescriptive Statistics for Numerical Columns:",
    df.describe(),
    sep="\n",
)


Descriptive Statistics for Numerical Columns:
                age        fnlwgt  educational-num  capital-gain  capital-loss  hours-per-week
count  48842.000000  4.884200e+04     48842.000000  48842.000000  48842.000000    48842.000000
mean      38.643585  1.896641e+05        10.078089   1079.067626     87.502314       40.422382
std       13.710510  1.056040e+05         2.570973   7452.019058    403.004552       12.391444
min       17.000000  1.228500e+04         1.000000      0.000000      0.000000        1.000000
25%       28.000000  1.175505e+05         9.000000      0.000000      0.000000       40.000000
50%       37.000000  1.781445e+05        10.000000      0.000000      0.000000       40.000000
75%       48.000000  2.376420e+05        12.000000      0.000000      0.000000       45.000000
max       90.000000  1.490400e+06        16.000000  99999.000000   4356.000000       99.000000


In [6]:
# Count real NaN/None entries per column. Placeholder strings such as '?' are
# ordinary values at this point and are therefore NOT counted here.
print(
    "\nMissing values before specific handling of '?' characters:",
    df.isna().sum(),
    sep="\n",
)


Missing values before specific handling of '?' characters:
age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [7]:
# For every object-dtype (categorical) column, show how often each value occurs
# and how many distinct values there are.
print("\nUnique values in categorical columns:")
object_columns = df.select_dtypes(include=['object']).columns
for column in object_columns:
    # One print per column: header, frequency table, then the distinct-value
    # count, each on its own line.
    print(
        f"\n--- {column} ---",
        df[column].value_counts(),
        f"Number of unique values: {df[column].nunique()}",
        sep="\n",
    )


Unique values in categorical columns:

--- workclass ---
workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64
Number of unique values: 9

--- education ---
education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64
Number of unique values: 16

--- marital-status ---
marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-a