In [None]:
#Installing the pandas library
# pandas is a powerful and flexible data analysis and manipulation library for Python.
# It provides data structures and functions needed to work with structured data seamlessly.
# This library will be used for data manipulation and analysis tasks in the project
!pip install pandas



In [None]:
# Load the CSV into a DataFrame and preview the first five rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer="adult_income_Analysis.csv")
print(df.head(n=5))

   age    workclass  fnlwgt    education  education.num marital.status  \
0   54      Private  140359      7th-8th              4       Divorced   
1   34      Private  216864      HS-grad              9       Divorced   
2   68  Federal-gov  422013      HS-grad              9       Divorced   
3   45      Private  172274    Doctorate             16       Divorced   
4   46      Private   45363  Prof-school             15       Divorced   

          occupation   relationship   race     sex  capital.gain  \
0  Machine-op-inspct      Unmarried  White  Female             0   
1      Other-service      Unmarried  White  Female             0   
2     Prof-specialty  Not-in-family  White  Female             0   
3     Prof-specialty      Unmarried  Black  Female             0   
4     Prof-specialty  Not-in-family  White    Male             0   

   capital.loss  hours.per.week native.country income  
0          3900              40  United-States  <=50K  
1          3770              45  U

In [None]:
# Data Overview
# The dataset contains information about individuals and their income.
# Columns include:
# - age: The age of the individual.
# - workclass: The type of employment (e.g., Private, Federal-gov, etc.).
# - fnlwgt: The final weight, a measure used by the census bureau for statistical purposes.
# - education: The highest level of education attained.
# - education.num: The numerical representation of the education level.
# - marital.status: Marital status of the individual (e.g., Divorced, Never-married, etc.).
# - occupation: The type of occupation (e.g., Machine-op-inspct, Prof-specialty, etc.).
# - relationship: Relationship status (e.g., Unmarried, Not-in-family, etc.).
# - race: The race of the individual (e.g., White, Black, etc.).
# - sex: Gender of the individual (e.g., Female, Male).
# - capital.gain: Capital gains income.
# - capital.loss: Capital losses.
# - hours.per.week: Number of hours worked per week.
# - native.country: Country of origin.
# - income: Income bracket (<=50K or >50K).

# Next step: check for duplicates.
# Identical rows would be counted more than once in any later aggregate, so find
# out how many exist before analysing the data.

# Count the rows that exactly repeat an earlier row (first occurrences are not counted).
duplicates = df.duplicated(keep='first').sum()
print(duplicates)

24


In [None]:
# The check above reported 24 duplicate rows. Pull out every row that takes part in
# a duplicate group (keep=False marks the first occurrence as well as its repeats)
# so the repeated records can be inspected.
duplicates_all_df = df.loc[df.duplicated(keep=False)]

# Ordering by every column places identical rows directly under one another.
duplicates_sorted_df = duplicates_all_df.sort_values(by=df.columns.tolist())

# Show the heading followed by the sorted duplicate rows.
print("Duplicate rows in the dataset, sorted by all columns:", duplicates_sorted_df, sep="\n")

Duplicate rows in the dataset, sorted by all columns:
       age         workclass  fnlwgt     education  education.num  \
26025   19           Private   97261       HS-grad              9   
26325   19           Private   97261       HS-grad              9   
22760   19           Private  138153  Some-college             10   
27123   19           Private  138153  Some-college             10   
25262   19           Private  146679  Some-college             10   
27176   19           Private  146679  Some-college             10   
21800   19           Private  251579  Some-college             10   
30389   19           Private  251579  Some-college             10   
22407   20           Private  107658  Some-college             10   
24169   20           Private  107658  Some-college             10   
23091   21           Private  243368     Preschool              1   
25221   21           Private  243368     Preschool              1   
22084   21           Private  250051  Some-colleg

In [None]:
# Drop the repeated rows, keeping the first occurrence of each.
# The result is stored under a new name; the frame loaded from the CSV is not modified.
data_cleaned = df.drop_duplicates(keep='first')

# Confirm that no duplicates remain (expected count: 0), then preview the cleaned data.
remaining_duplicates = data_cleaned.duplicated().sum()
print(remaining_duplicates, data_cleaned.head(), sep="\n")

0
   age    workclass  fnlwgt    education  education.num marital.status  \
0   54      Private  140359      7th-8th              4       Divorced   
1   34      Private  216864      HS-grad              9       Divorced   
2   68  Federal-gov  422013      HS-grad              9       Divorced   
3   45      Private  172274    Doctorate             16       Divorced   
4   46      Private   45363  Prof-school             15       Divorced   

          occupation   relationship   race     sex  capital.gain  \
0  Machine-op-inspct      Unmarried  White  Female             0   
1      Other-service      Unmarried  White  Female             0   
2     Prof-specialty  Not-in-family  White  Female             0   
3     Prof-specialty      Unmarried  Black  Female             0   
4     Prof-specialty  Not-in-family  White    Male             0   

   capital.loss  hours.per.week native.country income  
0          3900              40  United-States  <=50K  
1          3770              45 

In [None]:
# Report the dtype pandas assigned to each column of the de-duplicated data.
print("Column data types:", data_cleaned.dtypes, sep="\n")

Column data types:
age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object


In [None]:
# Treat the '?' placeholder as an explicit 'Unknown' category in every column.
# Note: this rebinds `df`; from here on it refers to the de-duplicated, relabelled
# frame rather than the frame originally read from the CSV.
df = data_cleaned.replace(to_replace='?', value='Unknown')

In [None]:
# Collapse the detailed education labels into broader groups and store the result
# in a new column, 'grouped_education_level', for easier analysis and interpretation.
education_mapping = {
    # Pre-primary
    'Preschool': 'Preschool',
    # Grades 1-8
    '1st-4th': 'Elementary/Middle School',
    '5th-6th': 'Elementary/Middle School',
    '7th-8th': 'Elementary/Middle School',
    # Grades 9-12, including high-school graduates
    '9th': 'High School',
    '10th': 'High School',
    '11th': 'High School',
    '12th': 'High School',
    'HS-grad': 'High School',
    # Post-secondary
    'Some-college': 'Some College',
    'Assoc-voc': "Associate's Degree",
    'Assoc-acdm': "Associate's Degree",
    'Bachelors': "Bachelor's Degree",
    'Masters': "Master's Degree",
    'Prof-school': 'Professional School',
    'Doctorate': 'Doctorate',
}

# Series.map turns any label that is missing from the dictionary into NaN without
# warning, which would later be exported as blank cells. Check coverage first and
# fail loudly so a new or misspelled label is noticed here rather than downstream.
unmapped_education = sorted(
    set(df['education'].dropna().unique()) - set(education_mapping),
    key=str,
)
if unmapped_education:
    raise ValueError(
        f"No grouped education level defined for: {unmapped_education}"
    )

df['grouped_education_level'] = df['education'].map(education_mapping)

In [None]:
# List the column labels to confirm that 'grouped_education_level' was added.
print(df.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income', 'grouped_education_level'],
      dtype='object')


In [None]:
# Sanity check: is there any cell left, in any column, that is exactly '?'.
# The first .any() reduces each column to a flag; the second reduces those flags to one value.
contains_question_marks = df.eq('?').any().any()
print(contains_question_marks)

False


In [None]:
# Turn the hyphens in 'native.country' into spaces so multi-word names read
# naturally, e.g. 'United-States' -> 'United States'. regex=False makes '-' a
# literal character rather than a pattern.
# NOTE(review): this only changes hyphens; any truncated or misspelled country
# names in the source data pass through unchanged - confirm whether they should
# be normalised before export.
df['native.country'] = df['native.country'].str.replace(pat='-', repl=' ', regex=False)

# Show the distinct values to check the result.
print(df['native.country'].unique())

['United States' 'Canada' 'India' 'Philippines' 'Unknown' 'Japan' 'Iran'
 'Mexico' 'Cuba' 'England' 'Puerto Rico' 'Germany' 'Ireland' 'France'
 'Honduras' 'Portugal' 'China' 'Italy' 'Scotland' 'Nicaragua' 'Greece'
 'Peru' 'Ecuador' 'Dominican Republic' 'Poland' 'Haiti' 'Guatemala'
 'South' 'Thailand' 'Jamaica' 'El Salvador' 'Vietnam' 'Trinadad&Tobago'
 'Columbia' 'Hungary' 'Outlying US(Guam USVI etc)' 'Yugoslavia' 'Taiwan'
 'Hong' 'Cambodia' 'Laos' 'Holand Netherlands']


In [None]:
# Save the cleaned frame to a new CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='adult_income_Analysis_after_cleaning.csv', index=False)