In [61]:
import pandas as pd        # For data manipulation and analysis
import numpy as np         # For numerical computations
import matplotlib.pyplot as plt   # For data visualization
from sklearn.model_selection import train_test_split   # For splitting data into training and testing sets
from sklearn.preprocessing import LabelEncoder       # For encoding categorical variables
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report   # For model evaluation
from sklearn.model_selection import GridSearchCV      # For hyperparameter tuning
from sklearn.pipeline import Pipeline                # For building pipelines of data preprocessing and model building steps
from sklearn.feature_selection import SelectKBest, f_classif  # For feature selection
from sklearn.ensemble import RandomForestClassifier  

# LOADING THE DATASET

In [62]:
df = pd.read_csv("adult-entire dataset.csv")

In [63]:
print(df.head())

   age          workClass   Education       marital-status  \
0   39          State-gov   Bachelors        Never-married   
1   50   Self-emp-not-inc   Bachelors   Married-civ-spouse   
2   38            Private     HS-grad             Divorced   
3   53            Private        11th   Married-civ-spouse   
4   28            Private   Bachelors   Married-civ-spouse   

           occupation    race      sex  hours-per-week  Income  
0        Adm-clerical   White     Male              40   <=50K  
1     Exec-managerial   White     Male              13   <=50K  
2   Handlers-cleaners   White     Male              40   <=50K  
3   Handlers-cleaners   Black     Male              40   <=50K  
4      Prof-specialty   Black   Female              40   <=50K  


# Checking missing values

In [64]:
missing_values_count = df.isnull().sum()
print(missing_values_count)

age               0
workClass         0
Education         0
marital-status    0
occupation        0
race              0
sex               0
hours-per-week    0
Income            0
dtype: int64


In [65]:
for col in df.columns:
    if df[col].dtype == object:  # Only check columns with object (string) dtype
        print(df[col].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workClass, dtype: int64
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: Education, dtype: int64
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial

SIZE OF DATASET

In [66]:
rows, cols = df.shape
print("Number of rows:", rows)
print("Number of columns:", cols)

Number of rows: 32561
Number of columns: 9


In [67]:
df = pd.DataFrame({'age': [25, 30, 35, None, 40],
                   'workClass': ['Private', None, 'Local-gov', '?', 'State-gov'],
                   'Education': ['Bachelors', 'Masters', 'HS-grad', 'Some-college', 'Bachelors']})

In [68]:
df['workClass'].fillna('Unknown', inplace=True)

In [69]:
print(df['workClass'].isnull().sum())

0


# Removing all rows with ? values

In [70]:

# read in the data, specifying "?" as the missing value
df = pd.read_csv('adult-entire dataset.csv', na_values='?')

# drop rows with missing values
df = df.dropna()

# check the new size of the data frame
print(df.shape)

(32561, 9)


In [71]:
df = df[(df['workClass'] != '?') & (df['occupation'] != '?')]

In [72]:
print(df.shape)

(32561, 9)


In [73]:
import numpy as np

df = df.replace('?', np.nan)
df = df.dropna()

In [77]:
print(df['workClass'].value_counts())
print(df['Education'].value_counts())
print(df['marital-status'].value_counts())
print(df['occupation'].value_counts())
print(df['race'].value_counts())
print(df['sex'].value_counts())
print(df['Income'].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workClass, dtype: int64
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: Education, dtype: int64
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial

In [78]:
print(df['workClass'].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workClass, dtype: int64


In [79]:
df = df[df != '?'].dropna()

In [80]:
import numpy as np
import pandas as pd

df.replace('?', np.nan, inplace=True)

In [81]:
import pandas as pd
# Replace "?" with NaN
df.replace('?', pd.np.nan, inplace=True)

# Check for missing values
print(df.isnull().sum())

age               0
workClass         0
Education         0
marital-status    0
occupation        0
race              0
sex               0
hours-per-week    0
Income            0
dtype: int64


  df.replace('?', pd.np.nan, inplace=True)


In [82]:
print(df['workClass'].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workClass, dtype: int64


In [83]:
# Replace ? with NaN
df.replace(' ?', np.nan, inplace=True)

# Use strip method to remove leading/trailing whitespaces
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [84]:
print(df['workClass'].value_counts())

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workClass, dtype: int64


In [85]:
# Replace "?" with NaN
df.replace('?', np.nan, inplace=True)

# Remove rows with NaN values
df.dropna(inplace=True)

# Print the shape of the cleaned data
print(df.shape)

(30718, 9)


In [86]:
print(df['occupation'].value_counts())

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64


# Converting cat values to numerical

In [87]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Fit the label encoder to the 'workClass' column and transform the values
df['workClass'] = le.fit_transform(df['workClass'])

In [88]:
df["workClass"] = df["workClass"].astype("category").cat.codes
df["Education"] = df["Education"].astype("category").cat.codes
# Convert other categorical variables to numerical variables in a similar way

# Check the data types of all the columns
print(df.dtypes)

age                int64
workClass           int8
Education           int8
marital-status    object
occupation        object
race              object
sex               object
hours-per-week     int64
Income            object
dtype: object


In [89]:
df["marital-status"] = df["marital-status"].astype("category").cat.codes
df["occupation"] = df["occupation"].astype("category").cat.codes
df["race"] = df["race"].astype("category").cat.codes
df["sex"] = df["sex"].astype("category").cat.codes
df["Income"] = df["Income"].astype("category").cat.codes

In [90]:
print(df.dtypes)

age               int64
workClass          int8
Education          int8
marital-status     int8
occupation         int8
race               int8
sex                int8
hours-per-week    int64
Income             int8
dtype: object


In [91]:
print(df['marital-status'].unique())

[4 2 0 3 5 1 6]


Married-civ-spouse: 4
Never-married: 2
Divorced: 0
Separated: 1
Widowed: 3

In [92]:
print(df['workClass'].unique())

[5 4 2 0 1 3 6]


In [93]:
print(df['occupation'].unique())

[ 0  3  5  9  7 11  2 13  4  6 12 10  1  8]


In [94]:
print(df['Education'].unique())

[ 9 11  1 12  6 15  7  8  5 10 14  4  0 13  2  3]


In [95]:
print(df['sex'].unique())

[1 0]


In [96]:
print(df['sex'].value_counts())

1    20788
0     9930
Name: sex, dtype: int64


4.	Eliminate any columns from the data that may not be relevant to your objectives

# I'm going to remove race and gender columns from my dataset

In [97]:
df = df.drop(['race', 'sex'], axis=1)

In [98]:
print(df.head())

   age  workClass  Education  marital-status  occupation  hours-per-week  \
0   39          5          9               4           0              40   
1   50          4          9               2           3              13   
2   38          2         11               0           5              40   
3   53          2          1               2           5              40   
4   28          2          9               2           9              40   

   Income  
0       0  
1       0  
2       0  
3       0  
4       0  
