In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 1. Preprocess the dataset: project_adult.csv & project_validation_inputs.csv
-  Handle missing values.
-  Encode categorical features.
- Standardize numerical features.

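- **Open question (TODO):** a second data file, `project_validation_inputs.csv`, has been added. Does that mean we no longer need our own train/test split? Note that the validation file has no `class` column, so it cannot be used to score a model; a split of `project_adult.csv` is still needed for evaluation.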

In [2]:
# Reading in the training/test data
# NOTE: header=None keeps the CSV's own header line as data row 0 and the
# unnamed index as column 0; preprocess_data() strips both later.

file_path = '../data/project_adult.csv'

try:
    df = pd.read_csv(file_path, header=None, encoding='utf-8')
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    raise  # stop here: later cells cannot run without `df`
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or contains no data.")
    raise
except pd.errors.ParserError as e:
    print(f"Error: A parsing error occurred while reading '{file_path}': {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    # Only reached when the load succeeded.
    print(f"Successfully loaded '{file_path}'. First 5 rows:")
    print(df.head())

Successfully loaded '../data/project_adult.csv'. First 5 rows:
        0    1                 2       3          4              5   \
0      NaN  age         workclass  fnlwgt  education  education-num   
1   5514.0   33         Local-gov  198183  Bachelors             13   
2  19777.0   36           Private   86459  Assoc-voc             11   
3  10781.0   58  Self-emp-not-inc  203039        9th              5   
4  32240.0   21           Private  180190  Assoc-voc             11   

                   6                7              8      9       10  \
0      marital-status       occupation   relationship   race     sex   
1       Never-married   Prof-specialty  Not-in-family  White  Female   
2  Married-civ-spouse  Exec-managerial        Husband  White    Male   
3           Separated     Craft-repair  Not-in-family  White    Male   
4  Married-civ-spouse  Farming-fishing        Husband  White    Male   

             11            12              13              14      15  
0  ca

In [3]:
# Reading in the validation data
# NOTE: header=None keeps the CSV's own header line as data row 0 and the
# unnamed index as column 0; preprocess_validation_data() strips both later.

file_path = '../data/project_validation_inputs.csv'

try:
    df_test = pd.read_csv(file_path, header=None, encoding='utf-8')
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    raise  # stop here: later cells cannot run without `df_test`
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or contains no data.")
    raise
except pd.errors.ParserError as e:
    print(f"Error: A parsing error occurred while reading '{file_path}': {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    # Only reached when the load succeeded.
    print(f"Successfully loaded '{file_path}'. First 5 rows:")
    print(df_test.head())

Successfully loaded '../data/project_validation_inputs.csv'. First 5 rows:
        0    1          2       3             4              5   \
0      NaN  age  workclass  fnlwgt     education  education-num   
1  14160.0   27    Private  160178  Some-college             10   
2  27048.0   45  State-gov   50567       HS-grad              9   
3  28868.0   29    Private  185908     Bachelors             13   
4   5667.0   30    Private  190040     Bachelors             13   

                   6                  7              8      9       10  \
0      marital-status         occupation   relationship   race     sex   
1            Divorced       Adm-clerical  Not-in-family  White  Female   
2  Married-civ-spouse    Exec-managerial           Wife  White  Female   
3  Married-civ-spouse    Exec-managerial        Husband  Black    Male   
4       Never-married  Machine-op-inspct  Not-in-family  White  Female   

             11            12              13              14  
0  capital-ga

In [None]:
# Show (rows, columns) of each raw frame: validation first, then training/test
display(df_test.shape, df.shape)

(6514, 15)

(26049, 16)

In [5]:
# A literal '?' marks a missing value in this data.
# Tally the '?' entries per column of the raw training/test frame.
question_counts = df.eq("?").sum()
print(question_counts)

0        0
1        0
2     1447
3        0
4        0
5        0
6        0
7     1454
8        0
9        0
10       0
11       0
12       0
13       0
14     458
15       0
dtype: int64


In [None]:
# process and standardize the test/train data function
def preprocess_data(df, scaler=None):
    """Clean, one-hot encode and standardize the labelled training/test data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded with ``header=None``: row 0 holds the CSV header
        line and column 0 the unnamed index column; both are dropped here and
        the remaining 15 columns are renamed by position. The caller's frame
        is not modified.
    scaler : object, optional
        Scaler exposing ``fit_transform``. It is fitted on this data's numeric
        columns. Pass your own instance when the fitted scaler is needed
        afterwards (for example to apply the same scaling to other data).
        Defaults to a fresh ``StandardScaler``.

    Returns
    -------
    X : pandas.DataFrame
        ``original_index``, the six standardized numeric columns and the
        one-hot encoded categorical columns. ``original_index`` is only a row
        key linking X to y; it is not scaled.
    y : pandas.DataFrame
        ``original_index`` plus the binary ``class`` column
        (1 where the raw value is ``'>50K'``, otherwise 0).
    """
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
    categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country', 'sex']
    numeric_cols = ['age','fnlwgt','education-num','capital-gain', 'capital-loss', 'hours-per-week']

    # Drop the first column (seems like just a random index) and first row (column names).
    # .copy() detaches the slice so nothing below can write through to the caller's frame
    # (the previous in-place edits on the slice triggered SettingWithCopyWarning).
    data = df.iloc[1:, 1:].copy()

    # correct column names
    data.columns = column_names

    # Handle missing values: '?' becomes its own 'Missing' category,
    # as missing values only occur in categorical columns
    data = data.replace('?', 'Missing')

    # Binarize the target variable (vectorized comparison instead of a row-wise apply)
    data['class'] = (data['class'] == '>50K').astype('int64')

    # Keep track of rows indexes to connect X and y
    data = data.reset_index(names=['original_index'])

    # Encode categorical features
    data = pd.get_dummies(data, columns=categorical_cols, dtype=int)

    # Ensure all data types are numeric after encoding
    # (the numeric columns arrive as strings because the header row was read as data)
    data = data.apply(pd.to_numeric, errors='coerce')

    # Separate features and target
    X = data.drop(columns=['class'])
    y = data[['original_index', 'class']]

    # Standardize numerical features
    if scaler is None:
        scaler = StandardScaler()
    X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    # Display preprocessed data
    merged_df = pd.concat([X, y['class']], axis=1)
    display(merged_df.head())

    # Return the feature frame and the target frame
    return X, y

In [None]:
# Preprocess training/test data
# X: feature frame (still carries 'original_index'); y: 'original_index' + binary 'class'
X, y = preprocess_data(df)

Unnamed: 0,original_index,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Missing,...,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,sex_Female,sex_Male,class
0,1,-0.408756,0.080051,1.133702,-0.145715,-0.217998,0.77946,0,1,0,...,0,0,0,0,1,0,0,1,0,1
1,2,-0.188857,-0.981653,0.357049,-0.145715,4.457168,0.77946,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2,3,1.423734,0.126197,-1.97291,-0.145715,-0.217998,-0.03151,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,4,-1.288351,-0.090935,0.357049,-0.145715,-0.217998,0.455072,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,5,-0.848554,0.856334,-0.031277,-0.145715,-0.217998,-0.03151,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [8]:
# Process and standardize validation data function
# Kept as a separate function without any target-column handling, because the validation dataset has no 'class' column

def preprocess_validation_data(df, scaler=None, feature_columns=None):
    """Clean, one-hot encode and standardize the unlabelled validation data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded with ``header=None``: row 0 holds the CSV header
        line and column 0 the unnamed index column; both are dropped here and
        the remaining 14 columns are renamed by position. The caller's frame
        is not modified.
    scaler : object, optional
        An already-fitted scaler exposing ``transform`` (typically the one
        fitted on the training data). When given, it is applied as-is so the
        validation features share the training scaling. When omitted, a new
        ``StandardScaler`` is fitted on this data (the original behaviour).
    feature_columns : sequence of str, optional
        Column layout to align the result to (typically the training
        feature frame's columns). Columns missing from this data are added
        filled with 0, columns not listed are dropped, and the order follows
        ``feature_columns``. When omitted, the columns are whatever
        ``pd.get_dummies`` produced for this data.

    Returns
    -------
    pandas.DataFrame
        ``original_index``, the six standardized numeric columns and the
        one-hot encoded categorical columns. ``original_index`` is only a row
        key; it is not scaled.
    """
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
    categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country', 'sex']
    numeric_cols = ['age','fnlwgt','education-num','capital-gain', 'capital-loss', 'hours-per-week']

    # Drop the first column (seems like just a random index) and first row (column names).
    # .copy() detaches the slice so nothing below can write through to the caller's frame
    # (the previous in-place edit on the slice triggered SettingWithCopyWarning).
    data = df.iloc[1:, 1:].copy()

    # correct column names
    data.columns = column_names

    # Handle missing values: '?' becomes its own 'Missing' category,
    # as missing values only occur in categorical columns
    data = data.replace('?', 'Missing')

    # keep track of rows indexes
    data = data.reset_index(names=['original_index'])

    # Encode categorical features
    data = pd.get_dummies(data, columns=categorical_cols, dtype=int)

    # Ensure all data is numeric after encoding
    # (the numeric columns arrive as strings because the header row was read as data)
    X = data.apply(pd.to_numeric, errors='coerce')

    # Standardize numerical features
    if scaler is None:
        # No scaler supplied: fit one on this data
        scaler = StandardScaler()
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
    else:
        # Reuse the supplied, already-fitted scaler without refitting
        X[numeric_cols] = scaler.transform(X[numeric_cols])

    # Optionally align the one-hot columns to a reference layout
    if feature_columns is not None:
        X = X.reindex(columns=list(feature_columns), fill_value=0)

    # display preprocessed data
    display(X.head())

    return X

In [9]:
# preprocess validation data
# (this file has no target column, so only the feature frame is returned)
X_df_test = preprocess_validation_data(df_test)

Unnamed: 0,original_index,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Missing,...,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,sex_Female,sex_Male
0,1,-0.851191,-0.277853,-0.031693,-0.147225,-0.211274,-0.212164,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,2,0.469374,-1.30409,-0.421896,-0.147225,-0.211274,-0.051009,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,3,-0.704461,-0.036955,1.138915,-0.147225,-0.211274,1.157652,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,4,-0.631097,0.001731,1.138915,-0.147225,-0.211274,-0.051009,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,5,-0.704461,-0.004766,-0.031693,0.128973,-0.211274,0.754765,0,0,0,...,0,0,0,0,0,1,0,0,0,1
