In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 1. Preprocess the dataset: project_adult.csv & project_validation_inputs.csv
-  Handle missing values.
-  Encode categorical features.
- Standardize numerical features.

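- **Open question:** the second data file (`project_validation_inputs.csv`) has been added. Does that mean we no longer need a train/test split? Note that it has no `class` column, so it cannot be used on its own to score a model.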

In [32]:
# Reading in the training data

file_path = '../data/project_adult.csv'

# header=None keeps the CSV's own header line as data row 0 and its unnamed
# first column as column 0; preprocess_data() strips both later on.
#
# Every handler prints a readable message and then re-raises: if the load
# fails, `df` would otherwise be left undefined (or stale from an earlier run
# of this cell) and the following cells would carry on with the wrong state.
try:
    df = pd.read_csv(file_path, header=None, encoding='utf-8')
    print(f"Successfully loaded '{file_path}'. First 5 rows:")
    print(df.head())
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    raise
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or contains no data.")
    raise
except pd.errors.ParserError as e:
    print(f"Error: A parsing error occurred while reading '{file_path}': {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

Successfully loaded '../data/project_adult.csv'. First 5 rows:
        0    1                 2       3          4              5   \
0      NaN  age         workclass  fnlwgt  education  education-num   
1   5514.0   33         Local-gov  198183  Bachelors             13   
2  19777.0   36           Private   86459  Assoc-voc             11   
3  10781.0   58  Self-emp-not-inc  203039        9th              5   
4  32240.0   21           Private  180190  Assoc-voc             11   

                   6                7              8      9       10  \
0      marital-status       occupation   relationship   race     sex   
1       Never-married   Prof-specialty  Not-in-family  White  Female   
2  Married-civ-spouse  Exec-managerial        Husband  White    Male   
3           Separated     Craft-repair  Not-in-family  White    Male   
4  Married-civ-spouse  Farming-fishing        Husband  White    Male   

             11            12              13              14      15  
0  ca

In [33]:
# Reading in the test data

file_path = '../data/project_validation_inputs.csv'

# header=None keeps the CSV's own header line as data row 0 and its unnamed
# first column as column 0; preprocess_test_data() strips both later on.
#
# Every handler prints a readable message and then re-raises: if the load
# fails, `df_test` would otherwise be left undefined (or stale from an earlier
# run of this cell) and the following cells would carry on with the wrong state.
try:
    df_test = pd.read_csv(file_path, header=None, encoding='utf-8')
    print(f"Successfully loaded '{file_path}'. First 5 rows:")
    print(df_test.head())
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    raise
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or contains no data.")
    raise
except pd.errors.ParserError as e:
    print(f"Error: A parsing error occurred while reading '{file_path}': {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

Successfully loaded '../data/project_validation_inputs.csv'. First 5 rows:
        0    1          2       3             4              5   \
0      NaN  age  workclass  fnlwgt     education  education-num   
1  14160.0   27    Private  160178  Some-college             10   
2  27048.0   45  State-gov   50567       HS-grad              9   
3  28868.0   29    Private  185908     Bachelors             13   
4   5667.0   30    Private  190040     Bachelors             13   

                   6                  7              8      9       10  \
0      marital-status         occupation   relationship   race     sex   
1            Divorced       Adm-clerical  Not-in-family  White  Female   
2  Married-civ-spouse    Exec-managerial           Wife  White  Female   
3  Married-civ-spouse    Exec-managerial        Husband  Black    Male   
4       Never-married  Machine-op-inspct  Not-in-family  White  Female   

             11            12              13              14  
0  capital-ga

In [34]:
# (rows, columns) of the raw validation frame, then of the raw training frame
display(df_test.shape, df.shape)

(6514, 15)

(26049, 16)

In [35]:
# Per-column tally of cells holding the "?" placeholder in the training frame
question_counts = df.eq("?").sum(axis=0)
print(question_counts)

0        0
1        0
2     1447
3        0
4        0
5        0
6        0
7     1454
8        0
9        0
10       0
11       0
12       0
13       0
14     458
15       0
dtype: int64


In [36]:
# process and standardize data
def preprocess_data(df, return_transformers=False):
    """Clean, encode and standardize the raw training frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded with ``header=None``: row 0 holds the column
        names and column 0 holds a row id; both are dropped. The remaining
        15 columns are renamed to the 14 feature names plus ``class``.
        The caller's frame is not modified.
    return_transformers : bool, default False
        When True, additionally return the fitted per-column
        ``LabelEncoder`` objects and the fitted ``StandardScaler`` so the
        exact same transformations can be re-applied to other data.

    Returns
    -------
    X : pandas.DataFrame
        ``original_index`` plus the feature columns; categorical columns are
        integer-encoded and the numeric columns are standardized.
    y : pandas.DataFrame
        ``original_index`` and the binarized ``class`` column
        (1 when the raw value is ``'>50K'``, otherwise 0).
    label_encoders : dict
        Column name -> fitted ``LabelEncoder``. Only returned when
        ``return_transformers`` is True.
    scaler : StandardScaler
        Scaler fitted on the numeric columns. Only returned when
        ``return_transformers`` is True.
    """
    # Drop the first column (seems like just a random index) and first row (column names).
    # .copy() makes the result independent of the caller's frame, so the
    # cleaning below can neither write back into the raw data nor trigger
    # SettingWithCopyWarning.
    df = df.iloc[1:, 1:].copy()

    # correct column names
    new_column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
    df.columns = new_column_names

    # Handle missing values: '?' is the missing-value placeholder; the three
    # affected columns are imputed with their most frequent value.
    df = df.replace('?', np.nan)
    for col in ('workclass', 'occupation', 'native-country'):
        df[col] = df[col].fillna(df[col].mode()[0])

    print(f"Total missing values in the DataFrame: {df.isnull().sum().sum()}")

    # Binarize the target variable
    df['class'] = df['class'].apply(lambda x: 1 if x == '>50K' else 0)

    # keep track of rows indexes to connect X and y
    df = df.reset_index(names=['original_index'])

    # Columns to encode
    columns_to_encode = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country', 'sex']

    # Encode each categorical column as integers, keeping the fitted encoder
    # per column so the mapping can be reused or inverted later.
    label_encoders = {}
    for col in columns_to_encode:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Everything was read as strings (header row was part of the data), so
    # convert all columns to numeric dtypes; unparsable values become NaN.
    df = df.apply(pd.to_numeric, errors='coerce')

    # Separate features and target
    X = df.drop(columns=['class'])
    y = df[['original_index', 'class']]

    # Standardize numerical features
    numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    scaler = StandardScaler()
    X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    # Preview of the encoded frame before scaling (X is a separate frame)
    display(df.head())

    # Check means and standard deviations
    print("Means after scaling:\n", X[numeric_cols].mean())
    print("\nStandard deviations after scaling:\n", X[numeric_cols].std())

    if return_transformers:
        return X, y, label_encoders, scaler
    return X, y

In [37]:
# preprocess training data
# preprocess_data returns the feature frame X and the target frame y; both
# carry an 'original_index' column that can be used to line their rows up.
X, y = preprocess_data(df)

Total missing values in the DataFrame: 0


Unnamed: 0,original_index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,1,33,1,198183,9,13,4,9,1,4,0,0,0,50,38,1
1,2,36,3,86459,8,11,2,3,0,4,1,0,1887,50,38,1
2,3,58,5,203039,6,5,5,2,1,4,1,0,0,40,38,0
3,4,21,3,180190,8,11,2,4,0,4,1,0,0,46,38,0
4,5,27,3,279872,15,10,0,7,1,4,1,0,0,40,38,0


Means after scaling:
 age               1.636692e-16
fnlwgt            1.936753e-17
education-num    -6.137597e-18
capital-gain     -2.427760e-17
capital-loss     -2.836934e-17
hours-per-week   -2.373204e-16
dtype: float64

Standard deviations after scaling:
 age               1.000019
fnlwgt            1.000019
education-num     1.000019
capital-gain      1.000019
capital-loss      1.000019
hours-per-week    1.000019
dtype: float64


In [38]:
# process and standardize data
def preprocess_test_data(df, label_encoders=None, scaler=None):
    """Clean, encode and standardize the raw (unlabelled) validation frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded with ``header=None``: row 0 holds the column
        names and column 0 holds a row id; both are dropped. The remaining
        14 columns are renamed to the feature names (there is no ``class``
        column). The caller's frame is not modified.
    label_encoders : dict, optional
        Column name -> already fitted ``LabelEncoder``. When given, these
        encoders are applied with ``transform`` so the integer codes match
        the data they were fitted on. When omitted, a new encoder is fitted
        on this frame for every categorical column.
    scaler : StandardScaler, optional
        Already fitted scaler for the numeric columns. When given it is
        applied with ``transform``; when omitted, a new scaler is fitted on
        this frame.

    Returns
    -------
    pandas.DataFrame
        ``original_index`` plus the 14 feature columns, categorical columns
        integer-encoded and numeric columns standardized.

    Raises
    ------
    KeyError
        If ``label_encoders`` is given but lacks one of the encoded columns.
    ValueError
        Raised by ``LabelEncoder.transform`` if this frame contains a
        category the supplied encoder was not fitted on.
    """
    # Drop the first column (seems like just a random index) and first row (column names).
    # .copy() makes the result independent of the caller's frame, so the
    # cleaning below can neither write back into the raw data nor trigger
    # SettingWithCopyWarning.
    df = df.iloc[1:, 1:].copy()

    # correct column names
    new_column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
    df.columns = new_column_names

    # Handle missing values: '?' is the missing-value placeholder.
    # NOTE: the fill value is the most frequent value of THIS frame.
    df = df.replace('?', np.nan)
    for col in ('workclass', 'occupation', 'native-country'):
        df[col] = df[col].fillna(df[col].mode()[0])

    print(f"Total missing values in the DataFrame: {df.isnull().sum().sum()}")

    # keep track of rows indexes so predictions can be tied back to input rows
    df = df.reset_index(names=['original_index'])

    # Columns to encode
    columns_to_encode = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country', 'sex']

    # Encode categorical features. Reuse the supplied encoders when available
    # so a category maps to the same integer as in the data they were fitted
    # on; otherwise fall back to fitting on this frame.
    for col in columns_to_encode:
        if label_encoders is not None:
            df[col] = label_encoders[col].transform(df[col])
        else:
            df[col] = LabelEncoder().fit_transform(df[col])

    # Everything was read as strings (header row was part of the data), so
    # convert all columns to numeric dtypes; unparsable values become NaN.
    df = df.apply(pd.to_numeric, errors='coerce')

    # No target column here: the whole frame is the feature matrix.
    X = df

    # Standardize numerical features, with the supplied scaler if there is one
    numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    if scaler is None:
        X[numeric_cols] = StandardScaler().fit_transform(X[numeric_cols])
    else:
        X[numeric_cols] = scaler.transform(X[numeric_cols])

    # Preview of the final frame (numeric columns are already scaled here)
    display(X.head())

    # Check means and standard deviations. They are ~0 and ~1 only when the
    # scaler was fitted on this frame; with a supplied scaler they show how
    # far this frame sits from the data the scaler was fitted on.
    print("Means after scaling:\n", X[numeric_cols].mean())
    print("\nStandard deviations after scaling:\n", X[numeric_cols].std())

    return X

In [39]:
# preprocess test data
# NOTE(review): as called here, preprocess_test_data fits its own
# LabelEncoders and StandardScaler on df_test, so the integer codes and the
# scaling are not guaranteed to match those produced for the training data.
X_df_test = preprocess_test_data(df_test)

Total missing values in the DataFrame: 0


Unnamed: 0,original_index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,1,-0.851191,2,-0.277853,15,-0.031693,0,0,1,4,0,-0.147225,-0.211274,-0.212164,36
1,2,0.469374,5,-1.30409,11,-0.421896,2,3,5,4,0,-0.147225,-0.211274,-0.051009,36
2,3,-0.704461,2,-0.036955,9,1.138915,2,3,0,2,1,-0.147225,-0.211274,1.157652,36
3,4,-0.631097,2,0.001731,9,1.138915,4,6,1,4,0,-0.147225,-0.211274,-0.051009,36
4,5,-0.704461,4,-0.004766,15,-0.031693,0,2,1,4,1,0.128973,-0.211274,0.754765,36


Means after scaling:
 age              -7.527629e-17
fnlwgt            1.210966e-16
education-num     1.996458e-16
capital-gain     -2.427388e-17
capital-loss      5.454804e-18
hours-per-week    2.705583e-16
dtype: float64

Standard deviations after scaling:
 age               1.000077
fnlwgt            1.000077
education-num     1.000077
capital-gain      1.000077
capital-loss      1.000077
hours-per-week    1.000077
dtype: float64
