## Data Preprocessing

In [28]:
import os
import math
import requests
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt

### Notebook Constants
# Location of the raw data file that this notebook downloads.
DATA_PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names, in file order, from the archive description.
# fnlwgt will be ignored.
COLUMNS = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'salary',
]

In [29]:
# Download the raw dataset text.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as data further down.
req = requests.get(DATA_PATH, timeout=30)
req.raise_for_status()

# csv.reader expects an iterable of lines. Given a bare string it iterates
# character by character, so the text is split into lines first.
# skipinitialspace drops the space that follows each comma in the file.
# NOTE: the reader is a one-shot iterator; it is exhausted after a single pass.
csvreader = csv.reader(req.text.splitlines(), delimiter=",", skipinitialspace=True)

In [30]:
def dataToDict(dataText, rowdelimiter="\n", coldelimiter=", "):
    """
    Parse delimited text into a dict of rows keyed by row number.

    The text is split into rows on ``rowdelimiter`` and every row is split
    into fields on ``coldelimiter``. Rows that are empty or contain only
    whitespace are skipped wherever they occur, so the result does not depend
    on how many blank lines the text happens to end with.

    Parameters
    ----------
    dataText : str
        The raw text to parse.
    rowdelimiter : str, optional
        Separator between rows. Defaults to a newline.
    coldelimiter : str, optional
        Separator between the fields of a row. Defaults to ``", "``.

    Returns
    -------
    dict
        Maps a consecutive 0-based row number to the list of string fields of
        that row. Empty when the text holds no non-blank rows.
    """
    # Filter blank rows instead of slicing off a fixed number of trailing rows:
    # a fixed slice drops real records whenever the text does not end in
    # exactly that many blank lines.
    rows = [row for row in dataText.split(rowdelimiter) if row.strip()]
    return {i: row.split(coldelimiter) for i, row in enumerate(rows)}

In [33]:
# Parse the downloaded text into one list of fields per record, keyed by row number.
customer_data = dataToDict(req.text)

# Build the frame from the parsed rows and discard the fnlwgt column.
customer_df = pd.DataFrame.from_dict(
    customer_data, orient='index', columns=COLUMNS
).drop(columns="fnlwgt")

# Cast numerical columns to numerical data type
numeric_columns = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
customer_df = customer_df.assign(
    **{name: pd.to_numeric(customer_df[name]) for name in numeric_columns}
)

In [35]:
from sklearn.preprocessing import LabelEncoder

# These attributes are categorical data
categorical_attributes = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

# Fit one encoder per column and keep each of them. Refitting a single shared
# encoder overwrites its classes on every column, so only the last column's
# mapping would remain available for inverse_transform.
label_encoders = {}
encoded_columns = {}
for cat in categorical_attributes:
    le = LabelEncoder()
    # Replaces each category with an integer label in the range 0..(n_categories - 1)
    encoded_columns[cat] = le.fit_transform(customer_df[cat])
    label_encoders[cat] = le

# Build the encoded frame once, instead of growing it with concat in the loop,
# and reuse customer_df's index so the concat below aligns row for row.
encoded_df = pd.DataFrame(encoded_columns, index=customer_df.index)

# Concatenate the categorical label encoded columns with the numerical columns
remaining_columns = [col for col in customer_df.columns if col not in categorical_attributes]
customer_df = pd.concat([encoded_df, customer_df[remaining_columns]], axis=1)

# Encode salary with an explicit mapping. An "everything else is 1" rule hides
# unexpected labels, and on a re-run of this cell it would turn every already
# encoded value into 1; here any unmapped value raises instead.
salary_codes = {"<=50K": 0, ">50K": 1}
salary_encoded = customer_df["salary"].map(salary_codes)
if salary_encoded.isna().any():
    unexpected = customer_df.loc[salary_encoded.isna(), "salary"].unique().tolist()
    raise ValueError(f"Unexpected salary labels: {unexpected}")
customer_df["salary"] = salary_encoded.astype("int64")

0        39
1        50
2        38
3        53
4        28
         ..
32556    27
32557    40
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64

In [None]:
# One panel per feature: for each distinct value of the feature, plot the mean
# of the salary column over the rows that have that value.
# The panel count is derived from the frame rather than hard-coded, and salary
# is excluded by name rather than by assuming it is the last column.
feature_columns = [col for col in customer_df.columns if col != "salary"]
panel_height = 40 / 13  # reproduces the original 6x40 figure for 13 features

fig, axes_grid = plt.subplots(
    len(feature_columns),
    1,
    figsize=(6, panel_height * len(feature_columns)),
    squeeze=False,  # always a 2-D array of axes, even with a single feature
)
axs = axes_grid[:, 0]  # 1-D view, one axis per feature

for ax, col in zip(axs, feature_columns):
    # Take x and y from the same groupby result so they always have the same
    # length and order (groupby sorts its keys and leaves out missing values).
    mean_salary = customer_df.groupby(col)["salary"].mean()
    ax.scatter(mean_salary.index, mean_salary.values)
    ax.set_title(f"{col} vs salary")
    ax.set_xlabel(col)
    ax.set_ylabel("mean salary")
    ax.grid(True)

# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()