Exercise 1: Using Data API - Loading and Preprocessing Data with TensorFlow

In [1]:
!pip install ucimlrepo
import tensorflow as tf
from tensorflow import feature_column
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [2]:
# Download dataset id 2 from the UCI Machine Learning Repository.
adult_dataset = fetch_ucirepo(id=2)

# Take a private copy of the feature table so the cleaning steps below can
# modify it freely; the targets are used exactly as returned.
X, y = adult_dataset.data.features.copy(), adult_dataset.data.targets

In [3]:
# Check for missing values
missing_values = X.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Handling Missing Values
# Forward-fill each gap from the previous row. `DataFrame.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling. The trailing `bfill()` only
# affects gaps in the very first rows, which have no previous row to copy from
# and would otherwise stay NaN.
X = X.ffill().bfill()

Missing values in each column:
 age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64


In [4]:
# Columns that hold category labels rather than numeric measurements.
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Replace every label with its integer category code. pandas assigns -1 to any
# value that is still missing at this point.
for col in categorical_columns:
    X[col] = X[col].astype('category').cat.codes


In [5]:
# Transformations and Standardization
# Here we are assuming all numerical columns need standardization
numerical_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Learn the mean/variance from the training rows only, so that statistics of
# the held-out test rows do not leak into preprocessing.
# train_test_split chooses row positions from the number of rows and
# random_state alone, so repeating the test_size/random_state used by the
# split cell below yields exactly the positions that end up in X_train.
# Keep these two arguments in sync with that cell.
train_rows = train_test_split(list(range(len(X))), test_size=0.2, random_state=42)[0]

scaler = StandardScaler()
scaler.fit(X.iloc[train_rows][numerical_columns])

# Apply the training-set scaling to every row (train and test alike).
X[numerical_columns] = scaler.transform(X[numerical_columns])


In [6]:
# Splitting Data
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle,
# and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Creating TensorFlow Dataset
def df_to_dataset(dataframe, labels, shuffle=True, batch_size=32):
    """Wrap a feature table and its labels in a batched ``tf.data.Dataset``.

    Args:
        dataframe: pandas DataFrame of features. Each column becomes one entry
            of the per-example feature dict, keyed by column name.
        labels: Labels aligned row-for-row with ``dataframe``.
        shuffle: If True, shuffle the examples with a buffer as large as the
            table, so the whole dataset is mixed rather than a sliding window.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches.
    """
    # The frame is only read here, so no defensive copy is needed.
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    # Prepare upcoming batches while the current one is being consumed.
    return ds.prefetch(tf.data.AUTOTUNE)

# Training examples are shuffled; evaluation examples keep their row order.
train_ds = df_to_dataset(X_train, y_train, shuffle=True)
test_ds = df_to_dataset(X_test, y_test, shuffle=False)