In [None]:
import pyspark.pandas as ps

In [None]:
# Read every column of the census_adult_income table through Spark SQL, then
# wrap the resulting Spark DataFrame as a pandas-on-Spark DataFrame.
census_sdf = spark.sql("SELECT * FROM census_adult_income")
psdf = ps.DataFrame(census_sdf)

In [None]:
# Categorical columns to expand into one 0/1 indicator column per distinct value.
dummy_columns = [
    'workclass',
    'education',
    'occupation',
    'race',
    'sex',
    'income_bracket',
    'native_country'
]

# One-hot encode all listed columns in a single call. get_dummies accepts the
# whole list, so the frame does not have to be re-encoded once per column.
# The output is the same as encoding column by column: untouched columns
# first, then the indicator columns in the order listed above, each named
# '<column>_<value>' and stored as int64.
psdf = ps.get_dummies(psdf, columns=dummy_columns, dtype='int64')


In [None]:
from pyspark.sql.types import IntegerType

# Bin Ages
def age_binner(age) -> int:
    """Map an age to an ordinal age-group code between 1 and 6.

    Groups are defined by inclusive upper bounds:
    <=16 -> 1, <=29 -> 2, <=39 -> 3, <=49 -> 4, <=59 -> 5, otherwise 6.

    Args:
        age: Numeric age to classify.

    Returns:
        The code of the first group whose upper bound is >= ``age``,
        or 6 when no bound matches.
    """
    # (inclusive upper bound, group code), ascending so the first match wins.
    bounds_to_code = ((16, 1), (29, 2), (39, 3), (49, 4), (59, 5))
    for upper_bound, code in bounds_to_code:
        if age <= upper_bound:
            return code
    # No bound matched: age is above 59. A NaN age also lands here because
    # every comparison against NaN is False.
    return 6

# Derive the ordinal age-group feature from the raw age column. The function
# is passed directly; wrapping it in a lambda adds nothing to the call.
psdf['age_bins'] = psdf['age'].apply(age_binner)


In [None]:
# Print a concise summary of the frame to inspect the columns produced so far.
psdf.info()

In [None]:
# Dependent variable: 1 when marital_status is exactly 'Never-married', else 0.
def _never_married_flag(marital_status) -> int:
    """Return 1 if ``marital_status`` equals 'Never-married', otherwise 0."""
    return 1 if marital_status == 'Never-married' else 0


# The explicit ``-> int`` return hint lets pandas-on-Spark take the result
# type from the signature instead of first running the function on a sample
# of the data to infer it.
psdf['never_married'] = psdf['marital_status'].apply(_never_married_flag)


In [None]:
# Clean up column names for Delta Lake table compatibility.
# Maps each awkwardly named indicator column to its simplified name. The
# values are copied into the new column; the original column is left in place
# by this step.
delta_safe_names = {
    'income_bracket_>50K': 'income_bracket_gt_50k',
    'native_country_Outlying-US(Guam-USVI-etc)': 'native_country_Outlying-US',
    'native_country_Trinadad&Tobago': 'native_country_Trinadad-Tobago',
}
for original_name, cleaned_name in delta_safe_names.items():
    psdf[cleaned_name] = psdf[original_name]


In [None]:
# Drop columns that are no longer needed: the indicator columns that were
# copied to cleaned-up names, the complementary '<=50K' indicator, and the raw
# 'marital_status' and 'age' columns that the derived features were built from.
redundant_columns = [
    'native_country_Trinadad&Tobago',
    'native_country_Outlying-US(Guam-USVI-etc)',
    'income_bracket_>50K',
    'income_bracket_<=50K',
    'marital_status',
    'age',
]
psdf = psdf.drop(columns=redundant_columns)

In [None]:
# Convert the pandas-on-Spark frame to a plain Spark DataFrame.
# NOTE(review): to_spark() is called without index_col, so the pandas-on-Spark
# index is not carried over as a column — confirm that is intended.
df = psdf.to_spark()


In [None]:
# Make sure the target database exists before the feature table is created.
# Issued through spark.sql (as the data load does) so the cell is plain
# Python and does not depend on the %sql notebook magic being available.
spark.sql("CREATE DATABASE IF NOT EXISTS census_data")


In [None]:
from databricks import  feature_store

# Client used to talk to the Databricks Feature Store.
fs = feature_store.FeatureStoreClient()

# Fully qualified name (<database>.<table>) of the feature table to create.
feature_table_name = "census_data.census_adult_income_features"

# Feature tables are created from Spark DataFrames, so convert the
# pandas-on-Spark frame first.
features_sdf = psdf.to_spark()

# Register the engineered features as a new feature table keyed by 'uuid'.
fs.create_table(
    name=feature_table_name,
    primary_keys="uuid",
    df=features_sdf,
    description="Census features for predicting marital status",
)
