## Set up the SparkSession and Feathr client

#### Imports

In [36]:
import glob
import os
from pathlib import Path

import feathr
import pandas as pd
from feathr import (
    BOOLEAN,
    FLOAT,
    INPUT_CONTEXT,
    INT32,
    BackfillTime,
    DerivedFeature,
    FeathrClient,
    Feature,
    FeatureAnchor,
    FeatureQuery,
    HdfsSource,
    MaterializationSettings,
    ObservationSettings,
    RedisSink,
    TypedKey,
    ValueType,
    WindowAggTransformation,
)
from feathr.datasets.utils import maybe_download
from feathr.utils.config import generate_config
from feathr.utils.job_utils import get_result_df
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand
from pyspark.sql.types import DoubleType, IntegerType

# Base HDFS directory under which this notebook writes its parquet datasets
# and from which the Feathr source definitions read them back.
PATH_TO_APP_DATA = "hdfs://namenode:9000/data"

# Record the installed Feathr version in the cell output.
print(f"Feathr version: {feathr.__version__}")

Feathr version: 1.0.0


#### SparkSession

In [37]:
# Create (or reuse) the Spark session. Hadoop's default filesystem is pointed
# at the HDFS namenode; the application name is what Spark reports for this
# session.
spark = (
    SparkSession.builder  # type: ignore[attr-defined]
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .appName("write-synthetic-parquet-to-hdfs")
    .getOrCreate()
)

#### Feathr client

In [38]:
# Environment variables set before the Feathr client is constructed.
# NOTE(review): an empty REDIS_PASSWORD presumably targets a local Redis
# without authentication -- confirm before pointing at a shared instance.
os.environ['SPARK_LOCAL_IP'] = "127.0.0.1"
os.environ['REDIS_PASSWORD'] = ""

# Locate a jar in the working directory. glob returns matches in arbitrary
# order, so sort them to make the choice deterministic, and fail with a clear
# message (instead of a bare IndexError) when no jar is present.
jar_candidates = sorted(glob.glob("./*.jar"))
if not jar_candidates:
    raise FileNotFoundError(
        "No .jar file found in the current directory; "
        "place the Feathr jar next to this notebook before running this cell."
    )
jar_name = jar_candidates[0]
print(f"Found jar file at {jar_name}")

# Despite its name, this points at the Feathr YAML config file itself.
feathr_workspace_folder = Path("./feathr_config.yaml")

# Build the client from the YAML configuration file.
client = FeathrClient(str(feathr_workspace_folder))

2026-01-25 13:12:18.992 | INFO     | feathr.utils._env_config_reader:get:60 - Config secrets__azure_key_vault__name is not found in the environment variable, configuration file, or the remote key value store. Returning the default value: None.
2026-01-25 13:12:18.996 | INFO     | feathr.utils._env_config_reader:get:60 - Config offline_store__s3__s3_enabled is not found in the environment variable, configuration file, or the remote key value store. Returning the default value: None.
2026-01-25 13:12:18.997 | INFO     | feathr.utils._env_config_reader:get:60 - Config offline_store__adls__adls_enabled is not found in the environment variable, configuration file, or the remote key value store. Returning the default value: None.
2026-01-25 13:12:18.998 | INFO     | feathr.utils._env_config_reader:get:60 - Config offline_store__wasb__wasb_enabled is not found in the environment variable, configuration file, or the remote key value store. Returning the default value: None.
2026-01-25 13:12:18

Found jar file at ./feathr_2.12-1.0.0.jar


## Code

### Upload quick-start data to HDFS

In [51]:
# Discover the quick-start files with pathlib rather than the `!ls` shell
# magic: the cell becomes plain, portable Python, sub-directories are skipped,
# and a missing directory raises immediately instead of feeding an `ls` error
# message to pandas. Hidden files are skipped, as `ls` does by default.
quick_start_data_dir = Path("feathr_data")
quick_start_data_list = sorted(
    entry.name
    for entry in quick_start_data_dir.iterdir()
    if entry.is_file() and not entry.name.startswith(".")
)
if not quick_start_data_list:
    raise FileNotFoundError(f"No data files found in {quick_start_data_dir}/")

for csv_name in quick_start_data_list:
    # Dataset name is the file name up to its first dot; it also becomes the
    # last component of the HDFS target path.
    df_name = csv_name.split(".")[0]
    hdfs_path = f"{PATH_TO_APP_DATA}/{df_name}"

    # Load the CSV through pandas, then write it as a single parquet
    # partition, replacing whatever is already at the target path.
    df = spark.createDataFrame(pd.read_csv(quick_start_data_dir / csv_name))
    df.repartition(1).write.mode("overwrite").parquet(hdfs_path)

    # Preview only the observation datasets, reading them back from HDFS so
    # the output reflects what was actually written.
    if "observation" in df_name:
        print(f"====== {df_name} ======")
        spark.read.parquet(hdfs_path).show(5)

+-------+----------+---------------+--------------+
|user_id|product_id|event_timestamp|product_rating|
+-------+----------+---------------+--------------+
|     u1|        p1|     2023-01-01|             5|
|     u1|        p2|     2023-01-02|             4|
|     u2|        p1|     2023-01-03|             3|
|     u2|        p3|     2023-01-04|             5|
|     u3|        p2|     2023-01-05|             2|
+-------+----------+---------------+--------------+



### Try Feathr

#### Define Feathr source

In [52]:
# Feathr source over the `user_observation` parquet dataset under
# PATH_TO_APP_DATA. Event time is taken from the `event_timestamp` column,
# parsed with the yyyy-MM-dd pattern.
batch_source = HdfsSource(
    path=f"{PATH_TO_APP_DATA}/user_observation",
    name="user_observation",
    timestamp_format="yyyy-MM-dd",
    event_timestamp_column="event_timestamp",
)

#### 