In [1]:
from pathlib import Path

import pandas as pd
from sklearn import datasets

In [3]:
# Fetch the California housing dataset and wrap its feature matrix in a
# DataFrame whose columns are the dataset's own feature names.

cali_set = datasets.fetch_california_housing()
df = pd.DataFrame(data=cali_set["data"], columns=cali_set["feature_names"])

In [4]:
# Preview the first five feature rows.
df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [27]:
# Split the feature frame into four sets of features by column position
# (columns 0-1, 2-3, 4-5, and 6 onwards), and keep the regression target in a
# single-column frame of its own.
column_slices = (slice(None, 2), slice(2, 4), slice(4, 6), slice(6, None))
df1, df2, df3, df4 = (df.iloc[:, cols] for cols in column_slices)
target_df = pd.Series(cali_set.target, name="target").to_frame()

In [29]:
# Generate one arbitrary timestamp per feature row: a daily series with as
# many entries as `df` has rows, whose last entry is the moment this cell runs.
# NOTE: because the series is anchored on `pd.Timestamp.now()`, the exact
# values differ between runs.

event_index = pd.date_range(end=pd.Timestamp.now(), periods=len(df), freq="D")
timestamps = event_index.to_frame(index=False, name="event_timestamp")

In [30]:
# Preview the first five generated timestamps.
timestamps.head(n=5)

Unnamed: 0,event_timestamp
0,1966-06-26 12:49:29.518923
1,1966-06-27 12:49:29.518923
2,1966-06-28 12:49:29.518923
3,1966-06-29 12:49:29.518923
4,1966-06-30 12:49:29.518923


In [31]:
# Attach the shared event_timestamp column to each feature group.
#
# `assign` is used rather than `pd.concat([...], axis=1)` so that this cell is
# safe to re-run: concat would append a second, duplicate `event_timestamp`
# column every time the cell is executed again, whereas assign overwrites the
# existing column in place. On a first run the result is the same: the column
# is appended last and aligned on the row index.

event_timestamp = timestamps["event_timestamp"]
df1 = df1.assign(event_timestamp=event_timestamp)
df2 = df2.assign(event_timestamp=event_timestamp)
df3 = df3.assign(event_timestamp=event_timestamp)
df4 = df4.assign(event_timestamp=event_timestamp)

In [34]:
# Create an arbitrary id for every feature row: consecutive integers starting
# at 0, one per row of `df`.

ids = pd.DataFrame({"house_id": list(range(len(df)))})

In [35]:
# Add the house_id column to each feature group.
#
# `assign` is used rather than `pd.concat([...], axis=1)` so that this cell is
# safe to re-run: concat would append another duplicate `house_id` column on
# every execution, whereas assign overwrites the existing column in place. On a
# first run the result is the same: the column is appended last and aligned on
# the row index.

house_id = ids["house_id"]
df1 = df1.assign(house_id=house_id)
df2 = df2.assign(house_id=house_id)
df3 = df3.assign(house_id=house_id)
df4 = df4.assign(house_id=house_id)

In [36]:
# Preview the first feature group with its timestamp and id columns attached.
df1.head(n=5)

Unnamed: 0,MedInc,HouseAge,event_timestamp,house_id
0,8.3252,41.0,1966-06-26 12:49:29.518923,0
1,8.3014,21.0,1966-06-27 12:49:29.518923,1
2,7.2574,52.0,1966-06-28 12:49:29.518923,2
3,5.6431,52.0,1966-06-29 12:49:29.518923,3
4,3.8462,52.0,1966-06-30 12:49:29.518923,4


In [37]:
# Output: write each feature group and the target frame to
# data/<name>.parquet.

output_dir = Path("data")
# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (no-op if it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

frames_to_write = {
    "df1": df1,
    "df2": df2,
    "df3": df3,
    "df4": df4,
    "target_df": target_df,
}
for frame_name, frame in frames_to_write.items():
    frame.to_parquet(path=output_dir / f"{frame_name}.parquet")