In [22]:
import numpy as np
import pandas as pd
import os 
# Working directory for the relative paths used below ('sorted_events.csv',
# './item_properties_part*.csv', 'item_features.csv').
# Set the SA2C_DATA_DIR environment variable to run this notebook on another
# machine; when it is unset the original hard-coded location is used.
os.chdir(os.environ.get('SA2C_DATA_DIR', '/Users/yj/Library/CloudStorage/GoogleDrive-zyj960225@gmail.com/My Drive/MIDS/AIPI_531/HW_3/SA2C_code/Kaggle/data'))

In [23]:
def load_dataframes(sorted_events, n_files=2, path_name="./item_properties_part"):
    """Read the events CSV and the numbered item-property CSV parts.

    Args:
        sorted_events: Path of the events CSV, passed to ``pd.read_csv``.
        n_files: Number of item-property parts to read. Parts are numbered
            from 1, i.e. ``<path_name>1.csv`` ... ``<path_name><n_files>.csv``.
        path_name: Path prefix shared by all item-property parts.

    Returns:
        Tuple ``(events_df, item_features_df)`` where ``item_features_df`` is
        the row-wise concatenation of all parts with a fresh 0..N-1 index.
    """
    events_frame = pd.read_csv(sorted_events)

    part_frames = []
    for part_number in range(1, n_files + 1):
        part_frames.append(pd.read_csv(f"{path_name}{part_number}.csv"))

    features_frame = pd.concat(part_frames, ignore_index=True)
    return events_frame, features_frame

def preprocess_item_features(sorted_events_df, item_features_df):
    """Filter the raw item-property rows and build the ``property_value`` key.

    Steps, in order:
      1. Keep only rows whose ``itemid`` occurs in ``sorted_events_df["item_id"]``.
      2. Drop exact duplicate rows.
      3. Drop the rows where ``property == "available"`` and ``value == '0'``.
         Only those rows are removed; any other property rows of the same
         item are kept.
      4. Add ``property_value``: whitespace-stripped ``property`` immediately
         followed by whitespace-stripped ``value``.
      5. Drop the ``timestamp`` column and de-duplicate again, so rows that
         differed only by timestamp collapse into one.

    Args:
        sorted_events_df: Events frame with an ``item_id`` column.
        item_features_df: Item-property frame with ``itemid``, ``property``,
            ``value`` and ``timestamp`` columns; ``property`` and ``value``
            must hold strings (the ``.str`` accessor is used on both).

    Returns:
        A new DataFrame without ``timestamp`` and with ``property_value``
        appended. The input frames are not modified.
    """
    event_item_ids = sorted_events_df["item_id"].unique()
    in_events = item_features_df["itemid"].isin(event_item_ids)
    kept = item_features_df[in_events].drop_duplicates()

    # Remove the "available = 0" marker rows (string comparison on purpose:
    # the value column is compared against the literal '0').
    is_unavailable_row = (kept["property"] == "available") & (kept["value"] == '0')
    kept = kept[~is_unavailable_row]

    # Use assign() rather than setting a column on the filtered frame: the
    # latter is a chained assignment and raises SettingWithCopyWarning.
    # NOTE(review): property and value are joined without a separator, so
    # e.g. ("1", "23") and ("12", "3") produce the same key - confirm this
    # is acceptable for the data at hand.
    kept = kept.assign(
        property_value=kept["property"].str.strip() + kept["value"].str.strip()
    )
    return kept.drop(columns=["timestamp"]).drop_duplicates()

def one_hot_encode_features(sorted_events_df, item_features_df, top_features=500):
    """One-hot encode the most frequent ``property_value`` keys per item.

    Args:
        sorted_events_df: Events frame with an ``item_id`` column. Every
            distinct ``item_id`` gets one output row, in ascending order.
        item_features_df: Frame with ``itemid`` and ``property_value``
            columns (as produced by ``preprocess_item_features``).
        top_features: Number of most frequent ``property_value`` keys to
            encode. Fewer columns are produced when fewer distinct keys exist.

    Returns:
        Tuple ``(encoded_df, itemids)``:
          * ``encoded_df``: 0/1 DataFrame with one row per item and one
            column per selected key, ordered by descending frequency.
            Columns and index carry default integer labels; the key names
            are not attached.
          * ``itemids``: list of the item ids, aligned with the rows.
        Items without any row in ``item_features_df`` get an all-zero row.
    """
    unique_event_items = sorted_events_df["item_id"].unique()
    unique_event_items.sort()
    properties = (
        item_features_df["property_value"].value_counts().head(top_features).index.tolist()
    )

    # Build the item -> {property_value} lookup in one pass instead of
    # re-scanning the whole feature frame once per item. Only rows carrying
    # a selected key can ever produce a 1, so the rest are dropped first.
    selected_rows = item_features_df[item_features_df["property_value"].isin(properties)]
    properties_by_item = {
        item: set(values)
        for item, values in selected_rows.groupby("itemid")["property_value"]
    }
    no_properties = frozenset()

    one_hot_encoded = []
    for item in unique_event_items:
        item_properties = properties_by_item.get(item, no_properties)
        one_hot_encoded.append([1 if prop in item_properties else 0 for prop in properties])
    itemids = list(unique_event_items)

    print("One hot encoding done.")
    return pd.DataFrame(one_hot_encoded), itemids

def create_feature_matrix(
    sorted_events,
    n_files=2,
    path_name="./item_properties_part",
    top_features=500,
):
    """Run the full pipeline: load the CSVs, clean them, one-hot encode.

    Args:
        sorted_events: Path of the events CSV.
        n_files: Number of item-property CSV parts to read.
        path_name: Path prefix of the item-property parts.
        top_features: Number of most frequent property/value keys to encode.

    Returns:
        Whatever ``one_hot_encode_features`` returns: the encoded DataFrame
        and the list of item ids aligned with its rows.
    """
    events_df, raw_features_df = load_dataframes(
        sorted_events=sorted_events,
        n_files=n_files,
        path_name=path_name,
    )
    cleaned_features_df = preprocess_item_features(
        sorted_events_df=events_df,
        item_features_df=raw_features_df,
    )
    encoded = one_hot_encode_features(
        sorted_events_df=events_df,
        item_features_df=cleaned_features_df,
        top_features=top_features,
    )
    return encoded

In [24]:
# Build the one-hot item-feature matrix; the item-id list returned alongside
# it is bound to the throwaway name `_`.
df_ohe, _ = create_feature_matrix(
    sorted_events='sorted_events.csv',
    n_files=2,
    path_name="./item_properties_part",
    top_features=500,
)

One hot encoding done.


In [25]:
# Preview the first rows of the encoded matrix (one row per item, one 0/1 column per feature).
df_ohe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Save the encoded matrix to CSV in the current working directory.
# index=False: the DataFrame index is not written, and the item ids are not
# part of df_ohe, so rows in the file are identified by position only.
df_ohe.to_csv('item_features.csv', index=False)