In [1]:
import pandas as pd


In [2]:
def preprocess_events_data(df):
    """
    Preprocesses the events DataFrame.

    Adds two derived columns:
      * 'ISO datetime' - 'timestamp' parsed as milliseconds since the Unix epoch.
      * 'date'         - the calendar date (datetime.date objects) of 'ISO datetime'.

    The input is NOT modified; a new DataFrame is returned, so the raw frame
    stays intact no matter how many times a cell calling this is re-run.

    Args:
        df: DataFrame with a 'timestamp' column holding epoch milliseconds.

    Returns:
        A copy of ``df`` with the 'ISO datetime' and 'date' columns added
        (or overwritten if they already exist).

    Raises:
        KeyError: if ``df`` has no 'timestamp' column.
    """
    # Copy first so the column assignments below never leak into the caller's frame.
    processed = df.copy()
    processed["ISO datetime"] = pd.to_datetime(processed["timestamp"], unit="ms")
    processed["date"] = processed["ISO datetime"].dt.date
    return processed

def preprocess_item_properties_data(df):
    """
    Preprocesses an item properties DataFrame.

    Adds two derived columns:
      * 'ISO datetime' - 'timestamp' parsed as milliseconds since the Unix epoch.
      * 'date'         - the calendar date (datetime.date objects) of 'ISO datetime'.

    The input is NOT modified; a new DataFrame is returned, so the raw frame
    stays intact no matter how many times a cell calling this is re-run.

    Args:
        df: DataFrame with a 'timestamp' column holding epoch milliseconds.

    Returns:
        A copy of ``df`` with the 'ISO datetime' and 'date' columns added
        (or overwritten if they already exist).

    Raises:
        KeyError: if ``df`` has no 'timestamp' column.
    """
    # Copy first so the column assignments below never leak into the caller's frame.
    processed = df.copy()
    processed["ISO datetime"] = pd.to_datetime(processed["timestamp"], unit="ms")
    processed["date"] = processed["ISO datetime"].dt.date
    return processed


In [3]:
# Read the three raw CSV exports in one pass; the targets are bound in the same
# order as the paths are listed.
event_df, item_properties_part_1_df, item_properties_part_2_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/events.csv",
        "../dataset/item_properties_part1.csv",
        "../dataset/item_properties_part2.csv",
    )
)
# Quick look at the first rows of the raw events table.
event_df.head()


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [4]:
# Derive the datetime/date columns on copies, so the frames loaded from CSV are
# not the objects handed to the preprocessing functions.
event_df_processed = preprocess_events_data(event_df.copy())
(
    item_properties_part_1_df_processed,
    item_properties_part_2_df_processed,
) = (
    preprocess_item_properties_data(raw_part.copy())
    for raw_part in (item_properties_part_1_df, item_properties_part_2_df)
)

In [5]:
# `event_df_processed` was already built from a copy of `event_df` in the
# preprocessing cell above. Preprocessing `event_df` again here only repeated
# that work and handed the raw frame itself to the function, so this cell now
# just displays the result.
print("Events DataFrame head:")
event_df_processed.head()

Events DataFrame head:


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,ISO datetime,date
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02


In [6]:
# `item_properties_part_1_df_processed` was already built from a copy in the
# preprocessing cell above. Preprocessing the raw part-1 frame again here only
# repeated that work on the raw frame itself, so this cell now just displays
# the result. The label names part 1 only, because only part 1 is shown.
print("Item Properties Part 1 DataFrame head:")
item_properties_part_1_df_processed.head()

Item Properties Part 1 and 2 DataFrame head:


Unnamed: 0,timestamp,itemid,property,value,ISO datetime,date
0,1435460400000,460429,categoryid,1338,2015-06-28 03:00:00,2015-06-28
1,1441508400000,206783,888,1116713 960601 n277.200,2015-09-06 03:00:00,2015-09-06
2,1439089200000,395014,400,n552.000 639502 n720.000 424566,2015-08-09 03:00:00,2015-08-09
3,1431226800000,59481,790,n15360.000,2015-05-10 03:00:00,2015-05-10
4,1431831600000,156781,917,828513,2015-05-17 03:00:00,2015-05-17


In [7]:
# `item_properties_part_2_df_processed` was already built from a copy in the
# preprocessing cell above. Preprocessing the raw part-2 frame again here only
# repeated that work on the raw frame itself, so this cell now just displays
# the result.
print("Item Properties Part 2 DataFrame head:")
item_properties_part_2_df_processed.head()


Item Properties Part 2 DataFrame head:


Unnamed: 0,timestamp,itemid,property,value,ISO datetime,date
0,1433041200000,183478,561,769062,2015-05-31 03:00:00,2015-05-31
1,1439694000000,132256,976,n26.400 1135780,2015-08-16 03:00:00,2015-08-16
2,1435460400000,420307,921,1149317 1257525,2015-06-28 03:00:00,2015-06-28
3,1431831600000,403324,917,1204143,2015-05-17 03:00:00,2015-05-17
4,1435460400000,230701,521,769062,2015-06-28 03:00:00,2015-06-28


In [8]:
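# # Example (disabled): Calculate Recency for each visitor in event_df
# # NOTE(review): "date" holds datetime.date objects (object dtype), so the `.dt.days`
# # call below probably raises as written - confirm, or derive Recency from "ISO datetime".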
# latest_event_date = event_df_processed["date"].max()
# visitor_recency = event_df_processed.groupby("visitorid")["date"].max().reset_index()
# visitor_recency["Recency"] = (latest_event_date - visitor_recency["date"]).dt.days
# print("\nVisitor Recency (example):")
# print(visitor_recency.head())


In [9]:
# Persist each processed frame; index=False keeps the row index out of the CSV files.
event_df_processed.to_csv("../dataset/events_processed.csv", index=False)
item_properties_part_1_df_processed.to_csv("../dataset/item_properties_part1_processed.csv", index=False)
item_properties_part_2_df_processed.to_csv("../dataset/item_properties_part2_processed.csv", index=False)

# Stack both item-property parts into a single frame. Each part was read with its
# own default 0-based index, so without ignore_index=True the combined frame would
# carry every row label twice (once per part) and label-based lookups such as
# `.loc[0]` would return two rows.
item_properties_processed = pd.concat(
    [item_properties_part_1_df_processed, item_properties_part_2_df_processed],
    ignore_index=True,
)
item_properties_processed.head()

Unnamed: 0,timestamp,itemid,property,value,ISO datetime,date
0,1435460400000,460429,categoryid,1338,2015-06-28 03:00:00,2015-06-28
1,1441508400000,206783,888,1116713 960601 n277.200,2015-09-06 03:00:00,2015-09-06
2,1439089200000,395014,400,n552.000 639502 n720.000 424566,2015-08-09 03:00:00,2015-08-09
3,1431226800000,59481,790,n15360.000,2015-05-10 03:00:00,2015-05-10
4,1431831600000,156781,917,828513,2015-05-17 03:00:00,2015-05-17


In [11]:
# Most recent calendar date present in the processed events.
event_df_processed['date'].max()

datetime.date(2015, 9, 18)

In [10]:
# Write the combined (part 1 + part 2) item properties to a single CSV, without the row index.
item_properties_processed.to_csv("../dataset/item_properties_processed.csv", index=False)