In [1]:
# -----------------------------------------------
# 1. Install dependencies and import libraries
# -----------------------------------------------
import os
%pip install google-cloud-bigquery db-dtypes --quiet

from google.cloud import bigquery
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns



Note: you may need to restart the kernel to use updated packages.


In [2]:
# Make sure you've run: gcloud auth application-default login
# No project or credentials are passed explicitly, so the client has to pick
# both up from the environment (presumably the Application Default
# Credentials created by the command above -- confirm for your setup).
client = bigquery.Client()

In [3]:
# Fetch every column for rows whose `timestamp` (compared as UNIX seconds)
# falls within the last 3600 seconds, i.e. the trailing hour.
query = """
SELECT *
FROM `scalable-streaming-analytics.streaming_data.processed_events`
WHERE timestamp >= UNIX_SECONDS(CURRENT_TIMESTAMP()) - 3600
"""

# Submit the query, then materialise the result set as a DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()
df.head()

I0000 00:00:1743712450.353514    2410 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


Unnamed: 0,user_id,event,content_id,timestamp
0,57,click,content_34,1743712000.0
1,10,share,content_5,1743712000.0
2,78,share,content_3,1743712000.0
3,78,click,content_31,1743712000.0
4,8,click,content_2,1743712000.0


In [4]:
# Quick data-quality report. sep="\n" puts each label on its own line; with
# print's default " " separator a stray space was inserted after the newline,
# shifting the first row of the Series repr out of alignment.
print("Missing values:", df.isnull().sum(), sep="\n")
print("\nData types:", df.dtypes, sep="\n")

# Convert UNIX timestamp (seconds since the epoch) to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

# Store the event type with pandas' categorical dtype (labels are unchanged)
df['event'] = df['event'].astype('category')


Missing values:
 user_id       0
event         0
content_id    0
timestamp     0
dtype: int64

Data types:
 user_id         Int64
event          object
content_id     object
timestamp     float64
dtype: object


In [5]:
# Discard rows that are exact duplicates of an earlier row (all columns compared).
df = df.drop_duplicates()


In [6]:
def remove_timestamp_outliers(df, column='timestamp', iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of ``column`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to filter on. Defaults to ``'timestamp'``,
            which preserves the original single-argument behaviour.
        iqr_multiplier: Width of the fences in units of the IQR. Defaults to
            the conventional 1.5.

    Returns:
        A new DataFrame (an independent copy) holding only the rows inside the
        fences. The original index labels are kept; ``df`` is not modified.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = iqr_multiplier * (q3 - q1)

    # Series.between is inclusive on both ends by default, matching >= / <=.
    within_fences = values.between(q1 - margin, q3 + margin)

    # Return an explicit copy: callers assign new columns to the result, and a
    # plain boolean-indexed slice triggers SettingWithCopyWarning on pandas < 3
    # whenever at least one row was filtered out.
    return df[within_fences].copy()

# Keep only the rows whose timestamp lies within the IQR-based bounds, then preview.
df = remove_timestamp_outliers(df)
df.head()


Unnamed: 0,user_id,event,content_id,timestamp
0,57,click,content_34,2025-04-03 20:31:47.289424640
1,10,share,content_5,2025-04-03 20:32:23.916934144
2,78,share,content_3,2025-04-03 20:32:08.355262464
3,78,click,content_31,2025-04-03 20:31:54.770574336
4,8,click,content_2,2025-04-03 20:32:58.555777792


In [7]:
# Rescale the timestamp column with a default MinMaxScaler and overwrite the
# column with the scaled values.
scaler = MinMaxScaler()
timestamp_column = df[['timestamp']]
df['timestamp'] = scaler.fit(timestamp_column).transform(timestamp_column)
df.head()

Unnamed: 0,user_id,event,content_id,timestamp
0,57,click,content_34,0.100614
1,10,share,content_5,0.518479
2,78,share,content_3,0.340944
3,78,click,content_31,0.185963
4,8,click,content_2,0.913657


In [8]:
# Create the output folder first: to_csv does not create missing parent
# directories and raises OSError when 'data/' is absent (e.g. a fresh checkout).
os.makedirs('data', exist_ok=True)
df.to_csv('data/cleaned_events.csv', index=False)