In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os

# Sklearn
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# Helper Functions
from function import date_time_convertion, list_to_cols

# Getting Started

1. Download the data, if not already in the current directory
Wget may not work, so visit https://drive.google.com/file/d/1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE/view?usp=sharing to download the data from Google Drive.

In [16]:
if 'KuaiRec 2.0' not in os.listdir() and 'data' not in os.listdir():
    # !wget 'https://docs.google.com/uc?export=download&id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE' --no-check-certificate -O data.zip
    print('Data file not found!')
    if 'KuaiRec 2.0.zip' in os.listdir():
        print('Zipfile found. Unziping')
        !unzip 'KuaiRec.zip' -d ./

2. Switch to the KuaiRec directory

In [17]:
# Work from inside the extracted dataset folder so that every later path can be
# written relative to it. When a 'data' directory is already visible we are
# either inside the folder already or the data was unpacked in place, so the
# working directory is left alone.
DIR = "KuaiRec 2.0/"
rootpath = "./"
if not ('data' in os.listdir()):
    os.chdir(DIR)
print(rootpath)

./


In [18]:
# Load every table used by the notebook. All paths are built from `rootpath`
# so the data location can be changed in a single place (two of the reads
# previously ignored it).

# print("Loading big matrix...")
# big_matrix = pd.read_csv(rootpath + "data/big_matrix.csv")
print("Loading small matrix...")
small_matrix = pd.read_csv(rootpath + "data/small_matrix.csv")

print("Loading social network...")
social_network = pd.read_csv(rootpath + "data/social_network.csv")
# NOTE(review): `eval` executes whatever text is stored in the CSV cell. The
# column holds list literals, so `ast.literal_eval` would be a safer drop-in
# if the file cannot be fully trusted.
social_network["friend_list"] = social_network["friend_list"].map(eval)

print("Loading item features...")
item_categories = pd.read_csv(rootpath + "data/item_categories.csv")
# NOTE(review): same `eval` caveat as for `friend_list` above.
item_categories["feat"] = item_categories["feat"].map(eval)

print("Loading user features...")
user_features = pd.read_csv(rootpath + "data/user_features.csv")

print("Loading items' daily features...")
item_daily_features = pd.read_csv(rootpath + "data/item_daily_features.csv")

print("All data loaded.")

Loading small matrix...
Loading social network...
Loading item features...
Loading user features...
Loading items' daily features...
All data loaded.


## Data Exploration

Since we mainly work with `small_matrix`, our exploration focuses on that dataset.

In [19]:
# Preview the first rows of the interaction table to check its columns and value formats.
small_matrix.head()

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1593898000.0,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1593898000.0,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1593898000.0,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1593898000.0,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1593899000.0,0.418364


In [20]:
# sum([True for idx, row in small_matrix.iterrows() if any(row.isnull())])
# Report the table size and how many rows contain at least one missing value.
print(f"Small Matrix shape: {small_matrix.shape}")
# A row is incomplete when any of its cells is NaN; counting the boolean mask
# yields the same number as comparing row counts before and after dropna().
rows_wnans = int(small_matrix.isna().any(axis=1).sum())
print(f"There are {rows_wnans} rows with NaNs")

Small Matrix shape: (4676570, 8)
There are 181992 rows with NaNs


Since there are 181992 rows that have NaNs, we will impute them in order not to lose valuable data. We could then compare how our results differ if we were to drop these rows instead of imputing them.

## Data Preparation

#### Convert Date time to integers

In [21]:
import pandas as pd
import numpy as np
import datetime as dt

def date_time_convertion(col: pd.Series) -> pd.Series:
    """Convert a datetime-like column to seconds elapsed since its earliest value.

    NOTE(review): this definition shadows the `date_time_convertion` helper
    imported from `function` at the top of the notebook; keep only one of them.

    Parameters
    ----------
    col : pd.Series
        Values parseable by ``pd.to_datetime`` (e.g. timestamp strings).

    Returns
    -------
    pd.Series
        Float seconds between each entry and the minimum of the column.
        Entries that parse to ``NaT`` come back as ``NaN``. The input is not
        modified.
    """
    parsed = pd.to_datetime(col)
    # Vectorised subtraction replaces the per-row Python loop, which wrote
    # floats one at a time into a datetime-typed Series via `.iloc`.
    return (parsed - parsed.min()).dt.total_seconds()

# Replace the "time" column with the seconds elapsed since the earliest timestamp in the table.
# NOTE(review): the column is overwritten in place, so re-running this cell feeds the
# already-converted numbers back into the converter. Reload `small_matrix` (or restart
# the kernel) before running it a second time.
small_matrix["time"] = date_time_convertion(small_matrix["time"])

#### Merge Datasets

In [7]:
# merged_df = pd.merge(small_matrix, item_categories, on="video_id", how="left")
# print(merged_df.shape)

# # merged_df = pd.merge(small_matrix, item_categories, on="video_id", how="inner")
# # print(merged_df.shape) 
# # This to ensure that every video_id in small_matrix is also in item_categories

# merged_df = pd.merge(merged_df, social_network, on="user_id", how="left")
# print(merged_df.shape)

# # merged_df = pd.merge(small_matrix, social_network, on="user_id", how="inner")
# # print(merged_df.shape) 


In [22]:
# Attach each video's category list (`feat`) to its interactions. A left join is
# used so interactions whose video has no entry in item_categories are kept.
small_matrix_merged = pd.merge(
    small_matrix,
    item_categories,
    how="left",
    on="video_id",
)
# Optional extra join, currently disabled:
# small_matrix_merged = small_matrix_merged.merge(social_network, on="user_id", how="left")
display(small_matrix_merged.head())

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,feat
0,14,148,4381,6067,97462.318,20200705.0,1593898000.0,0.722103,"[11, 28, 19]"
1,14,183,11635,6100,97473.997,20200705.0,1593898000.0,1.907377,[28]
2,14,3649,22422,10867,97543.419,20200705.0,1593898000.0,2.063311,[9]
3,14,5262,4479,7908,97637.225,20200705.0,1593898000.0,0.566388,[25]
4,14,8234,4602,11000,97937.399,20200705.0,1593899000.0,0.418364,[6]


In [23]:
# Expand the list-valued "feat" column with the project helper `list_to_cols`.
# NOTE(review): the output displayed by the following cell shows feat_1..feat_4 columns
# in place of "feat" — presumably created here; confirm against the helper's source.
# NOTE(review): the name is rebound to the helper's output, so re-running this cell on
# its own (without re-running the merge above) may not be idempotent.
small_matrix_merged = list_to_cols(small_matrix_merged, "feat")

#### Data Imputation

In [24]:
# KNN Imputation
# imputer = KNNImputer(n_neighbors=20)
# df_filled = imputer.fit_transform(small_matrix)

# Median imputation (swap .median() for .mode()[0] to impute with the mode instead).
# Each column's fill value is computed first, then its missing entries are replaced.
column_medians = {
    name: small_matrix_merged[name].median()
    for name in small_matrix_merged.columns
}
for name, fill_value in column_medians.items():
    small_matrix_merged[name] = small_matrix_merged[name].fillna(fill_value)
display(small_matrix_merged)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,feat_1,feat_2,feat_3,feat_4
0,14,148,4381,6067,97462.318,20200705.0,1.593898e+09,0.722103,11,28.0,19.0,0.0
1,14,183,11635,6100,97473.997,20200705.0,1.593898e+09,1.907377,28,0.0,0.0,0.0
2,14,3649,22422,10867,97543.419,20200705.0,1.593898e+09,2.063311,9,0.0,0.0,0.0
3,14,5262,4479,7908,97637.225,20200705.0,1.593898e+09,0.566388,25,0.0,0.0,0.0
4,14,8234,4602,11000,97937.399,20200705.0,1.593899e+09,0.418364,6,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4676565,7162,2267,11908,5467,2423337.210,20200801.0,1.596224e+09,2.178160,25,0.0,0.0,0.0
4676566,7162,2065,11919,6067,2423337.210,20200801.0,1.596224e+09,1.964562,9,17.0,0.0,0.0
4676567,7162,1296,16690,19870,2423337.210,20200801.0,1.596224e+09,0.839960,1,5.0,0.0,0.0
4676568,7162,4822,11862,24400,2423337.210,20200801.0,1.596224e+09,0.486148,9,0.0,0.0,0.0


#### Train-Test Split

In [25]:
# Target is the watch ratio; every remaining column is used as a feature.
# NOTE(review): X still contains play_duration and video_duration, and in the rows
# displayed above watch_ratio looks like their quotient — confirm this is intended.
y = small_matrix_merged["watch_ratio"]
X = small_matrix_merged.drop(columns="watch_ratio")

In [26]:
# Shuffled hold-out split: 18% of the rows go to the test set; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.18,
    shuffle=True,
    random_state=2024,
)

### MAB - LinUCB

**TODO**

In [None]:
def compute_ucb(x, A, b, alpha):
    """Upper confidence bound(s) of the LinUCB estimate for context ``x``.

    For an arm with design matrix ``A`` and response vector ``b`` the score is
    ``x^T A^-1 b + alpha * sqrt(x^T A^-1 x)``.

    Parameters
    ----------
    x : np.ndarray
        Context vector of length D.
    A : array-like
        Either one (D, D) matrix (single arm) or K stacked matrices — an
        array of shape (K, D, D) or a list of K (D, D) arrays.
    b : array-like
        Single arm: a vector of length D. Stacked: anything reshapeable to
        (K, D), e.g. a (K, D) array or a list of K (D, 1) columns.
    alpha : float
        Exploration weight.

    Returns
    -------
    np.ndarray
        Single arm: the same value the original single-matrix formula
        produced. Stacked: an array of shape (K,), one score per arm.

    Raises
    ------
    ValueError
        If ``A`` is neither 2-D nor 3-D.
    """
    A = np.asarray(A, dtype=float)

    if A.ndim == 2:
        # Single arm: keep the original expression so existing callers get
        # exactly the result they got before.
        Ainv = np.linalg.inv(A)
        Rhat = x.T @ Ainv @ b
        Uhat = alpha * np.sqrt(x.T @ Ainv @ x)
        return np.array(Rhat + Uhat)

    if A.ndim != 3:
        raise ValueError(f"A must have 2 or 3 dimensions, got {A.ndim}")

    # Stacked arms. Plain `@` would broadcast (K, D) against (K, D, 1) into a
    # (K, K, 1) result, so the per-arm contraction is spelled out with einsum.
    x = np.asarray(x, dtype=float).reshape(-1)
    n_arms = A.shape[0]
    b = np.asarray(b, dtype=float).reshape(n_arms, x.size)
    Ainv = np.linalg.inv(A)  # batched inverse, shape (K, D, D)
    Rhat = np.einsum("d,kde,ke->k", x, Ainv, b)
    Uhat = alpha * np.sqrt(np.einsum("d,kde,e->k", x, Ainv, x))
    return Rhat + Uhat


def linucb(alpha, trials, contexts=None, arms=None, rewards=None, seed=None):
    """Run LinUCB (one linear model per arm) over randomly sampled logged events.

    Every trial samples one logged row, scores all arms with ``compute_ucb``
    and plays the highest-scoring arm. A logged row only reveals the reward of
    the arm that was actually logged, so the model of the chosen arm is
    updated only when the chosen arm equals the logged arm.

    NOTE(review): the original cell left the regret expression empty. Here the
    per-trial regret is measured against a policy that always replays the
    logged arm: 0 when the chosen arm matches it, otherwise the logged reward
    that was not collected. Rewards of non-logged arms are not available in a
    per-row table, so this is a proxy — confirm it is the metric you want.

    Parameters
    ----------
    alpha : float
        Exploration weight passed to ``compute_ucb``.
    trials : int
        Number of sampled events.
    contexts : DataFrame or 2-D array-like, optional
        One context row per logged event. Defaults to the notebook global ``X``.
    arms : 1-D array-like, optional
        Logged arm label per event. Defaults to
        ``small_matrix_merged["video_id"]``.
    rewards : 1-D array-like, optional
        Logged reward per event. Defaults to the notebook global ``y``.
    seed : int, optional
        When given, event sampling uses a private ``random.Random(seed)``;
        otherwise the module-level ``random`` state is used, as before.

    Returns
    -------
    np.ndarray
        Array of length ``trials`` with the per-trial regret.

    Raises
    ------
    ValueError
        If the inputs disagree in length, ``contexts`` is not 2-D, or there
        are no events to sample from while ``trials`` is positive.
    """
    # Fall back to the notebook globals only when the caller passed nothing.
    if contexts is None:
        contexts = X
    if arms is None:
        arms = small_matrix_merged["video_id"]
    if rewards is None:
        rewards = y

    # DataFrames are indexed row by row below, which avoids copying the whole
    # table into one float array; anything else is converted once up front.
    is_frame = hasattr(contexts, "iloc")
    if not is_frame:
        contexts = np.asarray(contexts, dtype=float)
        if contexts.ndim != 2:
            raise ValueError("contexts must be 2-dimensional (events x features)")
    # The feature dimension comes from the context table itself, so it always
    # matches the vectors handed to compute_ucb (the frame that still holds
    # "watch_ratio" has one column too many).
    n_events, D = contexts.shape

    rewards = np.asarray(rewards, dtype=float).reshape(-1)
    arm_ids, logged_arm = np.unique(np.asarray(arms).reshape(-1), return_inverse=True)
    logged_arm = logged_arm.reshape(-1)
    if not (len(rewards) == len(logged_arm) == n_events):
        raise ValueError("contexts, arms and rewards must have the same number of rows")

    regret = np.zeros(trials)
    if trials <= 0:
        return regret
    if n_events == 0:
        raise ValueError("cannot sample events from empty data")

    K = len(arm_ids)
    A = np.tile(np.eye(D), (K, 1, 1))  # one (D, D) design matrix per arm
    b = np.zeros((K, D))               # one response vector per arm
    rng = random if seed is None else random.Random(seed)

    for i in range(trials):
        # Get Data — the index range follows the data instead of a literal row count.
        idx_data = rng.randrange(n_events)
        row = contexts.iloc[idx_data] if is_frame else contexts[idx_data]
        x = np.asarray(row, dtype=float).reshape(-1)

        # Pull Arm
        UCB = compute_ucb(x, A, b, alpha)
        idx = int(np.argmax(UCB))
        reward = rewards[idx_data]

        if idx == logged_arm[idx_data]:
            # Update A and b — the logged reward belongs to this arm, so it is valid feedback.
            A[idx] += np.outer(x, x)
            b[idx] += reward * x
        else:
            # Compute Regret — the logged reward was not collected (see docstring NOTE).
            regret[i] = reward

    return regret