# ML testing on yoochoose dataset
## Date: April 2023
## Author: Ta-Wei
## Version: 0.1

In [1]:
%matplotlib inline

import numpy as np
import scipy as sp
import pandas as pd
import random
import itertools

from matplotlib import pyplot as plt
from datetime import datetime

import tensorflow as tf
import torch
import sklearn
import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.neural_network import MLPClassifier

from libsvm.svmutil import *

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

from sklearn import datasets

import networkx as nx


In [2]:
# Original data sets are located at:
# https://www.kaggle.com/datasets/chadgostopp/recsys-challenge-2015
# I will use yoochoose-buys.dat and yoochoose-test.dat to test graph creation and GCN ML build-test pipeline
#
# The raw data and jupyter notebook can be found at: C:/Users/c1twc/P_CCB
# To upload to Github, you need to copy/paste jupyter notebook to OneDrive location:
# Ta-Wei - Personal/Documents/GitHub/ZTA

#buy = pd.read_csv("yoochoose-buys.csv")
#test = pd.read_csv("yoochoose-test.csv")
test = pd.read_csv('yoochoose-test.dat', header=None, names=['sessionId','timeStamp','itemId','category'], dtype={'sessionId':'int64', 'timeStamp':'object','itemId':'int64','category':'string'})
buy = pd.read_csv("yoochoose-buys.dat", header=None, names=['sessionId', 'timeStamp', 'itemId', 'price','quantity'])
click = pd.read_csv('yoochoose-clicks.dat', header=None, names=['sessionId','timeStamp','itemId','category'], dtype={'sessionId':'int64','timeStamp':'object','itemId':'int64','category':'string'})

In [3]:
# reduce the size of dataframe to 1/100 for easier processing
buy_small = buy[0:11500]
click_small = click[0:330000]
buy_unique = list(set(buy_small['sessionId']))
click_unique = list(set(click_small['sessionId']))

In [4]:
# Pre-processing
# Split the raw timeStamp, e.g. 2014-04-06T18:44:58.314Z, into two new columns: 'date' and 'time'

date_lst = list(pd.to_datetime(click_small['timeStamp']).dt.date)
time = list(pd.to_datetime(click_small['timeStamp']).dt.time )
time_lst = [str(t) for t in time] # convert the datetime object to string with HH:MM:SS format
#click_small.insert(4, 'date', date_lst)
#click_small.insert(5, 'time', time_lst)

new_cols1 = {'date':date_lst,'time':time_lst}
click_small = click_small.assign(**new_cols1)

# Pre-processing
# Split the 'date' column into 3 new columns: 'year', 'month', and 'day'

year_lst = list(pd.to_datetime(click_small['date']).dt.year)
month_lst = list(pd.to_datetime(click_small['date']).dt.month)
day_lst = list(pd.to_datetime(click_small['date']).dt.day)

#click_small.insert(6, 'year', year_lst)
#click_small.insert(7, 'month', month_lst)
#click_small.insert(8, 'day', day_lst)

new_cols2 = {'year':year_lst, 'month':month_lst,'day':day_lst}
click_small = click_small.assign(**new_cols2)

# Take the 'time' string column containing HH:MM:SS string data, then split it into 3 columns: 'hour','minute',
# and 'second'
click_small[['hour','minute','second']] = click_small['time'].str.split(':', expand=True)

day_of_week_lst = list(pd.to_datetime(click_small['date']).dt.day_name())
click_small.insert(12, 'day_of_week', day_of_week_lst, True)

# Convert day_of_week from string to numbers
click_small.loc[click_small['day_of_week'] == 'Sunday', 'dow'] = 7
click_small.loc[click_small['day_of_week'] == 'Monday', 'dow'] = 1
click_small.loc[click_small['day_of_week'] == 'Tuesday', 'dow'] = 2
click_small.loc[click_small['day_of_week'] == 'Wednesday', 'dow'] = 3
click_small.loc[click_small['day_of_week'] == 'Thursday', 'dow'] = 4
click_small.loc[click_small['day_of_week'] == 'Friday', 'dow'] = 5
click_small.loc[click_small['day_of_week'] == 'Saturday', 'dow'] = 6

# Convert the column data type from float to int
click_small = click_small.astype({'dow':'int'})

# Drop redundant columns 
click_small = click_small.drop(['timeStamp','date','time'], axis=1)

In [5]:
# Create a 'purchase' column and assign '0' or '1' to each 'sessionId'
# '0': sessionId doesn't generate 'buy'
# '1': sessionId generates a 'buy' action
# 
# There are several way to populate the results for 'purchase' column; the following way is the fastest one
#
click_small['purchase'] = click_small['sessionId'].isin(buy_unique).astype(int)

#
# using Python list comprehensive is pretty slow; don't use it if the size of data is large
#
#purchase_lst = [1 if c in buy_unique else 0 for c in click_small['sessionId']]
#click_small.insert(4, "purchase", purchase_lst, True)

## Model: Logistic Regression
### (1) classification problem using supervised ML
### (2) feature vector: [itemId, category, month, day, hour, minute, dow]
### (3) prediction: [purchase]

In [6]:
# X: Feature set
# y: Target set

# Test feature set #1:
#features = click_small[['itemId','category','month','day','hour','minute','dow']]

# Test feature set #2:
features = click_small[['itemId','category','day','hour','minute','dow']]
targets = click_small[['purchase']]

In [7]:
# Split the training and testing datasets into 80% vs. 20% ration, with stratification and scaling applied
#
features_train, features_test, targets_train, targets_test = train_test_split(features, targets, test_size=0.2, random_state=42, stratify=targets)

#features_ndarray_train = features_train.to_numpy()

sc = StandardScaler()
sc.fit(features_train)
#sc.fit(feature_ndarray_train)

features_train_std = sc.transform(features_train)
#features_train_std = sc.transform(features_ndarray_train)
features_test_std = sc.transform(features_test)

In [8]:
#Convert Pandas DataFrame to an Numpy 1d array
#labels_lst_train = list(labels_train['purchase'])
#
targets_lst_train = targets_train['purchase'].values.flatten()
targets_lst_test  = targets_test['purchase'].values.flatten()

In [9]:
weights, params = [], []
for c in np.arange(-5, 5):
#    lr = LogisticRegression(C=10.**c, solver='lbfgs', multi_class='ovr')
    lr = LogisticRegression(C=10.**c, solver='liblinear', multi_class='ovr') # highly optimized solver
    lr.fit(features_train_std, targets_lst_train)
    weights.append(lr.coef_[0])
    params.append(10.**c)
    
weights = np.array(weights)

In [10]:
# Need to import the following library:
# from sklearn.model_selection import train_test_split
# 
# Reference:
# https://sparkbyexamples.com/pandas/pandas-create-test-and-train-samples-from-dataframe/
#
Click_train, Click_test = train_test_split(click_small, test_size=0.2, random_state=42)
Buy_train, Buy_test = train_test_split(buy_small, test_size=0.2, random_state=42)

## Model: SVC

In [11]:
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(features_train_std, targets_lst_train)

In [12]:
svm_acc = svm.score(features_test_std, targets_test)
print(f'Accuracy on test data set: {svm_acc:.3f}')

Accuracy on test data set: 0.968


In [13]:
nusvc = NuSVC(nu=0.03)
nusvc.fit(features_train_std, targets_lst_train)

In [14]:
nusvc_acc = nusvc.score(features_test_std, targets_test)
print(f'Accuracy on test data set: {nusvc_acc:.3f}')

Accuracy on test data set: 0.373


## Model:Multi-layer Perceptron (MLP)

In [15]:
clf = MLPClassifier(solver='lbfgs', activation='relu', hidden_layer_sizes=(40, 2), alpha=1e-5, random_state=42)
clf.fit(features_train_std, targets_lst_train)

In [None]:
clf_acc = clf.score(features_test_std, targets_test)
print(f'Accuracy on test data set: {clf_acc:.3f}')