In [1]:
import cuml
import cudf
import nvcategory

import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score

In [2]:
# Load the Black Friday dataset into a cuDF DataFrame.
# The file is a zip archive; read_csv is handed the compressed path directly
# and decompresses it while reading into memory.
data_path = '/rapids/Data/black-friday.zip'
gdf = cudf.read_csv(data_path)

In [3]:
# Peek at the first rows. Converting the small head slice with to_pandas()
# gives the nicely formatted table output in the notebook.
preview = gdf.head()
preview.to_pandas()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
#Exercise: Let's do some descriptive statistics 

In [5]:
#Hint: try some of the functions you may know from Pandas like DataFrame.Series.max() or look up the documentation here:

In [6]:
# Keep only the first character of the years-in-city string (this discards
# the plus sign in values such as '4+'), then convert that character to an int.
first_char = gdf.Stay_In_Current_City_Years.str.get(0)
gdf['city_years'] = first_char.stoi()

In [7]:
# Recode the city category letters as digit strings with replace(), which lets
# us choose the numeric value each category gets, then convert the strings to ints.
for letter, digit in (('A', '1'), ('B', '2'), ('C', '3')):
    gdf['City_Category'] = gdf.City_Category.str.replace(letter, digit)
gdf['City_Category'] = gdf['City_Category'].str.stoi()

In [8]:
#EXERCISE: replace Gender in the same way as City Category

In [9]:
#Hint: the Gender column only has values 'M' and 'F'

In [10]:
#Solution


In [11]:
# Count the distinct products.
# Product_ID is a string column, so it is first encoded with nvcategory and
# the resulting values are wrapped in a cuDF Series before counting uniques.
product_categories = nvcategory.from_strings(gdf.Product_ID.data)
product_codes = cudf.Series(product_categories.values())
prod_count = product_codes.unique().count()
print("Unique Products: {}".format(prod_count))

Unique Products: 3623


In [12]:
# Count the distinct primary product categories.
# This column is numeric rather than a string, so unique() can be called on
# it directly without going through nvcategory.
unique_primary_categories = gdf.Product_Category_1.unique()
prod1_count = unique_primary_categories.count()
print("Unique Product Categories: {}".format(prod1_count))

Unique Product Categories: 18


In [13]:
# Replace missing secondary-category values with 0.
category_2_filled = gdf['Product_Category_2'].fillna(0)
gdf['Product_Category_2'] = category_2_filled

In [14]:
#EXERCISE: Make a variable that's 1 if the product is multi-category, 0 otherwise

In [15]:
#Hint: think about how to combine the Product Category 2 and Product Category 3

In [16]:
#Solution: 


In [17]:
#EXERCISE: Create a Gender/Marital Status Interaction Effect

In [18]:
#Hint: both Gender and Marital Status are 0/1

In [19]:
#Solution:


In [20]:
# Occupation is a code rather than a quantity, so it should be converted
# into indicator variables: one 'occ_dummy' column per distinct value.
occupation_codes = gdf.Occupation.unique()
gdf = gdf.one_hot_encoding('Occupation', 'occ_dummy', occupation_codes)

In [21]:
# Indicator variables from an integer column: City_Category is already numeric.
city_codes = gdf.City_Category.unique()
gdf = gdf.one_hot_encoding('City_Category', 'city_cat', city_codes)

# Indicator variables from a string column: Age is first encoded with
# nvcategory, the values are stored back into the frame, and then the
# column is one-hot encoded.
cat = nvcategory.from_strings(gdf.Age.data)
gdf['Age'] = cudf.Series(cat.values())
age_codes = gdf.Age.unique()
gdf = gdf.one_hot_encoding('Age', 'age', age_codes)

#EXERCISE: Create dummy variables from Product Category 1

In [22]:
#Solution:


In [23]:
# Drop the identifier columns and the variables we have already transformed.
drop_list = [
    'User_ID',
    'Age',
    'Stay_In_Current_City_Years',
    'City_Category',
    'Product_ID',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]
gdf = gdf.drop(drop_list)

In [24]:
#We're going to make a list of all the first indicator variables in a series now so it will be
#easier to exclude them when we're doing regressions later

In [25]:
# Columns to leave out of the regression inputs: one indicator per dummy
# group plus the 'Purchase' target.
# NOTE(review): 'product_1' is not created by any cell shown here; presumably
# it comes from the Product Category 1 dummy exercise -- confirm before running.
dummy_list = [
    'occ_dummy_0',
    'city_cat_1',
    'age_0',
    'product_1',
    'Purchase',
]

In [26]:
# Some cuML methods currently need every column to share one dtype,
# so cast each column of the frame to float64.
column_names = gdf.columns.tolist()
for column_name in column_names:
    gdf[column_name] = gdf[column_name].astype('float64')

In [27]:
# Row counts for the split: 20% of the rows go to training and the
# remainder to test.
total_rows = len(gdf)
train_size = round(total_rows * 0.2)
test_size = total_rows - train_size

In [28]:
# Training set: the first `train_size` rows of the frame (positional slice,
# no shuffling).
# The later modelling cells read this slice as `gdf_train`, so bind that name
# here; otherwise a fresh Restart & Run All fails with a NameError.
gdf_train = gdf.iloc[0:train_size]
# Keep the original `train` name as an alias for anything that still uses it.
train = gdf_train

In [30]:
#EXERCISE: Make the test set in a similar way

In [32]:
#Solution:


In [33]:
# Remove the name of the full frame: we will be building further subsets
# from here on, so it is nice to have the memory back.
del gdf

In [34]:
# Regression inputs: the training frame without the columns named in dummy_list.
X_reg = gdf_train.drop(dummy_list)
# Regression target: the purchase amount.
y_train = gdf_train['Purchase']

In [35]:
# # I'm going to perform a hyperparameter search for alpha in a ridge regression
# for alpha in np.arange(0.0, 1, 0.01):
    
#     Ridge = cuml.Ridge(alpha=alpha, fit_intercept=True)
#     _fit = Ridge.fit(X_reg, y_train)
#     _y_hat = _fit.predict(X_reg)
#     _roc = roc_auc_score(y_train, _y_hat)
#     output['MSE_RIDGE_{}'.format(alpha)] = _roc

# print('MAX AUC: {}'.format(min(output, key=output.get)))

In [36]:
# Ridge = cuml.Ridge(alpha=.1, fit_intercept=True)
# _fit = Ridge.fit(X_reg, y_train)
# _y_hat = _fit.predict(X_reg)
# _roc = roc_auc_score(y_train, _y_hat)
# output['MSE_RIDGE_{}'.format(alpha)] = _roc

In [39]:
# y_xgb = gdf_train[['Purchase']]
# X_xgb = gdf_train.drop('Purchase')
# xgb_train_set = xgb.DMatrix(data=X_xgb, label=y_xgb)

In [40]:
# xgb_params = {
#     'nround':100,
#     'max_depth':4,
#     'max_leaves':2**4,
#     'tree_method':'gpu_hist',
#     'n_gpus':1,
#     'loss':'ls',
#     'objective':'reg:linear',
#     'max_features':'auto',
#     'criterion':'friedman_mse',
#     'grow_policy':'lossguide',
#     'verbose':True
# }

In [41]:
# xgb_model = xgb.train(xgb_params, dtrain=xgb_train_set)

In [42]:
# y_hat_xgb = xgb_model.predict(xgb_train_set)

In [43]:
# RMSE = np.sqrt(mean_squared_error(y_xgb['Purchase'].to_pandas(), y_hat_xgb)) #get out of sample RMSE too

In [44]:
# print(RMSE)

In [47]:
#EXERCISE: Change XGB around to predict if someone is married based on the data we have

In [48]:
#Hint: in the xgb parameters, change the objective function to 'reg:logistic'

In [49]:
#Solution
# y_xgb = gdf_train[['Marital_Status']]
# X_xgb = gdf_train.drop('Marital_Status')
# xgb_train_set = xgb.DMatrix(data=X_xgb, label=y_xgb)

# xgb_params = {
#     'nround':100,
#     'max_depth':4,
#     'max_leaves':2**4,
#     'tree_method':'gpu_hist',
#     'n_gpus':1,
#     'loss':'ls',
#     'objective':'reg:logistic',
#     'max_features':'auto',
#     'criterion':'friedman_mse',
#     'grow_policy':'lossguide',
#     'verbose':True
# }

# xgb_model = xgb.train(xgb_params, dtrain=xgb_train_set)
# y_hat_xgb = xgb_model.predict(xgb_train_set)
# AUC = roc_auc_score(y_xgb['Marital_Status'].to_pandas(), y_hat_xgb)
# print(AUC)

In [50]:
#EXTRA EXERCISE: Apply kNN to the customers
#EXTRA EXERCISE: Apply PCA to data