In [1]:
import pandas as pd
import json
import sys
import ast
import os
import xgboost
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# The MinGW-w64 bin directory must be on PATH for XGBoost to work on a Windows environment.
# Only touch PATH on Windows, use the platform separator, and skip the prepend when the
# directory is already listed so re-running this cell does not keep growing PATH.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.1.0-posix-seh-rt_v5-rev0\\mingw64\\bin'
if os.name == 'nt' and mingw_path not in os.environ.get('PATH', '').split(os.pathsep):
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ.get('PATH', '')

In [2]:
def get_user_browsed_products(product_clicks):
    """Return the ``productId`` value of every row of ``product_clicks``, in row order.

    Parameters
    ----------
    product_clicks : pandas.DataFrame
        Frame exposing a ``productId`` column.

    Returns
    -------
    list
        One entry per row; an empty list when the frame has no rows.
    """
    # Read the column directly instead of materialising every row with iterrows().
    return product_clicks.productId.tolist()

In [3]:
def get_user_purchased_products(product_clicks):
    """Return the ``products`` value of every row of ``product_clicks``, in row order.

    Parameters
    ----------
    product_clicks : pandas.DataFrame
        Frame exposing a ``products`` column.

    Returns
    -------
    list
        One entry per row; an empty list when the frame has no rows.
    """
    # Read the column directly instead of materialising every row with iterrows().
    return product_clicks.products.tolist()

In [4]:
def get_spending_habits(purchase_list, catalog_prices):
    """Summarise a user's purchases into three spending features.

    Parameters
    ----------
    purchase_list : list or str
        One entry per purchase event; each entry is an iterable of dicts with
        ``'pid'`` and ``'quantity'`` keys. A string (e.g. the value read back from a
        CSV export) is parsed with ``ast.literal_eval`` first, mirroring
        ``get_browsing_habits``.
    catalog_prices : pandas.DataFrame
        Catalog exposing ``pid``, ``current_price``, ``is_sale`` and ``cluster``
        columns. Every purchased pid must match exactly one row.

    Returns
    -------
    tuple
        ``(avg_spending, likes_sales, favorite_cluster)`` where

        * ``avg_spending`` is the mean amount per purchase event, rounded to an int
          by adding 0.5 and truncating;
        * ``likes_sales`` is True when at least half of the purchased line items
          are flagged ``is_sale``;
        * ``favorite_cluster`` is the cluster with the highest score, or -1 when
          no cluster scored above zero.

        ``(-1, -1, -1)`` is returned when there are no purchase events.

    Raises
    ------
    ValueError
        Propagated from ``Series.item()`` when a pid matches zero or several
        catalog rows.
    """
    if isinstance(purchase_list, str):
        purchase_list = ast.literal_eval(purchase_list)

    spending_per_purchase = []
    sale_flags = []
    product_counts = {}   # pid -> number of line items, in first-seen order
    cluster_by_pid = {}   # pid -> catalog cluster, looked up once per pid

    for product_list in purchase_list:
        spending = 0
        for product in product_list:
            prod_id = product['pid']
            prod_qt = int(product['quantity'])
            catalog_entry = catalog_prices.loc[catalog_prices.pid == prod_id]
            sale_flags.append(catalog_entry.is_sale.item())
            spending += (catalog_entry.current_price.item() * prod_qt)
            product_counts[prod_id] = product_counts.get(prod_id, 0) + 1
            if prod_id not in cluster_by_pid:
                cluster_by_pid[prod_id] = catalog_entry.cluster.item()

        spending_per_purchase.append(spending)

    if not spending_per_purchase:
        return -1, -1, -1

    # Round half up (for non-negative amounts) rather than using banker's rounding.
    avg_spending = int(sum(spending_per_purchase) / len(spending_per_purchase) + 0.5)

    num_items_sale = sum(1 for on_sale in sale_flags if on_sale)
    # NOTE: purchase events that contain no products give 0 >= 0, i.e. True.
    likes_sales = num_items_sale >= (len(sale_flags) / 2)

    # Clusters 0..14 are pre-seeded so ties keep resolving to the lowest of those ids;
    # any other cluster id found in the catalog is added on demand instead of raising KeyError.
    cluster_freq = dict.fromkeys(range(15), 0)
    for pid, frequency in product_counts.items():
        cluster = cluster_by_pid[pid]
        # Each distinct product contributes 1 plus its number of occurrences.
        cluster_freq[cluster] = cluster_freq.get(cluster, 0) + 1 + frequency

    favorite_cluster = -1
    top_num_cluster = 0
    for key, freq in cluster_freq.items():
        if freq > top_num_cluster:
            favorite_cluster = key
            top_num_cluster = freq

    return avg_spending, likes_sales, favorite_cluster

In [5]:
def get_browsing_habits(browse_list, catalog_clusters):
    """Return the catalog cluster a user browsed the most.

    Parameters
    ----------
    browse_list : list or str
        Browsed product ids. A string is parsed with ``ast.literal_eval`` first.
    catalog_clusters : pandas.DataFrame
        Catalog exposing ``pid`` and ``cluster`` columns.

    Returns
    -------
    The cluster with the highest score, or -1 when nothing was browsed or none of
    the browsed products could be resolved to a single catalog row.
    """
    if isinstance(browse_list, str):
        browse_list = ast.literal_eval(browse_list)

    # pid -> number of views, in first-seen order.
    product_counts = {}
    for product in browse_list:
        product_counts[product] = product_counts.get(product, 0) + 1

    if not product_counts:
        return -1

    # Clusters 0..14 are pre-seeded so ties keep resolving to the lowest of those ids;
    # any other cluster id found in the catalog is added on demand instead of raising KeyError.
    cluster_freq = dict.fromkeys(range(15), 0)
    for pid, frequency in product_counts.items():
        try:
            cluster = catalog_clusters.loc[catalog_clusters.pid == pid].cluster.item()
        except ValueError:
            # .item() raises ValueError when the pid matches zero or several catalog rows;
            # such products are skipped (best effort). Other errors are no longer hidden.
            continue
        # Each distinct product contributes 1 plus its number of views.
        cluster_freq[cluster] = cluster_freq.get(cluster, 0) + 1 + frequency

    favorite_cluster = -1
    top_num_cluster = 0
    for key, freq in cluster_freq.items():
        if freq > top_num_cluster:
            favorite_cluster = key
            top_num_cluster = freq

    return favorite_cluster

In [6]:
# Load the raw event log: the file holds one JSON object per line.
with open('inputs/data.json', 'r') as file:
    records = [json.loads(line) for line in file]
raw_data = pd.DataFrame(records)

raw_data.head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
0,,pageview,F,cart,,,,2016-01-01 12:56:13,7f5dcd82442966b05f4ebba588c40357752d4a70
1,,pageview,M,cart,,,,2016-01-01 13:41:38,edb9cbe0bc232ae6ec219fefcf57dfbf0adaec8a
2,,pageview,M,checkout,,,,2016-01-01 13:54:50,edb9cbe0bc232ae6ec219fefcf57dfbf0adaec8a
3,,pageview,M,checkout,,,,2016-01-01 13:55:50,edb9cbe0bc232ae6ec219fefcf57dfbf0adaec8a
4,,pageview,M,cart,,,,2016-01-01 14:23:44,b8026364c974ef373aaac058558fbb358cdd2e19
5,,pageview,M,cart,,,,2016-01-01 14:23:53,b8026364c974ef373aaac058558fbb358cdd2e19
6,,pageview,M,cart,,,,2016-01-01 14:24:26,b8026364c974ef373aaac058558fbb358cdd2e19
7,,pageview,M,cart,,,,2016-01-01 14:26:06,b8026364c974ef373aaac058558fbb358cdd2e19
8,,pageview,M,cart,,,,2016-01-01 15:58:13,2d5f26235f2ee67cc8604ed49eafd7aef799931b
9,,pageview,F,cart,,,,2016-01-01 15:58:42,7f5dcd82442966b05f4ebba588c40357752d4a70


In [7]:
# Distinct event kinds, distinct page kinds, and the number of distinct users.
for column in ('event_type', 'page_type'):
    print(raw_data[column].unique())
print(raw_data['uid'].nunique(dropna=False))

['pageview' 'purchase']
['cart' 'checkout' 'search' 'confirmation' 'category' 'home' 'subcategory'
 'brand_landing' 'other' 'product' nan]
39995


In [8]:
# Inspect the first events of one sample user.
is_sample_user = raw_data['uid'].eq('0f5bd3d40e69a19454ec8579b9786c0130040c43')
rows = raw_data.loc[is_sample_user]
rows.head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
464,,pageview,M,category,,,,2016-01-03 23:55:46,0f5bd3d40e69a19454ec8579b9786c0130040c43
2775,,pageview,M,search,,,,2016-01-10 15:15:44,0f5bd3d40e69a19454ec8579b9786c0130040c43
18813,,pageview,M,cart,,,,2016-01-27 13:49:42,0f5bd3d40e69a19454ec8579b9786c0130040c43
18814,,pageview,M,cart,,,,2016-01-27 13:49:53,0f5bd3d40e69a19454ec8579b9786c0130040c43
18819,,pageview,M,cart,,,,2016-01-27 13:52:40,0f5bd3d40e69a19454ec8579b9786c0130040c43
18820,,pageview,M,cart,,,,2016-01-27 13:53:04,0f5bd3d40e69a19454ec8579b9786c0130040c43
18828,,pageview,M,checkout,,,,2016-01-27 14:15:06,0f5bd3d40e69a19454ec8579b9786c0130040c43
18829,,pageview,M,checkout,,,,2016-01-27 14:16:06,0f5bd3d40e69a19454ec8579b9786c0130040c43
18879,,pageview,M,checkout,,,,2016-01-27 10:49:20,0f5bd3d40e69a19454ec8579b9786c0130040c43
18941,,pageview,M,cart,,,,2016-01-27 12:01:52,0f5bd3d40e69a19454ec8579b9786c0130040c43


In [9]:
# Total events of the sample user, and how many of them are product page views.
rows = raw_data[raw_data['uid'].eq('0f5bd3d40e69a19454ec8579b9786c0130040c43')]
print(len(rows))
print((rows['page_type'] == 'product').sum())

143
0


In [10]:
# Check whether cart views carry any meaningful information.
is_cart_view = raw_data['page_type'].eq('cart')
raw_data[is_cart_view].head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
0,,pageview,F,cart,,,,2016-01-01 12:56:13,7f5dcd82442966b05f4ebba588c40357752d4a70
1,,pageview,M,cart,,,,2016-01-01 13:41:38,edb9cbe0bc232ae6ec219fefcf57dfbf0adaec8a
4,,pageview,M,cart,,,,2016-01-01 14:23:44,b8026364c974ef373aaac058558fbb358cdd2e19
5,,pageview,M,cart,,,,2016-01-01 14:23:53,b8026364c974ef373aaac058558fbb358cdd2e19
6,,pageview,M,cart,,,,2016-01-01 14:24:26,b8026364c974ef373aaac058558fbb358cdd2e19
7,,pageview,M,cart,,,,2016-01-01 14:26:06,b8026364c974ef373aaac058558fbb358cdd2e19
8,,pageview,M,cart,,,,2016-01-01 15:58:13,2d5f26235f2ee67cc8604ed49eafd7aef799931b
9,,pageview,F,cart,,,,2016-01-01 15:58:42,7f5dcd82442966b05f4ebba588c40357752d4a70
13,,pageview,M,cart,,,,2016-01-01 21:13:31,64ee880e9a59063714361d945eadf5547159137f
24,,pageview,F,cart,,,,2016-01-01 12:34:25,7f5dcd82442966b05f4ebba588c40357752d4a70


In [11]:
# They do not: neither product column holds anything but NaN for cart views.
cart_rows = raw_data[raw_data['page_type'] == 'cart']
for column in ('products', 'productId'):
    print(cart_rows[column].unique())

[nan]
[nan]


In [12]:
# Explore the remaining page types, starting with the home page.
is_home_view = raw_data['page_type'].eq('home')
raw_data[is_home_view].head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
589,,pageview,M,home,,,,2016-01-04 13:05:51,902ba3cda1883801594b6e1b452790cc53948fda
776,,pageview,M,home,,,,2016-01-05 15:04:02,902ba3cda1883801594b6e1b452790cc53948fda
777,,pageview,M,home,,,,2016-01-05 15:04:25,902ba3cda1883801594b6e1b452790cc53948fda
920,,pageview,M,home,,,,2016-01-05 16:47:18,902ba3cda1883801594b6e1b452790cc53948fda
1011,,pageview,M,home,,,,2016-01-05 12:59:08,902ba3cda1883801594b6e1b452790cc53948fda
1047,,pageview,M,home,,,,2016-01-05 20:15:58,902ba3cda1883801594b6e1b452790cc53948fda
1055,,pageview,M,home,,,,2016-01-05 11:47:25,da4b9237bacccdf19c0760cab7aec4a8359010b0
1239,,pageview,M,home,,,,2016-01-06 14:09:42,902ba3cda1883801594b6e1b452790cc53948fda
1251,,pageview,M,home,,,,2016-01-06 19:55:48,902ba3cda1883801594b6e1b452790cc53948fda
1329,,pageview,M,home,,,,2016-01-06 13:54:20,902ba3cda1883801594b6e1b452790cc53948fda


In [13]:
# Preview of brand landing page views.
raw_data[raw_data['page_type'].eq('brand_landing')].head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
273967,,pageview,M,brand_landing,,,,2016-07-05 17:25:54,902ba3cda1883801594b6e1b452790cc53948fda
273969,,pageview,M,brand_landing,,,,2016-07-05 17:30:04,902ba3cda1883801594b6e1b452790cc53948fda
274499,,pageview,M,brand_landing,,,,2016-07-06 16:14:53,902ba3cda1883801594b6e1b452790cc53948fda
275357,,pageview,M,brand_landing,,,,2016-07-07 22:57:55,cbdbd2a35862c032cd05ff7314ca808bc9fba71a
275896,,pageview,M,brand_landing,,,,2016-07-07 17:07:35,29bc900c228881c64fad09cd9a75139451a6dc78
276566,,pageview,M,brand_landing,,,,2016-07-08 12:11:12,70088f5e0017437b35c9929446d1208e2f47ee2d
276673,,pageview,F,brand_landing,,,,2016-07-08 15:16:07,92dbe9c0659c8c69467204aab38f9846740b7587
278222,,pageview,F,brand_landing,,,,2016-07-11 14:32:51,f4b8f4cf4300b4a4b72548cc565bf149d74a98c5
278328,,pageview,F,brand_landing,,,,2016-07-11 12:37:12,ac4a34ab0b7506c78edb7305573ceb9d376b818e
279245,,pageview,F,brand_landing,,,,2016-07-12 15:07:53,74a5cd0a01ad30659b75795540a99c7961561742


In [14]:
# Preview of order confirmation page views.
raw_data[raw_data['page_type'].eq('confirmation')].head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
62,,pageview,F,confirmation,,,,2016-01-01 21:04:41,21c72d445bc329d710ee021f4439b102eec829cc
73,,pageview,M,confirmation,,,,2016-01-01 16:09:29,2d5f26235f2ee67cc8604ed49eafd7aef799931b
148,,pageview,M,confirmation,,,,2016-01-02 15:28:26,d7ccba57d8b64e61e40d13684205579b3c3e495d
152,,pageview,F,confirmation,,,,2016-01-02 23:19:33,c73dfc34d9bbe0216f9bcb7c2c632b05cb3f379d
167,,pageview,M,confirmation,,,,2016-01-02 01:43:17,d30cda886ece822cd079ab09ace34931bed0962d
187,,pageview,F,confirmation,,,,2016-01-02 12:27:58,488875c5dcc35a34be41d84b05dbe62875044c6c
200,,pageview,M,confirmation,,,,2016-01-02 14:01:36,48528da55b1608d48c40f7722a0ddb2ef09aa8a0
229,,pageview,F,confirmation,,,,2016-01-02 11:42:37,eb7d962ee05b55754e0d3fad066b71b5cfc6e4ab
230,,pageview,F,confirmation,,,,2016-01-02 11:44:42,eb7d962ee05b55754e0d3fad066b71b5cfc6e4ab
242,,pageview,F,confirmation,,,,2016-01-02 20:17:09,a270dff367152eaf1aa2a0763c482dafe8f63df7


In [15]:
# Distinct values of both product columns on confirmation views.
confirmation_rows = raw_data[raw_data['page_type'] == 'confirmation']
for column in ('products', 'productId'):
    print(confirmation_rows[column].unique())

[nan]
[nan]


In [16]:
# Preview of search page views.
raw_data[raw_data['page_type'].eq('search')].head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
14,,pageview,M,search,,,,2016-01-01 22:57:41,116323fae490240dc08989558d2c381589ba8f5a
16,,pageview,M,search,,,,2016-01-01 22:58:02,116323fae490240dc08989558d2c381589ba8f5a
91,,pageview,M,search,,,,2016-01-01 22:55:27,116323fae490240dc08989558d2c381589ba8f5a
102,,pageview,M,search,,,,2016-01-01 22:56:29,116323fae490240dc08989558d2c381589ba8f5a
103,,pageview,M,search,,,,2016-01-01 22:56:43,116323fae490240dc08989558d2c381589ba8f5a
105,,pageview,M,search,,,,2016-01-01 22:57:18,116323fae490240dc08989558d2c381589ba8f5a
156,,pageview,F,search,,,,2016-01-02 17:27:17,eb7d962ee05b55754e0d3fad066b71b5cfc6e4ab
169,,pageview,F,search,,,,2016-01-02 10:49:13,968f6720ca38305576fd6be24868d56194e619f5
188,,pageview,F,search,,,,2016-01-02 12:51:04,01242adffa0b849a3aeb9988a438d51ffc888437
199,,pageview,F,search,,,,2016-01-02 10:55:15,968f6720ca38305576fd6be24868d56194e619f5


In [17]:
# Distinct values of both product columns on search views.
search_rows = raw_data[raw_data['page_type'] == 'search']
for column in ('products', 'productId'):
    print(search_rows[column].unique())

[nan]
[nan]


In [18]:
# Preview of views whose page type is 'other'.
raw_data[raw_data['page_type'].eq('other')].head(30)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
280345,,pageview,M,other,,,,2016-07-13 12:58:04,902ba3cda1883801594b6e1b452790cc53948fda
280372,,pageview,M,other,,,,2016-07-13 13:00:05,902ba3cda1883801594b6e1b452790cc53948fda
280401,,pageview,M,other,,,,2016-07-13 13:00:45,902ba3cda1883801594b6e1b452790cc53948fda
453798,,pageview,M,other,,,,2016-09-02 10:58:27,e385938d6e1f9dba6e74f1ae9de83a0977dca170
505925,,pageview,M,other,,,,2016-09-20 11:26:29,e385938d6e1f9dba6e74f1ae9de83a0977dca170
596550,,pageview,M,other,,,,2016-10-18 15:41:21,85e1c2d321190c02a759d705b8491267b0e37bd2
1103184,,pageview,M,other,,,desktop,2017-03-20 18:34:39,c1c8ac7d9f3df143ddf40990ea5f75ccbd2fe6f8
1110475,,pageview,M,other,,,desktop,2017-03-21 12:47:01,ad95781c0e085c02a7c158e61fc09f7bbe327b65
1113264,,pageview,M,other,,,desktop,2017-03-21 12:51:08,8de9160ecfd086d985e48c4100ed4f07a45762f3


In [19]:
# Row count and distinct values of both product columns for 'other' views.
other_rows = raw_data[raw_data['page_type'] == 'other']
print(len(other_rows))
for column in ('products', 'productId'):
    print(other_rows[column].unique())

9
[nan]
[nan]


In [20]:
# All NaN page_type rows are purchase rows, so label them explicitly.
# The result is assigned back to the column: calling fillna(..., inplace=True) on the
# column accessor is a chained operation that pandas may apply to a temporary object,
# which would leave raw_data unchanged.
raw_data['page_type'] = raw_data['page_type'].fillna('purchase')

In [21]:
# This user shows that 'checkout' views are noise.
is_checkout_only_user = raw_data['uid'].eq('6db5f9338f9fc74889816f1494968155e4622eeb')
rows = raw_data[is_checkout_only_user]
rows.head(10)

Unnamed: 0,date,event_type,gender,page_type,productId,products,source,timestamp,uid
15,,pageview,F,checkout,,,,2016-01-01 22:57:50,6db5f9338f9fc74889816f1494968155e4622eeb
17,,pageview,F,checkout,,,,2016-01-01 22:58:05,6db5f9338f9fc74889816f1494968155e4622eeb
104,,pageview,F,checkout,,,,2016-01-01 22:56:56,6db5f9338f9fc74889816f1494968155e4622eeb


In [None]:
# Rows assumed to be noise views are dropped, in order to only work with meaningful data
# (and reduce the size of the data). All page types are removed in a single pass
# instead of one scan-and-drop per page type.
NOISE_PAGE_TYPES = [
    'cart',
    'confirmation',
    'other',
    'checkout',
    # These page types might be useful to determine how much the shopper browses around.
    'home',
    'brand_landing',
    'search',
    # These page types might be useful; have to test the score and revisit later.
    'category',
    'subcategory',
]
raw_data.drop(raw_data.index[raw_data['page_type'].isin(NOISE_PAGE_TYPES)], inplace=True)
print(raw_data.page_type.unique())
print(len(raw_data.index))

# For now the page types flagged above as possibly useful are ignored as well, to see
# what score that gives. This section might be revisited later.

['product' 'purchase']
569580


In [None]:
# Aggregate the cleaned click data into one record per user.
# groupby splits the frame once; the previous per-uid boolean filter rescanned the whole
# frame for every user. sort=False keeps users in order of first appearance, like unique().

processed_dict = {}
count = 0
total_len = raw_data['uid'].nunique()

for user_id, user_data in raw_data.groupby('uid', sort=False):
    user_dict = {}
    user_dict['gender'] = user_data.gender.unique()[0]  # first gender value seen for this user
    user_dict['products_browsed'] = get_user_browsed_products(user_data.loc[user_data.page_type == 'product'])
    user_dict['products_purchased'] = get_user_purchased_products(user_data.loc[user_data.event_type == 'purchase'])

    processed_dict[user_id] = user_dict

    # In-place progress indicator (percentage of users processed).
    count += 1
    sys.stdout.write('\r' + '%0.2f' % ((count / total_len) * 100) + '%')
    sys.stdout.flush()

42.69%

In [None]:
# Turn the per-user dict into a DataFrame (one row per dict key) and check it for
# errors in the data processing.

processed_data = pd.DataFrame.from_dict(data=processed_dict, orient='index')
processed_data.head(10)

In [None]:
# Load the catalog so its data can be used to better understand the user's behaviour.
# NOTE(review): 'step1_data.csv' is not written anywhere in the visible notebook;
# presumably it is an export of the per-user frame built earlier -- confirm.

catalog = pd.read_csv('processed_catalog.csv', sep=';').drop('Unnamed: 0', axis=1)
processed_data = pd.read_csv('step1_data.csv', sep=';')

In [None]:
# Learning about purchase habits: one row of spending features per user.
# The catalog slice is loop-invariant, so it is built once and actually passed on.
catalog_prices = catalog[['pid', 'current_price', 'is_sale', 'cluster']]

# Every uid must identify exactly one row; the previous per-uid `.item()` lookup
# raised ValueError otherwise, so keep failing loudly.
if not processed_data['uid'].is_unique:
    raise ValueError('processed_data must contain exactly one row per uid')

new_list = []
count = 0
total_len = len(processed_data.uid.unique())

# Walk the two columns in lockstep instead of re-filtering the frame for every user.
for user_id, purchase_list in zip(processed_data['uid'], processed_data['products_purchased']):
    avg_spending, likes_sales, favorite_cluster = get_spending_habits(purchase_list, catalog_prices)
    new_list.append(
        {
            'uid' : user_id,
            'avg_spending' : avg_spending,
            'likes_sales' : likes_sales,
            'favorite_purchase_cluster' : favorite_cluster
        }
    )

    # In-place progress indicator (percentage of users processed).
    count += 1
    sys.stdout.write('\r' + '%0.2f' % ((count / total_len) * 100) + '%')
    sys.stdout.flush()

new_df = pd.DataFrame(new_list)
new_df.head(10)

In [None]:
# Attach the per-user spending features to the processed data (matched on uid).
spending_by_uid = new_df.set_index('uid')
joined_data = processed_data.join(spending_by_uid, on='uid')
joined_data.head(10)

In [None]:
# Learning about browsing habits: favourite browsed cluster per user.
# A separate name is used for the two-column slice so `catalog` keeps all its columns
# and the purchase-habits cell can still be re-run afterwards.
catalog_clusters = catalog[['pid', 'cluster']]

# Every uid must identify exactly one row; the previous per-uid `.item()` lookup
# raised ValueError otherwise, so keep failing loudly.
if not processed_data['uid'].is_unique:
    raise ValueError('processed_data must contain exactly one row per uid')

new_list = []
count = 0
total_len = len(processed_data.uid.unique())

# Walk the two columns in lockstep instead of re-filtering the frame for every user.
for user_id, browse_list in zip(processed_data['uid'], processed_data['products_browsed']):
    favorite_cluster = get_browsing_habits(browse_list, catalog_clusters)
    new_list.append(
        {
            'uid' : user_id,
            'favorite_browse_cluster' : favorite_cluster
        }
    )

    # In-place progress indicator (percentage of users processed).
    count += 1
    sys.stdout.write('\r' + '%0.2f' % ((count / total_len) * 100) + '%')
    sys.stdout.flush()

new_df = pd.DataFrame(new_list)
new_df.head(10)

In [None]:
# Attach the per-user browsing feature to the joined data (matched on uid).
browsing_by_uid = new_df.set_index('uid')
final_data = joined_data.join(browsing_by_uid, on='uid')
final_data.head(10)

In [None]:
# Index by uid; guarded so re-running the cell does not fail once uid is already the index.
if 'uid' in final_data.columns:
    final_data.set_index('uid', inplace=True)

# Encode likes_sales numerically so XGBoost can work with it.
# get_spending_habits returns the bools True/False or the int -1, while a CSV round-trip
# yields the strings 'True'/'False'/'-1'. Normalising to str first lets one mapping cover
# both; with string-only keys applied to bool values every row became NaN.
# '1' and '0' keep the cell idempotent on re-run.
likes_sales_codes = {'True': 1, 'False': 0, '-1': -1, '1': 1, '0': 0}
final_data['likes_sales'] = final_data['likes_sales'].astype(str).map(likes_sales_codes)
final_data.head(10)

In [None]:
# Evaluating the model with 5-fold cross-validation, then fitting it on all labelled data.
# Chosen classifier is XGBoost, because of its strong performance in online competitions (and dev familiarity with it)
# NOTE(review): X keeps every column of final_data except 'gender'; confirm that no
# non-numeric columns (e.g. the product lists) remain before fitting.

X = final_data.drop('gender', axis=1)
Y = final_data['gender']
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # fixed seed so the folds are reproducible
model = XGBClassifier(booster='gbtree', eval_metric="mlogloss")

# cross_val_score already performs the split/fit/score cycle for every fold on clones of
# `model`, so it runs once. Calling it inside a manual KFold loop repeated the whole
# 5-fold evaluation five times and discarded the per-fold fits.
results = cross_val_score(model, X, Y, cv=kf)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Fit the model used by the following cells on the full data set rather than on
# whichever training split happened to come last.
model.fit(X, Y)

In [None]:
# Feature importance graph of the fitted model.

fig, ax1 = plt.subplots(nrows=1, figsize=(15, 10))
xgboost.plot_importance(model, ax=ax1)
plt.show()

In [None]:
# Load the rows to predict, indexed by the 'Unnamed: 0' column of the file.
target_data = pd.read_csv('target_final_data.csv', sep=';').set_index('Unnamed: 0')
target_data.head(10)

In [None]:
# Predict a gender for every target row and pair it with the row's index value.
y_pred = model.predict(target_data)
final_prediction = pd.DataFrame(
    {'userId': target_data.index, 'gender': y_pred},
    columns=['userId', 'gender'],
)
final_prediction.head(10)

In [None]:
# Final step: write the predictions as a comma-separated file without the index column.
final_prediction.to_csv('prediction.csv', index=False)