In [1]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce

In [2]:
# Load the training data from the CSV shipped alongside the notebook.
df = pd.read_csv("train_8wry4cB.csv")
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
2511,u21698,22/12/14 12:53,22/12/14 12:54,A00002/B00007/C00017/D20066/,female
8253,u23350,12/12/14 22:16,12/12/14 22:16,A00001/B00001/C00019/D25468/;A00001/B00001/C00...,male
1237,u21227,06/12/14 17:12,06/12/14 17:12,A00002/B00003/C00014/D20077/,female
5114,u20939,06/12/14 11:03,06/12/14 11:04,A00003/B00026/C00306/D19266/;A00003/B00026/C00...,female
9562,u23127,16/12/14 10:52,16/12/14 10:57,A00002/B00007/C00021/D28776/;A00002/B00003/C00...,female
7959,u19982,03/12/14 20:40,03/12/14 20:44,A00001/B00001/C00019/D18089/;A00001/B00001/C00...,male
9635,u11404,18/11/14 10:58,18/11/14 10:58,A00001/B00001/C00029/D06605/,female
7575,u17322,12/12/14 11:05,12/12/14 11:09,A00002/B00003/C00005/D17897/;A00002/B00003/C00...,female
3941,u24396,18/12/14 11:18,18/12/14 11:18,A00003/B00022/C00036/D31516/,female
459,u18463,29/11/14 12:58,29/11/14 13:00,A00001/B00009/C00032/D15536/;A00001/B00031/C00...,male


# Check the number of rows and null values

In [3]:
# Use the built-in len() rather than calling the __len__ dunder directly.
print(f"Number of rows {len(df)}")

Number of rows 10500


In [4]:
# Per-column flag: True when the column holds no missing values at all.
~df.isna().any(axis=0)

session_id     True
startTime      True
endTime        True
ProductList    True
gender         True
dtype: bool

Since every column returns True, there are no missing values in the dataset.

# Target distribution

In [5]:
# Absolute number of sessions per target class.
df["gender"].value_counts()

female    8192
male      2308
Name: gender, dtype: int64

In [6]:
# Share of each target class as a percentage of all rows, rounded to 2 decimals.
(df["gender"].value_counts() * 100 / len(df)).round(2)

female    78.02
male      21.98
Name: gender, dtype: float64

# Converting dates to a standard format

In [7]:
def standard_datetime(column, format: str = "%d/%m/%y %H:%M"):
    """Convert textual timestamps into pandas datetime values.

    This is a thin wrapper around ``pd.to_datetime`` that pins the expected
    text layout so every caller parses timestamps the same way.

    Args:
        column (Series or str): The value(s) to parse; anything accepted by
            ``pd.to_datetime`` together with a ``format`` argument.
        format (str): ``strptime``-style pattern describing the input text.
            The default matches day/month/two-digit-year followed by a
            24-hour ``hour:minute`` (e.g. ``"22/12/14 12:53"``).

    Returns:
        The result of ``pd.to_datetime`` for the given input (a datetime
        Series for a Series, a single Timestamp for a single string).

    References:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
    """
    parsed = pd.to_datetime(column, format=format)
    return parsed

In [8]:
# Replace the textual session timestamps with parsed datetime values.
df["startTime"] = standard_datetime(df["startTime"])
df["endTime"] = standard_datetime(df["endTime"])


In [9]:
# Preview the first five rows (head's default) to confirm the parsed timestamps.
df.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,2014-12-01 15:58:00,2014-12-01 15:58:00,A00002/B00001/C00020/D16944/,female
3,u14556,2014-11-23 02:57:00,2014-11-23 03:00:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,2014-12-17 16:44:00,2014-12-17 16:46:00,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


# Working with the ProductList column

In [10]:
# Normalise the camelCase column names to snake_case. Reassigning the result
# (instead of inplace=True) avoids mutating the frame in place.
df = df.rename(
    columns={
        "ProductList": "product_list",
        "startTime": "start_time",
        "endTime": "end_time",
    }
)
df.head(1)

Unnamed: 0,session_id,start_time,end_time,product_list,gender
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female


# Feature Creation

In [11]:
# (number of distinct session ids, number of rows) -- equal values mean every id is unique.
len(df["session_id"].unique()), len(df)

(10500, 10500)

Since every session id is unique (as many distinct ids as rows), the column cannot help separate the classes, so we can remove this feature.

In [12]:
# Drop the identifier column. Reassigning instead of inplace=True avoids mutating
# the frame in place, and errors="ignore" makes re-running this cell a no-op
# rather than a KeyError once the column is already gone.
df = df.drop(columns=["session_id"], errors="ignore")
df.head(1)

Unnamed: 0,start_time,end_time,product_list,gender
0,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female


The product list contains the products viewed by the user in the given session. Each product is encoded as its category, sub-category, sub-sub-category and product id, separated by slashes. Consecutive products are separated by a semicolon.

A00002/B00003/C00006/D28435/

cat/sub-cat/sub-sub-cat/product/

In [13]:
# Number of product entries per session. Entries are ";"-separated, so the count
# equals the number of separators plus one (same value as len(x.split(";"))).
# The vectorised .str accessor replaces the row-wise apply + lambda.
df["pl_count"] = df["product_list"].str.count(";") + 1
df.head(1)

Unnamed: 0,start_time,end_time,product_list,gender,pl_count
0,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4


In [14]:
def unique_cat(index, product_lists=None):
    """Collect the distinct codes found at one level of the product paths.

    Each product list is a ";"-separated string of product paths, and each
    path is a "/"-separated sequence of codes, e.g.
    ``"A00002/B00003/C00006/D28435/"`` (category / sub-category /
    sub-sub-category / product).

    Args:
        index (int): Position of the code inside each path
            (0 = category, 1 = sub-category, 2 = sub-sub-category, 3 = product).
        product_lists (iterable of str, optional): The product-list strings to
            scan. Defaults to the notebook's ``df.product_list`` column, which
            is what the original single-argument call used.

    Returns:
        set: The distinct codes at ``index``; an empty set when there are no
        product lists (the previous ``reduce`` without an initial value raised
        ``TypeError`` in that case).

    Raises:
        IndexError: If a path has fewer than ``index + 1`` "/"-separated parts.
    """
    if product_lists is None:
        product_lists = df.product_list
    # One pass, one set: avoids building a fresh union set for every row.
    return {
        product.split("/")[index]
        for products in product_lists
        for product in products.split(";")
    }

In [15]:
# Distinct codes in the first path position (the top-level category).
unique_cat(index=0)

{'A00001',
 'A00002',
 'A00003',
 'A00004',
 'A00005',
 'A00006',
 'A00007',
 'A00008',
 'A00009',
 'A00010',
 'A00011'}