In [84]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce
from collections import Counter


In [85]:
df = pd.read_csv("../data/raw/train_8wry4cB.csv")

In [86]:
def standard_datetime(column,format: str ="%d/%m/%y %H:%M"):
    return pd.to_datetime(column , format=format)

In [87]:
def standardize(df,drop_session_id=False):
    df.startTime = standard_datetime(df.startTime)
    df.endTime = standard_datetime(df.endTime)
    df.rename(columns={"ProductList":"product_list","startTime":"start_time","endTime":"end_time"},inplace=True)
    if not drop_session_id:
        df.drop(columns=["session_id"],inplace=True)

    for index,value in enumerate(["cat" , "sub-cat","sub-sub-cat","product"]):
        df[value] = df.product_list.apply(lambda x : [i.split("/")[index] for i in x.split(";")])
    return df

In [88]:
standardize(df)
df.head()

Unnamed: 0,start_time,end_time,product_list,gender,cat,sub-cat,sub-sub-cat,product
0,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,"[A00002, A00002, A00002, A00002]","[B00003, B00003, B00003, B00003]","[C00006, C00006, C00006, C00006]","[D28435, D02554, D28436, D28437]"
1,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,"[A00001, A00001, A00001, A00001, A00001, A0000...","[B00009, B00009, B00009, B00009, B00009, B0000...","[C00031, C00031, C00031, C00031, C00031, C0003...","[D29404, D02617, D29407, D29410, D29411, D2544..."
2,2014-12-01 15:58:00,2014-12-01 15:58:00,A00002/B00001/C00020/D16944/,female,[A00002],[B00001],[C00020],[D16944]
3,2014-11-23 02:57:00,2014-11-23 03:00:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female,"[A00002, A00002, A00002]","[B00004, B00004, B00004]","[C00018, C00018, C00018]","[D10284, D10285, D10286]"
4,2014-12-17 16:44:00,2014-12-17 16:46:00,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male,"[A00001, A00001]","[B00001, B00001]","[C00012, C00012]","[D30805, D30806]"


# Making Datetime feature

In [89]:
def set_day(df):
    weeks_day = ['Monday', 'Friday', 'Saturday', 'Tuesday', 'Sunday', 'Wednesday', 'Thursday']
    a = df.start_time.dt.day_name()
    for i in weeks_day:
        df[i] = (a == i)
    return df

In [90]:
def set_month_start(df):
    df["month_start"] = df.start_time.dt.is_month_start
    return df

In [91]:
def set_weeks(df):
    weeks = [51, 50, 49, 47, 48, 46, 52]
    a = df.start_time.dt.weekofyear
    for i in weeks:
        df[f"week_{i}"] = (a==i)
    return df

In [92]:
def set_interval_time(df , interval="3H"):
    df["st"] = pd.to_datetime(df.start_time.dt.strftime("%H:%M"),format="%H:%M")
    bins = df.groupby(pd.Grouper(key = "st",freq=interval)).count().index
    bins = [(i,j-pd.Timedelta(seconds=1)) for i,j in zip(bins[:-1],bins[1:])] + [(bins[-1] , bins[0]- pd.Timedelta(seconds=1))]
    for i , j in bins:
        df[f"time_{i.time().hour}"] = df.st.between(i,j)
    return df

In [93]:
def set_duration(df):
    a =(df["end_time"] - df["start_time"]).dt.seconds/60
    a[a > a.quantile(0.97)] = a.quantile(0.97) 
    df["duration"] = a
    return df

In [94]:
set_day(df)
set_month_start(df)
set_weeks(df)
set_interval_time(df)
set_duration(df)
df.head()

Unnamed: 0,start_time,end_time,product_list,gender,cat,sub-cat,sub-sub-cat,product,Monday,Friday,...,st,time_0,time_3,time_6,time_9,time_12,time_15,time_18,time_21,duration
0,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,"[A00002, A00002, A00002, A00002]","[B00003, B00003, B00003, B00003]","[C00006, C00006, C00006, C00006]","[D28435, D02554, D28436, D28437]",True,False,...,1900-01-01 18:11:00,False,False,False,False,False,False,True,False,1.0
1,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,"[A00001, A00001, A00001, A00001, A00001, A0000...","[B00009, B00009, B00009, B00009, B00009, B0000...","[C00031, C00031, C00031, C00031, C00031, C0003...","[D29404, D02617, D29407, D29410, D29411, D2544...",False,False,...,1900-01-01 14:35:00,False,False,False,False,True,False,False,False,6.0
2,2014-12-01 15:58:00,2014-12-01 15:58:00,A00002/B00001/C00020/D16944/,female,[A00002],[B00001],[C00020],[D16944],True,False,...,1900-01-01 15:58:00,False,False,False,False,False,True,False,False,0.0
3,2014-11-23 02:57:00,2014-11-23 03:00:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female,"[A00002, A00002, A00002]","[B00004, B00004, B00004]","[C00018, C00018, C00018]","[D10284, D10285, D10286]",False,False,...,1900-01-01 02:57:00,True,False,False,False,False,False,False,False,3.0
4,2014-12-17 16:44:00,2014-12-17 16:46:00,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male,"[A00001, A00001]","[B00001, B00001]","[C00012, C00012]","[D30805, D30806]",False,False,...,1900-01-01 16:44:00,False,False,False,False,False,True,False,False,2.0


In [95]:

df.columns

Index(['start_time', 'end_time', 'product_list', 'gender', 'cat', 'sub-cat',
       'sub-sub-cat', 'product', 'Monday', 'Friday', 'Saturday', 'Tuesday',
       'Sunday', 'Wednesday', 'Thursday', 'month_start', 'week_51', 'week_50',
       'week_49', 'week_47', 'week_48', 'week_46', 'week_52', 'st', 'time_0',
       'time_3', 'time_6', 'time_9', 'time_12', 'time_15', 'time_18',
       'time_21', 'duration'],
      dtype='object')



# Making Product List Feature


In [96]:
def set_product_count(df):
    df["product_count"] = df["product"].apply(len)
    return df

In [97]:
def create_useful_cat_sub_cat(train_df):
    # calculating male and female cat - sub-cat count 
    c_m = Counter()
    c_f = Counter()
    
    for index,rows in train_df.iterrows():
        a = [f"{i}-{j}"for i , j in zip(rows["cat"],rows["sub-cat"])]
        if rows["gender"] == "female":
            c_f.update(a)
        else:
            c_m.update(a)
    
    # cumsum for 97 percentile in female
    f = pd.DataFrame(c_f.items(),columns=["cat","value"]).sort_values(by="value",ascending=False).reset_index(drop=True)
    f["cu"] = (f.value.cumsum()/f.value.sum())
    f = f[f["cu"]<=0.975]
    # cumsum for 97 percentile in male
    m = pd.DataFrame(c_m.items(),columns=["cat","value"]).sort_values(by="value",ascending=False).reset_index(drop=True)
    m["cu"] = (f.value.cumsum()/f.value.sum())
    m = m[m["cu"]<=0.975]
    # merge to get "cat - sub-cat to be used for both of them"
    f = f.merge(m , how="outer",left_on="cat",right_on="cat", suffixes=('_female', '_male'))["cat"].values.tolist()
    
    return f

In [98]:
def cat_sub_cat(df ,cat_sub_cat_list=None):
    if cat_sub_cat_list is None:
        cat_sub_cat_list =  create_useful_cat_sub_cat(df) ## for testing purpose we have to save the cat_sub_cat_list in pickle format
    m = df.apply(lambda x: [f"{i}-{j}" for i ,j in zip(x["cat"],x["sub-cat"])] ,axis=1)
    m = m.apply(lambda x:  dict(Counter(x)))
    for i in cat_sub_cat_list:df[i] = m.apply(lambda x: x.get(i,0))
    return df

In [99]:
def set_duration_product_ratio(df):
    df["duration_product_ratio"] = df["duration"]/df["product_count"]

In [100]:
def cat_flag(df):
    cat_list = reduce(lambda x,y :set(x).union(set(y)) , df.cat)
    for i in cat_list:
        df[i] = df.cat.apply(lambda x: i in x)
    return df

In [101]:
def func(x):
    m_count = f_count=0
    for i in x:
        if i in only_f:
            f_count = f_count + 1
        elif i in only_m:
            m_count = m_count + 1
    if f_count!=0 and m_count==0:
        return 0
    elif m_count!=0  and f_count==0:
        return 1
    else:
        return None

def flag_product(train_df,test_df=None):
    global only_f , only_m
    m_set = reduce(lambda x,y: set(x).union(set(y)), train_df[train_df["gender"]=="male"]["product"])
    f_set = reduce(lambda x,y: set(x).union(set(y)), train_df[train_df["gender"]=="female"]["product"])
    only_f = f_set-m_set
    only_m = m_set-f_set

    if test_df is None:
        test_df=train_df
    flag = ["only_f_product" , "only_m_product"]
#     flag = ["only_f_product"]

    for i,val in enumerate(flag):
        test_df[val] = test_df["product"].apply(func) == i
    return test_df

def flag_sub_sub_cat(train_df,test_df=None):
    global only_f , only_m
    m_set = reduce(lambda x,y: set(x).union(set(y)), train_df[train_df["gender"]=="male"]["sub-sub-cat"])
    f_set = reduce(lambda x,y: set(x).union(set(y)), train_df[train_df["gender"]=="female"]["sub-sub-cat"])
    only_f = f_set-m_set
    only_m = m_set-f_set
    if test_df is None:
        test_df=train_df
    flag = ["only_f_sub_sub_cat" , "only_m_sub_sub_cat"]
#     flag = ["only_f_sub_sub_cat"]

    for i,val in enumerate(flag):
        test_df[val] = test_df["sub-sub-cat"].apply(func) == i
    return test_df

In [102]:
set_product_count(df)
cat_sub_cat(df)
set_duration_product_ratio(df)
cat_flag(df)
flag_product(df)
flag_sub_sub_cat(df)
df.columns

Index(['start_time', 'end_time', 'product_list', 'gender', 'cat', 'sub-cat',
       'sub-sub-cat', 'product', 'Monday', 'Friday', 'Saturday', 'Tuesday',
       'Sunday', 'Wednesday', 'Thursday', 'month_start', 'week_51', 'week_50',
       'week_49', 'week_47', 'week_48', 'week_46', 'week_52', 'st', 'time_0',
       'time_3', 'time_6', 'time_9', 'time_12', 'time_15', 'time_18',
       'time_21', 'duration', 'product_count', 'A00002-B00002',
       'A00002-B00003', 'A00002-B00001', 'A00003-B00012', 'A00002-B00007',
       'A00002-B00004', 'A00003-B00022', 'A00002-B00005', 'A00002-B00016',
       'A00002-B00017', 'A00002-B00006', 'A00003-B00021', 'A00003-B00020',
       'A00001-B00001', 'A00003-B00004', 'A00001-B00009', 'A00003-B00028',
       'A00005-B00019', 'A00002-B00011', 'A00001-B00015', 'A00001-B00004',
       'A00003-B00026', 'A00005-B00032', 'A00004-B00014', 'A00011-B00050',
       'A00001-B00031', 'A00003-B00036', 'A00002-B00010', 'A00006-B00057',
       'A00003-B00039', 'A00005

# Splitting Train and Valid.

In [103]:
# drop these columns
train_df  = df.drop(columns=['start_time', 'end_time', 'product_list','st','cat', 'sub-cat','sub-sub-cat', 'product'])

In [104]:
X = train_df.drop(columns=["gender"])
y = np.where(train_df.gender == "male" , 1 ,0)

In [105]:
from sklearn.model_selection import train_test_split

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

In [107]:
# from tpot import TPOTClassifier
# tpot = TPOTClassifier(generations=2, population_size=2 ,verbosity=2, random_state=42,n_jobs=-1)
# tpot.fit( X_train, y_train)
# print(tpot.score(X_test, y_test))

In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.metrics import f1_score, accuracy_score, classification_report
model = LogisticRegression(max_iter=200)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9717460317460317

In [110]:
test_df = pd.read_csv("../data/raw/test_Yix80N0.csv")
standardize(test_df,drop_session_id=False)
test_df.head()

Unnamed: 0,start_time,end_time,product_list,cat,sub-cat,sub-sub-cat,product
0,2014-12-08 13:36:00,2014-12-08 13:36:00,A00002/B00003/C00006/D19956/,[A00002],[B00003],[C00006],[D19956]
1,2014-12-19 13:52:00,2014-12-19 13:52:00,A00002/B00005/C00067/D02026/,[A00002],[B00005],[C00067],[D02026]
2,2014-12-01 10:44:00,2014-12-01 10:44:00,A00002/B00002/C00004/D12538/,[A00002],[B00002],[C00004],[D12538]
3,2014-12-08 20:19:00,2014-12-08 20:22:00,A00002/B00003/C00079/D22781/;A00002/B00003/C00...,"[A00002, A00002, A00002, A00002]","[B00003, B00003, B00003, B00003]","[C00079, C00079, C00079, C00079]","[D22781, D22782, D19325, D22786]"
4,2014-12-15 19:33:00,2014-12-15 19:33:00,A00002/B00001/C00010/D23419/,[A00002],[B00001],[C00010],[D23419]


In [111]:
set_day(test_df)
set_month_start(test_df)
set_weeks(test_df)
set_interval_time(test_df)
set_duration(test_df)

set_product_count(test_df)
cat_sub_cat(test_df ,cat_sub_cat_list=create_useful_cat_sub_cat(df) )
set_duration_product_ratio(test_df)
cat_flag(test_df)
flag_product(df,test_df)
flag_sub_sub_cat(df,test_df)


test_df.head()

Unnamed: 0,start_time,end_time,product_list,cat,sub-cat,sub-sub-cat,product,Monday,Friday,Saturday,...,A00001,A00007,A00010,A00005,A00011,A00003,only_f_product,only_m_product,only_f_sub_sub_cat,only_m_sub_sub_cat
0,2014-12-08 13:36:00,2014-12-08 13:36:00,A00002/B00003/C00006/D19956/,[A00002],[B00003],[C00006],[D19956],True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2014-12-19 13:52:00,2014-12-19 13:52:00,A00002/B00005/C00067/D02026/,[A00002],[B00005],[C00067],[D02026],False,True,False,...,False,False,False,False,False,False,True,False,False,False
2,2014-12-01 10:44:00,2014-12-01 10:44:00,A00002/B00002/C00004/D12538/,[A00002],[B00002],[C00004],[D12538],True,False,False,...,False,False,False,False,False,False,True,False,False,False
3,2014-12-08 20:19:00,2014-12-08 20:22:00,A00002/B00003/C00079/D22781/;A00002/B00003/C00...,"[A00002, A00002, A00002, A00002]","[B00003, B00003, B00003, B00003]","[C00079, C00079, C00079, C00079]","[D22781, D22782, D19325, D22786]",True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,2014-12-15 19:33:00,2014-12-15 19:33:00,A00002/B00001/C00010/D23419/,[A00002],[B00001],[C00010],[D23419],True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [112]:
test_df  = test_df.drop(columns=['start_time', 'end_time', 'product_list','st','cat', 'sub-cat','sub-sub-cat', 'product'])

In [113]:
test_df.columns

Index(['Monday', 'Friday', 'Saturday', 'Tuesday', 'Sunday', 'Wednesday',
       'Thursday', 'month_start', 'week_51', 'week_50', 'week_49', 'week_47',
       'week_48', 'week_46', 'week_52', 'time_0', 'time_3', 'time_6', 'time_9',
       'time_12', 'time_15', 'time_18', 'time_21', 'duration', 'product_count',
       'A00002-B00002', 'A00002-B00003', 'A00002-B00001', 'A00003-B00012',
       'A00002-B00007', 'A00002-B00004', 'A00003-B00022', 'A00002-B00005',
       'A00002-B00016', 'A00002-B00017', 'A00002-B00006', 'A00003-B00021',
       'A00003-B00020', 'A00001-B00001', 'A00003-B00004', 'A00001-B00009',
       'A00003-B00028', 'A00005-B00019', 'A00002-B00011', 'A00001-B00015',
       'A00001-B00004', 'A00003-B00026', 'A00005-B00032', 'A00004-B00014',
       'A00011-B00050', 'A00001-B00031', 'A00003-B00036', 'A00002-B00010',
       'A00006-B00057', 'A00003-B00039', 'A00005-B00044', 'A00005-B00018',
       'A00001-B00037', 'A00001-B00027', 'A00004-B00013',
       'duration_product_ratio'

In [114]:
sub = pd.read_csv("../data/raw/test_Yix80N0.csv")

In [119]:
sub["gender"]=model.predict(test_df)

In [120]:
sub.drop(columns =["startTime","endTime","ProductList"],inplace=True)

In [121]:
sub.gender = sub.gender.map({0:"female",1:"male"})

In [122]:
sub.to_csv("flag_third_submission.csv",index=False)