In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from warnings import filterwarnings
filterwarnings("ignore")
import parquet
from fastparquet import write, ParquetFile
import matplotlib.pyplot as plt 
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
train = pd.read_parquet("training.snappy.parquet")
test = pd.read_parquet("test.snappy.parquet")
test = test[["date","referrer_url","current_url","page_type","product_price","cart_amount",
            "userid","sessionid","search_words","old_product_price","product_category","page_category","productid"]]

In [3]:
train.head(3)

Unnamed: 0,date,referrer_url,current_url,page_type,product_price,cart_amount,userid,sessionid,search_words,old_product_price,product_category,page_category,productid
0,2020-02-06 09:11:12,9d206cbddfddaa97c20690adcd6ea802,006bde08100ccffb2fa8884e700a49f8,category,,0.0,3649149be6ac6b2ded1ae8e5778e3f40,f67ed2ac99c1074b9f19cfe250db7f66,,,,[6408EC06126E8040270C7E8AAF8FC685],
1,2020-02-06 09:31:58,9d206cbddfddaa97c20690adcd6ea802,0da33cc234a95c72475b08138062184f,category,,0.0,4972b982adf86f4f2949208f780bc7fa,726c29cc12d35d7d1bd6b9807ef80d02,,,,[4B6DE4B085D3099FF02A6E1BCDC5D80A],
2,2020-02-06 09:06:52,d41d8cd98f00b204e9800998ecf8427e,9d206cbddfddaa97c20690adcd6ea802,main,,0.0,933513a2ab574affb218ec782411efe4,7f363cc0d075b82cd23cf9894b51a76b,,,,,


In [4]:
test.head(3)

Unnamed: 0,date,referrer_url,current_url,page_type,product_price,cart_amount,userid,sessionid,search_words,old_product_price,product_category,page_category,productid
0,2020-03-21 02:03:50,fb79c9fb058c55d599b2c67a9cfe8d48,9d206cbddfddaa97c20690adcd6ea802,main,,0.0,2b8467d8dffc4f420052ae3feddf0d4a,78a441abe18f03758ccfa1a65d055841,,,,,
1,2020-03-21 02:57:09,3d66a2183ccd73b86b7c9c9ba7ec82dd,e195aa19f28bc7835aa1a0bf295df2f1,category,,0.0,23b916225b742ba69265f64fa2032144,8569057a87f6c616dfdac8ff35d47cb0,,,,[6A4D46B89EBF7FCEED0CC74F00B89D64],
2,2020-03-21 02:22:08,4f8def2167aee81e50396674500e7367,e4c3c725ffb3f969c09d87183d8a93c8,category,,0.0,09c847566aa9ca324f8d1c69d9414d74,d651eb8518bda8419433ce3e4f61a438,,,,[EB45EC8149B8EAE211DD8BEDD560E92B],


In [5]:
testid = test["userid"]
test = test.drop(["userid"],axis=1)


In [6]:
train['date'] =  pd.to_datetime(train['date']).dt.normalize()

In [7]:
grouped = train.groupby('userid')['date']
train['second_lowest'] = grouped.transform(lambda x: x.nsmallest(2).max())

In [8]:
train = train.sort_values("date",ascending=False)
train2 = train.drop_duplicates(subset="userid",keep="last")
train2.shape


(211943, 14)

In [9]:
train2["diff"] = (train2["second_lowest"] - train2["date"]).dt.days

In [10]:
def f(row):
    if row["diff"] >= 8 :
        val = 1
    else:
        val=0
    return val

train2["churn"] = train2.apply(f,axis=1)

In [11]:
train2.churn.value_counts()

0    208118
1      3825
Name: churn, dtype: int64

In [12]:
train2_not_churn = train2[train2.churn == 0]
train2_churn = train2[train2.churn == 1]

In [14]:
from sklearn.utils import resample


train2_churn_upsampled = resample(train2_churn, 
                                 replace=True,     
                                 n_samples=100000)

In [15]:
new_train = pd.concat([train2_not_churn, train2_churn_upsampled])

In [16]:
new_train.churn.value_counts()

0    208118
1    100000
Name: churn, dtype: int64

In [17]:
fulldata = pd.concat([new_train,test],axis=0)

In [18]:
fulldata.head(2)

Unnamed: 0,date,referrer_url,current_url,page_type,product_price,cart_amount,userid,sessionid,search_words,old_product_price,product_category,page_category,productid,second_lowest,diff,churn
1202972,2020-02-27 00:00:00,434d75e1c05e2d5ac407296e521f2613,649d4b6c53ab2843a5b0e199ddea400c,main,,0.0,406867f4d13f4d14613460c509a7e055,60309670f8c12949ccac9895817f78be,,,,,,2020-02-27,0.0,0.0
1203071,2020-02-27 00:00:00,a0ccd46ef8dcd2416cb49dec3127dc72,aa5b6de2967900460ede9e2c0109e99e,category,,0.0,2960428483b0724fad1b94f681b20b09,59457aec6ae8d73533959c567addb385,,,,[B328A4F89E3374F0E4A7F823E6752E86],,2020-02-27,0.0,0.0


In [19]:
dummies = pd.get_dummies(fulldata["page_type"])
fulldata = fulldata.drop(["page_type"],axis=1)
fulldata = fulldata.drop(["search_words","product_category","page_category","productid","referrer_url","current_url","sessionid","second_lowest","diff"],axis=1)

In [158]:
fulldata.head(2)

Unnamed: 0,date,product_price,cart_amount,userid,old_product_price,churn
1202972,2020-02-27 00:00:00,5284000000000.0,0.0,406867f4d13f4d14613460c509a7e055,113773400.0,0.0
1203071,2020-02-27 00:00:00,5284000000000.0,0.0,2960428483b0724fad1b94f681b20b09,113773400.0,0.0


In [20]:
fulldata.isnull().sum()

date                      0
product_price        360252
cart_amount             285
userid               177589
old_product_price    364098
churn                177589
dtype: int64

In [21]:
fulldata.dtypes

date                  object
product_price        float64
cart_amount          float64
userid                object
old_product_price    float64
churn                float64
dtype: object

In [22]:
numeric_columns=fulldata.select_dtypes(include = ["int64","float64"])
for column in numeric_columns:
      fulldata[column].fillna(fulldata[column].median(),inplace=True)


In [23]:
fulldata.isnull().sum()

date                      0
product_price             0
cart_amount               0
userid               177589
old_product_price         0
churn                     0
dtype: int64

In [24]:
fulldata.drop(["date","userid"],axis=1,inplace=True)

In [25]:
fulldata.isnull().sum()

product_price        0
cart_amount          0
old_product_price    0
churn                0
dtype: int64

In [26]:
test.head(2)

Unnamed: 0,date,referrer_url,current_url,page_type,product_price,cart_amount,sessionid,search_words,old_product_price,product_category,page_category,productid
0,2020-03-21 02:03:50,fb79c9fb058c55d599b2c67a9cfe8d48,9d206cbddfddaa97c20690adcd6ea802,main,,0.0,78a441abe18f03758ccfa1a65d055841,,,,,
1,2020-03-21 02:57:09,3d66a2183ccd73b86b7c9c9ba7ec82dd,e195aa19f28bc7835aa1a0bf295df2f1,category,,0.0,8569057a87f6c616dfdac8ff35d47cb0,,,,[6A4D46B89EBF7FCEED0CC74F00B89D64],


In [27]:
train = new_train.drop(["userid"],axis=1)


In [28]:
train.head(2)

Unnamed: 0,date,referrer_url,current_url,page_type,product_price,cart_amount,sessionid,search_words,old_product_price,product_category,page_category,productid,second_lowest,diff,churn
1202972,2020-02-27,434d75e1c05e2d5ac407296e521f2613,649d4b6c53ab2843a5b0e199ddea400c,main,,0.0,60309670f8c12949ccac9895817f78be,,,,,,2020-02-27,0,0
1203071,2020-02-27,a0ccd46ef8dcd2416cb49dec3127dc72,aa5b6de2967900460ede9e2c0109e99e,category,,0.0,59457aec6ae8d73533959c567addb385,,,,[B328A4F89E3374F0E4A7F823E6752E86],,2020-02-27,0,0


In [29]:
df = pd.concat([fulldata,dummies],axis=1)

In [30]:
train_len = len(new_train)
train = df[:train_len]
test = df[train_len:]


In [31]:
train.head(2)

Unnamed: 0,product_price,cart_amount,old_product_price,churn,cart,category,main,other,productDetail,success
1202972,350000.0,0.0,415000.0,0.0,0,0,1,0,0,0
1203071,350000.0,0.0,415000.0,0.0,0,1,0,0,0,0


In [32]:
test.head(2)

Unnamed: 0,product_price,cart_amount,old_product_price,churn,cart,category,main,other,productDetail,success
0,350000.0,0.0,415000.0,0.0,0,0,1,0,0,0
1,350000.0,0.0,415000.0,0.0,0,1,0,0,0,0


In [33]:
test.drop(["churn"],axis=1,inplace=True)

In [34]:
y = train.churn.values
x = train.drop(["churn"],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)


In [36]:
xgb = XGBClassifier().fit(x_train,y_train)
lightgbm = LGBMClassifier().fit(x_train,y_train)
randomforest = RandomForestClassifier().fit(x_train,y_train)
neural = MLPClassifier().fit(x_train,y_train)
modeller = [xgb,lightgbm,randomforest,neural]

for model in modeller:
    isimler = model.__class__.__name__
    pred = model.predict(x_test)
    acc = accuracy_score(pred,y_test)
    print(isimler + " ---> " + " accuracy : {:.2%} ".format(acc))



XGBClassifier --->  accuracy : 68.03% 
LGBMClassifier --->  accuracy : 68.02% 
RandomForestClassifier --->  accuracy : 68.04% 
MLPClassifier --->  accuracy : 34.27% 
