# Causal Story of E-Commerce Churn out

**Markov Property Validation**
The Markov property assumes that the d-separations of a graph correspond to conditional independence relations in the data. In other words, if two nodes are d-separated in the graph given some set of nodes, then the corresponding variables are conditionally independent in the data given that set.

In [2]:
# Logging setup: leave already-created loggers enabled and put the root
# logger (the '' entry) at INFO level.
import logging.config

DEFAULT_LOGGING = dict(
    version=1,
    disable_existing_loggers=False,
    loggers={
        '': dict(level='INFO'),
    },
)
logging.config.dictConfig(DEFAULT_LOGGING)

# Warning setup: ignore these categories so they do not clutter cell output.
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning

for _ignored_category in (DataConversionWarning, ConvergenceWarning, UserWarning):
    warnings.filterwarnings(action='ignore', category=_ignored_category)


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dowhy
import matplotlib.pyplot as plt

In [4]:
# Load the raw customer data from the CSV file in the working directory.
data = pd.read_csv("EComm.csv")

In [5]:
# Display the column names available in the loaded frame.
data.columns

Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
       'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
       'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
       'MaritalStatus', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
       'DaySinceLastOrder', 'CashbackAmount'],
      dtype='object')

In [6]:
# Count the missing values in each column.
data.isna().sum()

CustomerID                       0
Churn                            0
Tenure                         264
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                251
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 255
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    265
CouponUsed                     256
OrderCount                     258
DaySinceLastOrder              307
CashbackAmount                   0
dtype: int64

In [7]:
# Flag customers whose satisfaction score is 1 or below.
# The comparison is evaluated column-wise and yields a boolean column.
data['HighlyDissatisfied'] = data['SatisfactionScore'] <= 1

In [8]:
# Remove the identifier and the raw score (already encoded as
# HighlyDissatisfied), then discard every row that has a missing value.
data.drop(columns=['CustomerID', 'SatisfactionScore'], inplace=True)
data.dropna(axis=0, how='any', inplace=True)
data.head()

Unnamed: 0,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,HighlyDissatisfied
0,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,Single,9,1,11.0,1.0,1.0,5.0,160,False
3,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,Single,8,0,23.0,0.0,1.0,3.0,134,False
5,1,0.0,Computer,1,22.0,Debit Card,Female,3.0,5,Mobile Phone,Single,2,1,22.0,4.0,6.0,7.0,139,False
11,1,11.0,Mobile Phone,1,6.0,Debit Card,Male,3.0,4,Fashion,Single,10,1,13.0,0.0,1.0,0.0,154,False
12,1,0.0,Phone,1,11.0,COD,Male,2.0,3,Mobile,Single,2,1,13.0,2.0,2.0,2.0,134,False


In [27]:
import pandas as pd
import numpy as np
from pgmpy.base.DAG import DAG
from pgmpy.estimators.CITests import chi_square
from pgmpy.independencies import IndependenceAssertion

# Hypothesised causal structure, written as (cause, effect) pairs.
causal_edges = [
    ('HighlyDissatisfied', 'Churn'),
    ('Complain', 'Churn'),
    ('Complain', 'HighlyDissatisfied'),
    ('DaySinceLastOrder', 'Churn'),
    ('DaySinceLastOrder', 'OrderCount'),
    ('CityTier', 'OrderCount'),
    ('CouponUsed', 'OrderCount'),
    ('CouponUsed', 'OrderAmountHikeFromlastYear'),
    ('OrderCount', 'OrderAmountHikeFromlastYear'),
    ('OrderAmountHikeFromlastYear', 'Churn'),
    ('Tenure', 'OrderAmountHikeFromlastYear'),
    ('WarehouseToHome', 'CityTier'),
    # NOTE(review): this edge is disabled; the data column is spelled
    # 'CashbackAmount', so the name below would need correcting if re-enabled.
    # ('CashBackAmount', 'OrderCount'),
    ('PreferredLoginDevice', 'HourSpendOnApp'),
    ('HourSpendOnApp', 'OrderCount'),
    ('NumberOfAddress', 'NumberOfDeviceRegistered'),
    ('NumberOfDeviceRegistered', 'OrderCount'),
]

# Create the DAG and load the edges into it.
G = DAG()
G.add_edges_from(causal_edges)

# Collect the independence assertions (d-separations) implied by the graph.
dseps = G.get_independencies()


In [28]:
%%time
# Run Chi-squared tests for independence
significance = .05


def test_dsep(dsep: IndependenceAssertion):
    """Chi-square test every variable pair covered by one independence assertion.

    The assertion is expanded into each (X, Y) pair drawn from its first two
    event sets. Each pair is tested on the notebook-level ``data`` frame with
    ``chi_square``, conditioning on the assertion's third set and using the
    notebook-level ``significance`` as the significance level.

    Args:
        dsep: The independence assertion (d-separation statement) to check.

    Returns:
        A list of ``(IndependenceAssertion(X, Y, Z), test_result)`` tuples,
        one per (X, Y) pair. ``test_result`` is the value returned by
        ``chi_square(..., boolean=True)``; in this notebook a truthy result
        is read as "the pair is independent in the data".
    """
    # The assertion's components do not change while iterating, so fetch
    # them once instead of calling get_assertion() inside both loops.
    assertion = dsep.get_assertion()
    first_vars = list(assertion[0])
    second_vars = list(assertion[1])
    conditioning = assertion[2]

    test_outputs = []
    for X in first_vars:
        for Y in second_vars:
            # Fresh list per test so no call can see another call's list.
            Z = list(conditioning)
            test_result = chi_square(X=X, Y=Y, Z=Z, data=data, boolean=True, significance_level=significance)
            test_outputs.append((IndependenceAssertion(X, Y, Z), test_result))
    return test_outputs

# Test every implied assertion and pool the (assertion, outcome) pairs.
results_flat = []
for dsep in dseps.get_assertions():
    results_flat.extend(test_dsep(dsep))

# Index the outcomes by assertion.
results = dict(results_flat)

# Number of assertions whose test returned True, and total number tested.
sum(results.values()), len(results)

CPU times: user 1h 19min 21s, sys: 10.3 s, total: 1h 19min 31s
Wall time: 1h 19min 27s


(14701, 151360)

### Checking the results where the chi-square test returns True
If the test result is True, the two d-separated variables are independent in the data; otherwise, they are related.

In [29]:
# Print each assertion whose test returned a truthy result; the remaining
# assertions are gathered in `tocheck` for follow-up.
for assertion, is_independent in results.items():
    if is_independent:
        print(assertion)

tocheck = [assertion for assertion, is_independent in results.items() if not is_independent]

(OrderCount ⟂ Tenure)
(OrderCount ⟂ Complain)
(OrderCount ⟂ HighlyDissatisfied)
(OrderCount ⟂ Complain | Tenure)
(OrderCount ⟂ Tenure | DaySinceLastOrder)
(OrderCount ⟂ Complain | DaySinceLastOrder)
(OrderCount ⟂ Tenure | NumberOfDeviceRegistered)
(OrderCount ⟂ Complain | NumberOfDeviceRegistered)
(OrderCount ⟂ NumberOfAddress | NumberOfDeviceRegistered)
(OrderCount ⟂ Tenure | Complain)
(OrderCount ⟂ Tenure | HourSpendOnApp)
(OrderCount ⟂ Complain | HourSpendOnApp)
(OrderCount ⟂ PreferredLoginDevice | HourSpendOnApp)
(OrderCount ⟂ Tenure | HighlyDissatisfied)
(OrderCount ⟂ Complain | HighlyDissatisfied)
(OrderCount ⟂ Tenure | NumberOfAddress)
(OrderCount ⟂ Complain | NumberOfAddress)
(OrderCount ⟂ Tenure | CityTier)
(OrderCount ⟂ Complain | CityTier)
(OrderCount ⟂ WarehouseToHome | CityTier)
(OrderCount ⟂ Tenure | WarehouseToHome)
(OrderCount ⟂ Complain | WarehouseToHome)
(OrderCount ⟂ Tenure | PreferredLoginDevice)
(OrderCount ⟂ Complain | PreferredLoginDevice)
(OrderCount ⟂ Tenure | 

In [None]:
# Display the assertions whose chi-square test did not return True.
tocheck

[(OrderCount ⟂ HighlyDissatisfied | Tenure),
 (OrderCount ⟂ HighlyDissatisfied | DaySinceLastOrder),
 (OrderCount ⟂ HighlyDissatisfied | NumberOfDeviceRegistered),
 (OrderCount ⟂ HighlyDissatisfied | Complain),
 (OrderCount ⟂ HighlyDissatisfied | HourSpendOnApp),
 (OrderCount ⟂ Complain | OrderAmountHikeFromlastYear),
 (OrderCount ⟂ HighlyDissatisfied | OrderAmountHikeFromlastYear),
 (OrderCount ⟂ HighlyDissatisfied | NumberOfAddress),
 (OrderCount ⟂ HighlyDissatisfied | CityTier),
 (OrderCount ⟂ HighlyDissatisfied | WarehouseToHome),
 (OrderCount ⟂ HighlyDissatisfied | PreferredLoginDevice),
 (OrderCount ⟂ HighlyDissatisfied | CouponUsed),
 (OrderCount ⟂ Complain | Tenure, DaySinceLastOrder),
 (OrderCount ⟂ HighlyDissatisfied | Tenure, DaySinceLastOrder),
 (OrderCount ⟂ Complain | Tenure, NumberOfDeviceRegistered),
 (OrderCount ⟂ NumberOfAddress | Tenure, NumberOfDeviceRegistered),
 (OrderCount ⟂ HighlyDissatisfied | Tenure, NumberOfDeviceRegistered),
 (OrderCount ⟂ HighlyDissatisfied

In [None]:
# Save every tested assertion together with its chi-square outcome.
import csv

# UTF-8 is set explicitly because the assertion text contains the '⟂' symbol;
# newline='' lets the csv writer control line endings itself.
with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes fields as needed: assertion text can contain commas
    # (e.g. "A ⟂ B | C, D"), which would otherwise split into extra columns.
    writer = csv.writer(f, lineterminator='\n')
    for key, value in results.items():
        # Non-string values are stringified by the writer.
        writer.writerow([key, value])

### Conclusions:

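*On checking the chi-square results for the d-separated pairs of nodes, we concluded that the generated causal graph doesn't have any conflict.* Note, however, that only 14,701 of the 151,360 tests returned True, so the assertions collected in `tocheck` should be examined before relying on this conclusion.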