### LIBRARIES

In [1]:
import time
import numpy as np
import pandas as pd
import datetime as dt

from matplotlib import pyplot as plt
import seaborn as sns

import scipy.stats as st
import empiricaldist as emd
import thinkstats as ts
import utils as ut

from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score, precision_recall_curve, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV

import warnings

In [2]:
# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Display settings: show every column and every row, widen the console
# output to 150 characters, and render floats with three decimals.
pd.set_option('display.float_format', lambda value: '%.3f' % value)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 150)

### CLEANING AND VALIDATION

In [3]:
# Load the raw transactions, parsing InvoiceDate into datetimes on read.
original_data = pd.read_csv(
    'ecom_data.csv',
    encoding='unicode_escape',
    parse_dates=['InvoiceDate'],
)

# Work on a copy so the untouched raw frame stays available as original_data.
df = original_data.copy()
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Earliest and latest invoice timestamps in the data.
print(*df['InvoiceDate'].agg(['min', 'max']))

2010-12-01 08:26:00 2011-12-09 12:50:00


In [5]:
# Column dtypes and non-null counts; Description and CustomerID contain nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
# Number of missing values per column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [7]:
# Summary statistics with extra tail percentiles, transposed to one row per column.
df.describe(percentiles=[.003, .01, .025, .05, .975, .99, .997]).T

Unnamed: 0,count,mean,min,0.3%,1%,2.5%,5%,50%,97.5%,99%,99.7%,max,std
Quantity,541909.0,9.552,-80995.000,-22.000,-2.000,1.000,1.000,3.000,48.000,100.000,240.000,80995.000,218.081
InvoiceDate,541909.0,2011-07-04 13:34:57.156386048,2010-12-01 08:26:00,2010-12-01 14:32:00,2010-12-03 11:13:00,2010-12-06 16:57:00,2010-12-13 09:35:00,2011-07-19 17:17:00,2011-12-05 17:24:00,2011-12-08 09:28:00,2011-12-09 08:39:00,2011-12-09 12:50:00,
UnitPrice,541909.0,4.611,-11062.060,0.000,0.190,0.390,0.420,2.080,12.750,18.000,39.950,38970.000,96.76
CustomerID,406829.0,15287.691,12346.000,12362.000,12415.000,12476.000,12626.000,15152.000,18109.000,18212.000,18262.000,18287.000,1713.6


In [8]:
# Inspect the rows whose unit price is negative.

df[df['UnitPrice'].lt(0)].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
299983,A563186,B,Adjust bad debt,1,2011-08-12 14:51:00,-11062.06,,United Kingdom
299984,A563187,B,Adjust bad debt,1,2011-08-12 14:52:00,-11062.06,,United Kingdom


In [9]:
# The 'A' prefix on these InvoiceNo values probably stands for "adjust".
# Drop the negative-price rows and print the row count before and after.

print(df.shape[0])

df = df[df['UnitPrice'].ge(0)]

print(df.shape[0])

541909
541907


In [10]:
# What does a unit price of exactly zero mean? Look at a sample of those rows.

df[df['UnitPrice'].eq(0)].head(15)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,2010-12-01 11:52:00,0.0,,United Kingdom
1970,536545,21134,,1,2010-12-01 14:32:00,0.0,,United Kingdom
1971,536546,22145,,1,2010-12-01 14:33:00,0.0,,United Kingdom
1972,536547,37509,,1,2010-12-01 14:33:00,0.0,,United Kingdom
1987,536549,85226A,,1,2010-12-01 14:34:00,0.0,,United Kingdom
1988,536550,85044,,1,2010-12-01 14:34:00,0.0,,United Kingdom
2024,536552,20950,,1,2010-12-01 14:34:00,0.0,,United Kingdom
2025,536553,37461,,3,2010-12-01 14:35:00,0.0,,United Kingdom
2026,536554,84670,,23,2010-12-01 14:35:00,0.0,,United Kingdom
2406,536589,21777,,-10,2010-12-01 16:50:00,0.0,,United Kingdom


In [11]:
# Zero-price rows per customer, counting rows with a missing CustomerID too.
df.loc[df['UnitPrice'].eq(0), 'CustomerID'].value_counts(dropna=False)

CustomerID
NaN          2475
13081.000       4
14646.000       4
14911.000       2
13985.000       2
12415.000       2
16560.000       1
15107.000       1
13239.000       1
13113.000       1
12457.000       1
14410.000       1
17667.000       1
16818.000       1
17560.000       1
12647.000       1
15581.000       1
12507.000       1
12748.000       1
16133.000       1
12446.000       1
18059.000       1
14110.000       1
15804.000       1
12437.000       1
12431.000       1
13014.000       1
15602.000       1
12603.000       1
12444.000       1
16406.000       1
13256.000       1
Name: count, dtype: int64

In [12]:
# These rows are hard to interpret; most of them do not even have a CustomerID.

df[df['UnitPrice'].eq(0)].shape

(2515, 8)

In [13]:
# Drop the zero-price rows as well: there is not enough information here
# to interpret them at first glance.

df = df[df['UnitPrice'].gt(0)]

df.shape

(539392, 8)

In [14]:
# Look at invoices whose number starts with 'C' -- presumably "cancel".
# A prefix match is used instead of a regex substring search, so only a
# leading 'C' qualifies; this matches the startswith('C') check in the next cell.

df.loc[df['InvoiceNo'].str.startswith('C'), :].head(30)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
238,C536391,21980,PACK OF 12 RED RETROSPOT TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
239,C536391,21484,CHICK GREY HOT WATER BOTTLE,-12,2010-12-01 10:24:00,3.45,17548.0,United Kingdom
240,C536391,22557,PLASTERS IN TIN VINTAGE PAISLEY,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
241,C536391,22553,PLASTERS IN TIN SKULLS,-24,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
939,C536506,22960,JAM MAKING SET WITH JARS,-6,2010-12-01 12:38:00,4.25,17897.0,United Kingdom


In [15]:
# Are there any 'C'-prefixed invoices with a non-negative quantity?
df[df['InvoiceNo'].str.startswith('C') & df['Quantity'].ge(0)].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [16]:
# Earliest and latest timestamps among 'C'-prefixed (cancelled) invoices.
# The mask is built once with a prefix match, and both extremes come from
# a single aggregation instead of two separately filtered lookups.
print(*df.loc[df['InvoiceNo'].str.startswith('C'), 'InvoiceDate'].agg(['min', 'max']))

2010-12-01 09:41:00 2011-12-09 11:58:00


When creating variables, I need to make sure that the data behind each one would actually have been available at that point in time, i.e. that chronology is respected. Rather than dealing with that immediately, and to set a baseline, I may start by creating variables based on invoices.

In [17]:
# Cancelled orders are already identifiable by the 'C' prefix on InvoiceNo,
# so store Quantity as a magnitude. UnitPrice needs no change: only rows
# with UnitPrice > 0 were kept by the earlier filter.
# assign() builds a new frame instead of writing a column into a frame that
# came from boolean filtering, so no chained-assignment warning is involved
# and re-running the cell gives the same result.

df = df.assign(Quantity=df['Quantity'].abs())

# Sanity check: no negative quantities or unit prices should remain (expect an empty frame).
df.loc[(df['Quantity'] < 0) | (df['UnitPrice'] < 0), :].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
