In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce
import sys, os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder

In [2]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from file_handler import FileHandler
from df_selector import *
from df_cleaner import *
from df_visualizer import *

In [3]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

# Reading Data

In [4]:
file_handler = FileHandler()

In [5]:
# reading the store csv file

missing_values=["n/a", "na", "undefined"]
df = pd.read_csv("../data/data.csv", na_values=missing_values)
            
df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


# General Statistics

In [6]:
# number of elements in the df
df.size

1530592

In [7]:
# rows and columns in the df
df.shape

(95662, 16)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95662 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  FraudResult        

# Missing Values

In [9]:
percent_missing_values(df)

The dataset contains 0.0 % missing values.


# Data Types

In [10]:
#check if there are columns with mixed data types.
show_cols_mixed_dtypes(df)

None of the columns contain mixed types.


In [11]:
df.dtypes

TransactionId            object
BatchId                  object
AccountId                object
SubscriptionId           object
CustomerId               object
CurrencyCode             object
CountryCode               int64
ProviderId               object
ProductId                object
ProductCategory          object
ChannelId                object
Amount                  float64
Value                     int64
TransactionStartTime     object
PricingStrategy           int64
FraudResult               int64
dtype: object

In [12]:
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [13]:
# get the columns with object data type
string_columns = df.select_dtypes(include='object').columns.tolist()
string_columns

['TransactionId',
 'BatchId',
 'AccountId',
 'SubscriptionId',
 'CustomerId',
 'CurrencyCode',
 'ProviderId',
 'ProductId',
 'ProductCategory',
 'ChannelId',
 'TransactionStartTime']

In [14]:
convert_to_string(df, string_columns)

In [15]:
df.dtypes

TransactionId           string[python]
BatchId                 string[python]
AccountId               string[python]
SubscriptionId          string[python]
CustomerId              string[python]
CurrencyCode            string[python]
CountryCode                      int64
ProviderId              string[python]
ProductId               string[python]
ProductCategory         string[python]
ChannelId               string[python]
Amount                         float64
Value                            int64
TransactionStartTime    string[python]
PricingStrategy                  int64
FraudResult                      int64
dtype: object

In [16]:
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [17]:
convert_to_datetime(df, ['TransactionStartTime'])

In [18]:
df.dtypes

TransactionId                string[python]
BatchId                      string[python]
AccountId                    string[python]
SubscriptionId               string[python]
CustomerId                   string[python]
CurrencyCode                 string[python]
CountryCode                           int64
ProviderId                   string[python]
ProductId                    string[python]
ProductCategory              string[python]
ChannelId                    string[python]
Amount                              float64
Value                                 int64
TransactionStartTime    datetime64[ns, UTC]
PricingStrategy                       int64
FraudResult                           int64
dtype: object

In [19]:
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0


## Duplicates

In [20]:
# search for duplicate rows and drop them
drop_duplicates(df)

No duplicate rows were found.


In [21]:
df.duplicated(subset=['TransactionId', 'TransactionStartTime']).all()

False

## Feature Engineering

### Aggregate Features

In [22]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CountryCode,95662.0,256.0,0.0,256.0,256.0,256.0,256.0,256.0
Amount,95662.0,6717.846433,123306.797164,-1000000.0,-50.0,1000.0,2800.0,9880000.0
Value,95662.0,9900.583941,123122.087776,2.0,275.0,1000.0,5000.0,9880000.0
PricingStrategy,95662.0,2.255974,0.732924,0.0,2.0,2.0,2.0,4.0
FraudResult,95662.0,0.002018,0.044872,0.0,0.0,0.0,0.0,1.0


In [23]:
df ['TransactionTotal']= df['Value'].sum()
df ['TransactionCount']= df['Value'].count()
df ['TransactionSTD']= df['Value'].std()
df ['Transactionaverage']= df['Value'].mean()

In [24]:
df.sample(10)

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,TransactionTotal,TransactionCount,TransactionSTD,Transactionaverage
1619,TransactionId_129966,BatchId_30069,AccountId_4841,SubscriptionId_3829,CustomerId_625,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2018-11-16 13:58:28+00:00,2,0,947109661,95662,123122.087776,9900.583941
64048,TransactionId_111790,BatchId_68377,AccountId_4841,SubscriptionId_3829,CustomerId_446,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-5000.0,5000,2019-01-18 13:05:24+00:00,2,0,947109661,95662,123122.087776,9900.583941
36945,TransactionId_49919,BatchId_113640,AccountId_3185,SubscriptionId_2267,CustomerId_3613,UGX,256,ProviderId_3,ProductId_15,financial_services,ChannelId_3,5000.0,5000,2018-12-22 11:15:01+00:00,2,0,947109661,95662,123122.087776,9900.583941
71686,TransactionId_1392,BatchId_135420,AccountId_1596,SubscriptionId_2249,CustomerId_1974,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,10000.0,10000,2019-01-25 10:23:52+00:00,4,0,947109661,95662,123122.087776,9900.583941
76819,TransactionId_118993,BatchId_52558,AccountId_4841,SubscriptionId_3829,CustomerId_3730,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-01-28 23:07:31+00:00,2,0,947109661,95662,123122.087776,9900.583941
3078,TransactionId_65811,BatchId_35827,AccountId_4150,SubscriptionId_3552,CustomerId_4602,UGX,256,ProviderId_6,ProductId_11,data_bundles,ChannelId_3,500.0,500,2018-11-18 21:06:49+00:00,2,0,947109661,95662,123122.087776,9900.583941
69206,TransactionId_97981,BatchId_40925,AccountId_4841,SubscriptionId_3829,CustomerId_2816,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-01-23 13:57:36+00:00,2,0,947109661,95662,123122.087776,9900.583941
33991,TransactionId_108103,BatchId_74122,AccountId_4468,SubscriptionId_3787,CustomerId_4928,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-12-20 17:12:10+00:00,2,0,947109661,95662,123122.087776,9900.583941
47724,TransactionId_49627,BatchId_35369,AccountId_4841,SubscriptionId_3829,CustomerId_4954,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2018-12-31 15:26:50+00:00,2,0,947109661,95662,123122.087776,9900.583941
79307,TransactionId_71240,BatchId_53911,AccountId_4841,SubscriptionId_3829,CustomerId_1096,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-01-31 13:39:24+00:00,2,0,947109661,95662,123122.087776,9900.583941


In [26]:
df['TransactionYear'] = df['TransactionStartTime'].apply(lambda x: x.year)
df['TransactionMonth'] = df['TransactionStartTime'].apply(lambda x: x.month)
df['TransactionDay'] = df['TransactionStartTime'].apply(lambda x: x.day)
df['TransactionHour'] = df['TransactionStartTime'].apply(lambda x: x.hour)

In [27]:
df.sample(10)

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,TransactionTotal,TransactionCount,TransactionSTD,Transactionaverage,TransactionYear,TransactionMonth,TransactionDay,TransactionHour
25709,TransactionId_78211,BatchId_135263,AccountId_3206,SubscriptionId_1372,CustomerId_3634,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,5000.0,5000,2018-12-14 05:15:13+00:00,2,0,947109661,95662,123122.087776,9900.583941,2018,12,14,5
17579,TransactionId_20270,BatchId_116371,AccountId_744,SubscriptionId_1203,CustomerId_1089,UGX,256,ProviderId_1,ProductId_15,financial_services,ChannelId_3,10000.0,10000,2018-12-05 15:36:06+00:00,2,0,947109661,95662,123122.087776,9900.583941,2018,12,5,15
46489,TransactionId_22277,BatchId_38502,AccountId_268,SubscriptionId_2700,CustomerId_594,UGX,256,ProviderId_1,ProductId_14,financial_services,ChannelId_3,6297.0,6297,2018-12-29 20:16:45+00:00,2,0,947109661,95662,123122.087776,9900.583941,2018,12,29,20
4962,TransactionId_117173,BatchId_24059,AccountId_4841,SubscriptionId_3829,CustomerId_1510,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2018-11-21 10:02:02+00:00,2,0,947109661,95662,123122.087776,9900.583941,2018,11,21,10
437,TransactionId_15464,BatchId_123844,AccountId_4841,SubscriptionId_3829,CustomerId_1709,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2018-11-15 15:34:36+00:00,2,0,947109661,95662,123122.087776,9900.583941,2018,11,15,15
54442,TransactionId_37684,BatchId_110440,AccountId_4249,SubscriptionId_4429,CustomerId_7343,UGX,256,ProviderId_4,ProductId_10,airtime,ChannelId_2,-40000.0,40000,2019-01-08 11:58:13+00:00,4,0,947109661,95662,123122.087776,9900.583941,2019,1,8,11
82476,TransactionId_138575,BatchId_36324,AccountId_909,SubscriptionId_4786,CustomerId_1258,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,5500.0,5500,2019-02-01 18:21:51+00:00,2,0,947109661,95662,123122.087776,9900.583941,2019,2,1,18
65304,TransactionId_25959,BatchId_41544,AccountId_2043,SubscriptionId_842,CustomerId_2445,UGX,256,ProviderId_1,ProductId_15,financial_services,ChannelId_3,8600000.0,8600000,2019-01-19 07:23:31+00:00,2,1,947109661,95662,123122.087776,9900.583941,2019,1,19,7
39204,TransactionId_87709,BatchId_67019,AccountId_835,SubscriptionId_3465,CustomerId_1180,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,1000.0,1000,2018-12-24 16:30:12+00:00,2,0,947109661,95662,123122.087776,9900.583941,2018,12,24,16
26552,TransactionId_30705,BatchId_12178,AccountId_2928,SubscriptionId_4163,CustomerId_3353,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,1000.0,1000,2018-12-14 11:10:21+00:00,2,0,947109661,95662,123122.087776,9900.583941,2018,12,14,11


In [28]:
df.shape

(95662, 24)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   TransactionId         95662 non-null  string             
 1   BatchId               95662 non-null  string             
 2   AccountId             95662 non-null  string             
 3   SubscriptionId        95662 non-null  string             
 4   CustomerId            95662 non-null  string             
 5   CurrencyCode          95662 non-null  string             
 6   CountryCode           95662 non-null  int64              
 7   ProviderId            95662 non-null  string             
 8   ProductId             95662 non-null  string             
 9   ProductCategory       95662 non-null  string             
 10  ChannelId             95662 non-null  string             
 11  Amount                95662 non-null  float64            
 12  Valu

# Univariate Analysis

## Non-Graphical Univariate Analysis