In [210]:
import pandas as pd
import numpy as np 
import datetime

In [211]:
region = pd.read_csv("RAW/region.csv")
region

Unnamed: 0,region_id,region_name
0,1,Africa
1,2,America
2,3,Asia
3,4,Europe
4,5,Oceania


In [212]:
customer_nodes = pd.read_csv("RAW/Customer_Nodes.csv")

customer_nodes = customer_nodes.merge(region, 'left', 'region_id')

#removing whitespaces from time_stamps
customer_nodes['end_date'] = customer_nodes['end_date'].str.strip()

customer_nodes


Unnamed: 0,customer_id,region_id,node_id,start_date,end_date,region_name
0,1,3,4,2020-01-02,2020-01-03,Asia
1,2,3,5,2020-01-03,2020-01-17,Asia
2,3,5,4,2020-01-27,2020-02-18,Oceania
3,4,5,4,2020-01-07,2020-01-19,Oceania
4,5,3,3,2020-01-15,2020-01-23,Asia
...,...,...,...,...,...,...
3495,496,3,4,2020-02-25,9999-12-31,Asia
3496,497,5,4,2020-05-27,9999-12-31,Oceania
3497,498,1,2,2020-04-05,9999-12-31,Africa
3498,499,5,1,2020-02-03,9999-12-31,Oceania


In [213]:
customer_transactions = pd.read_csv("RAW/Customer_Transactions.csv")
customer_transactions

Unnamed: 0,customer_id,txn_date,txn_type,txn_amount
0,429,2020-01-21,deposit,82
1,155,2020-01-10,deposit,712
2,398,2020-01-01,deposit,196
3,255,2020-01-14,deposit,563
4,185,2020-01-29,deposit,626
...,...,...,...,...
5863,189,2020-02-03,withdrawal,870
5864,189,2020-03-22,purchase,718
5865,189,2020-02-06,purchase,393
5866,189,2020-01-22,deposit,302


A. Customer Nodes Exploration
How many unique nodes are there on the Data Bank system?
What is the number of nodes per region?
How many customers are allocated to each region?
How many days on average are customers reallocated to a different node?
What is the median, 80th and 95th percentile for this same reallocation days metric for each region?

In [214]:
#How many unique nodes are there on the Data Bank system?
print("there are",customer_nodes.node_id.count(),"entries of nodes in the Data bank System")

print("these are all variations of the following ",len(customer_nodes.node_id.unique()),":",customer_nodes.node_id.unique())

print("in the following proportions", customer_nodes.groupby('node_id').size())

there are 3500 entries of nodes in the Data bank System
these are all variations of the following  5 : [4 5 3 1 2]
in the following proportions node_id
1    728
2    662
3    699
4    704
5    707
dtype: int64


In [215]:
#What is the number of nodes per region?
print("The Distribution for nodes per region is ", customer_nodes.groupby('region_name').size())

The Distribution for nodes per region is  region_name
Africa     770
America    735
Asia       714
Europe     665
Oceania    616
dtype: int64


In [216]:
#How many customers are allocated to each region?

print("Customers Allocated Per region are")
customer_nodes.groupby('region_name').agg(customers=('customer_id', 'nunique'))



Customers Allocated Per region are


Unnamed: 0_level_0,customers
region_name,Unnamed: 1_level_1
Africa,110
America,105
Asia,102
Europe,95
Oceania,88


In [217]:
print("The Average Distribution per region is", int(
    customer_nodes.customer_id.nunique()/customer_nodes.region_id.nunique()))


The Average Distribution per region is 100


In [218]:
#How many days on average are customers reallocated to a different node?


#filtering out outlier 9999-12-31
cn_2 = customer_nodes[customer_nodes['end_date'] != '9999-12-31'][['start_date','end_date']].copy()

#parsing dates to datetime from string
cn_2['start_date'] = pd.to_datetime(cn_2['start_date'], format='%Y-%m-%d')
cn_2['end_date'] = pd.to_datetime(cn_2['end_date'], format='%Y-%m-%d')

#calculating time_delta in a list 
time_deltas = cn_2['end_date']-cn_2['start_date']

#calculating average time a customer stays in a node
# giving datetime.timedelta(0) as the start value makes sum work on tds
average_time_delta = sum(time_deltas, datetime.timedelta(0))/len(time_deltas)

print("On average a customer is relocated to a new node after",average_time_delta.days,"days")

#clearing temporary dataframe and lists from memory 
del cn_2
del time_deltas


On average a customer is relocated to a new node after 14 days


In [219]:
#Handling outliers in End_Date

#Parsing Start_date in Datetime format
customer_nodes['start_date'] = pd.to_datetime(customer_nodes['start_date'], format='%Y-%m-%d')

#Replacing Values with date '9999-12-31' with Start + Average Node relocation Time
customer_nodes['end_date'] = np.where((customer_nodes['end_date'] == '9999-12-31'),
                                   (customer_nodes['start_date'] + datetime.timedelta(days=average_time_delta.days)).apply(lambda x: x.strftime("%Y-%m-%d")), customer_nodes['end_date'])

customer_nodes['end_date'] = pd.to_datetime(customer_nodes['end_date'], format='%Y-%m-%d')

#Parsing End_Date in Datetime format
customer_nodes['end_date'] = pd.to_datetime(customer_nodes['end_date'], format='%Y-%m-%d')

customer_nodes


Unnamed: 0,customer_id,region_id,node_id,start_date,end_date,region_name
0,1,3,4,2020-01-02,2020-01-03,Asia
1,2,3,5,2020-01-03,2020-01-17,Asia
2,3,5,4,2020-01-27,2020-02-18,Oceania
3,4,5,4,2020-01-07,2020-01-19,Oceania
4,5,3,3,2020-01-15,2020-01-23,Asia
...,...,...,...,...,...,...
3495,496,3,4,2020-02-25,2020-03-10,Asia
3496,497,5,4,2020-05-27,2020-06-10,Oceania
3497,498,1,2,2020-04-05,2020-04-19,Africa
3498,499,5,1,2020-02-03,2020-02-17,Oceania


In [279]:
#What is the median, 80th and 95th percentile for this same reallocation days metric for each region?
customer_nodes['time_delta'] = round((customer_nodes['end_date'] - customer_nodes['start_date']).astype('timedelta64[D]'),0)

print("overall median value is",customer_nodes['time_delta'].median())


overall median value is 14.0


In [288]:
def q80(x):
    return x.quantile(0.80)

def q95(x):
    return x.quantile(0.95)

metrics = {'time_delta': ['median', q80, q95]}
customer_nodes.groupby('region_name').agg(metrics)


Unnamed: 0_level_0,time_delta,time_delta,time_delta
Unnamed: 0_level_1,median,q80,q95
region_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Africa,14.0,22.0,28.0
America,14.0,22.0,27.0
Asia,14.0,23.0,28.0
Europe,14.0,22.0,28.0
Oceania,14.0,23.0,28.0
