In [1]:
import pandas as pd

# Read the single-customer profile CSV into a DataFrame and preview its first rows.
new_customer_data = pd.read_csv(filepath_or_buffer='data/single_customer_data.csv')
new_customer_data.head(n=5)

Unnamed: 0,Customer Name,uid,business sector,location,phone number,national ID,passport,tin,gender,education level,business start date
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,Agriculture,Jinja,256772298984,83433993,U4603610,7042452,Female,University,2009-06-15


In [2]:
# Read the customer's transaction history CSV into a DataFrame and preview its first rows.
new_transactions_data = pd.read_csv(filepath_or_buffer='data/single_customer_transactions_data.csv')
new_transactions_data.head(n=5)

Unnamed: 0,Customer Name,uid,amount,type,reason,date
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,480560.44,Debit,Purchase,2019-12-15
1,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,28991.72,Debit,Refund,2019-04-20
2,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,55789.91,Credit,Sale,2023-10-17
3,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,403996.93,Debit,Operating Expenses,2014-12-28
4,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,432391.08,Credit,Miscellaneous,2022-06-27


In [3]:
# Report how many transaction rows were loaded.
transaction_row_count = len(new_transactions_data.index)
print(f"Number of rows in transactions_data: {transaction_row_count}")

Number of rows in transactions_data: 104


In [4]:
# Contact and identity-document columns are removed before any feature work;
# the result is a new frame, so new_customer_data itself is left untouched.
identifier_columns = ['phone number', 'national ID', 'passport', 'tin']
customer_data_cleaned = new_customer_data.drop(columns=identifier_columns)
customer_data_cleaned.head()

Unnamed: 0,Customer Name,uid,business sector,location,gender,education level,business start date
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,Agriculture,Jinja,Female,University,2009-06-15


In [5]:
# new_customer_data.head()

In [6]:
from datetime import datetime

# Parse the start dates into datetimes so date arithmetic can be done on them.
start_dates = pd.to_datetime(customer_data_cleaned['business start date'])
customer_data_cleaned['business start date'] = start_dates

# Whole days elapsed since the business started. The reference point is the
# clock at execution time, so this value changes from one run to the next.
duration_in_days = (datetime.now() - start_dates).dt.days

# Express the duration in years using a flat 365-day year, rounded to 2 decimals.
duration_in_years = duration_in_days.div(365).round(2)

customer_data_cleaned['business duration (years)'] = duration_in_years

customer_data_cleaned.head()

Unnamed: 0,Customer Name,uid,business sector,location,gender,education level,business start date,business duration (years)
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,Agriculture,Jinja,Female,University,2009-06-15,14.36


### Add Transaction Calculation Columns

In [7]:
# Net profit is the sum of all credited amounts minus the sum of all debited amounts.
transaction_amounts = new_transactions_data['amount']
transaction_types = new_transactions_data['type']
total_credit = transaction_amounts[transaction_types == 'Credit'].sum()
total_debit = transaction_amounts[transaction_types == 'Debit'].sum()
net_profit = total_credit - total_debit

# Transaction volume is the number of transaction rows.
transaction_volume = new_transactions_data.shape[0]

# Both figures are scalars, so they are broadcast to every row of the customer frame.
customer_data_cleaned['Net Profit'] = net_profit
customer_data_cleaned['Transaction Volume'] = transaction_volume

# Normalise both figures by how long the business has been running.
years_in_business = customer_data_cleaned['business duration (years)']
customer_data_cleaned['Net Profit Per Year'] = customer_data_cleaned['Net Profit'].div(years_in_business)
customer_data_cleaned['Transaction Volume Per Year'] = customer_data_cleaned['Transaction Volume'].div(years_in_business)

customer_data_cleaned.head()

Unnamed: 0,Customer Name,uid,business sector,location,gender,education level,business start date,business duration (years),Net Profit,Transaction Volume,Net Profit Per Year,Transaction Volume Per Year
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,Agriculture,Jinja,Female,University,2009-06-15,14.36,4215271.67,104,293542.595404,7.24234


### Encoding

In [8]:
# Columns holding categorical text values that must be one-hot encoded.
categorical_columns = ['business sector', 'location', 'gender', 'education level']

# One-hot encode the categorical columns. dtype=int makes get_dummies emit 0/1
# integers directly, so no follow-up cast over name-matched columns is needed.
customer_data_encoded = pd.get_dummies(customer_data_cleaned, columns=categorical_columns, dtype=int)

# Names of the generated indicator columns. get_dummies names them
# "<source column>_<category>", so match on that exact prefix rather than on a
# substring, which could also pick up unrelated columns.
dummy_prefixes = tuple(f'{column}_' for column in categorical_columns)
categorical_dummies = [col for col in customer_data_encoded.columns if str(col).startswith(dummy_prefixes)]

customer_data_encoded.head()


Unnamed: 0,Customer Name,uid,business start date,business duration (years),Net Profit,Transaction Volume,Net Profit Per Year,Transaction Volume Per Year,business sector_Agriculture,location_Jinja,gender_Female,education level_University
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,2009-06-15,14.36,4215271.67,104,293542.595404,7.24234,1,1,1,1


In [9]:
# Report the smallest and largest business duration (in years) before binning.
business_durations = customer_data_encoded['business duration (years)']
min_duration = business_durations.min()
max_duration = business_durations.max()

print("min is " + str(min_duration))
print("max is " + str(max_duration))

# Bin edges in years. pd.cut is called with right=False below, so each bin is
# half-open [lower, upper): the 20.01 upper edge lets a duration of exactly
# 20.00 years fall in the last bin, while durations of 20.01 or more match no
# bin and become NaN.
bins = [0, 5, 10, 15, 20.01]

# One display label per bin; the last one reads '15 - 20 years' rather than
# '15 - 20.01 years' for display purposes.
labels = ['0 - 5 years',
          '5 - 10 years',
          '10 - 15 years',
          '15 - 20 years']

# Assign each row the label of the duration range it falls into.
customer_data_encoded['business duration range'] = pd.cut(business_durations, bins=bins, labels=labels, right=False, include_lowest=True)

# Number of businesses per duration range, kept for ad-hoc inspection.
# observed=False is spelled out so that ranges with no businesses still appear
# with a count of 0 and newer pandas versions do not warn about the implicit default.
business_duration_ranges_df = (
    customer_data_encoded
    .groupby('business duration range', observed=False)
    .size()
    .reset_index(name='Number of Businesses')
)

# Move the identifying and duration columns to the front; every other column
# keeps its existing relative order.
leading_columns = ['Customer Name', 'uid', 'business start date', 'business duration (years)', 'business duration range']
desired_order = leading_columns + [col for col in customer_data_encoded if col not in leading_columns]
customer_data_encoded = customer_data_encoded[desired_order]

customer_data_encoded.head()

min is 14.36
max is 14.36


Unnamed: 0,Customer Name,uid,business start date,business duration (years),business duration range,Net Profit,Transaction Volume,Net Profit Per Year,Transaction Volume Per Year,business sector_Agriculture,location_Jinja,gender_Female,education level_University
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,2009-06-15,14.36,10 - 15 years,4215271.67,104,293542.595404,7.24234,1,1,1,1


In [10]:
# Replace the categorical 'business duration range' column with one 0/1
# indicator column per range. The membership check makes the cell safe to
# re-run: once the column has been replaced, a second run is a no-op instead of
# failing when it tries to drop a column that no longer exists.
duration_range_column = 'business duration range'
if duration_range_column in customer_data_encoded.columns:
    duration_dummies = pd.get_dummies(
        customer_data_encoded[duration_range_column],
        prefix='duration range',
        dtype=int,
    )
    customer_data_encoded = pd.concat(
        [customer_data_encoded.drop(columns=duration_range_column), duration_dummies],
        axis=1,
    )
customer_data_encoded.head()


Unnamed: 0,Customer Name,uid,business start date,business duration (years),Net Profit,Transaction Volume,Net Profit Per Year,Transaction Volume Per Year,business sector_Agriculture,location_Jinja,gender_Female,education level_University,duration range_0 - 5 years,duration range_5 - 10 years,duration range_10 - 15 years,duration range_15 - 20 years
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,2009-06-15,14.36,4215271.67,104,293542.595404,7.24234,1,1,1,1,0,0,1,0


In [11]:
# Report the smallest and largest yearly transaction volume before binning.
volume_per_year = customer_data_encoded['Transaction Volume Per Year']
min_transaction, max_transaction = volume_per_year.min(), volume_per_year.max()

print("min is " + str(min_transaction))
print("max is " + str(max_transaction))

# Bin edges for the yearly transaction volume. With right=False in the pd.cut
# call below, each bin is half-open: [lower, upper).
# NOTE(review): the value printed for this customer is about 7.24 per year, far
# below the 2000-wide bins - confirm the bins are meant for this column's scale.
bins = (0, 2000, 4000, 6000, 8000, 11000)

# One label per bin, in the same order as the bin edges.
labels = ['0 - 2000', '2000 - 4000', '4000 - 6000', '6000 - 8000', '8000 - 11000']

# Assign each row the label of the volume range it falls into.
transaction_volume_ranges = pd.cut(volume_per_year, bins=bins, labels=labels, right=False, include_lowest=True)
customer_data_encoded['transaction volume range'] = transaction_volume_ranges

customer_data_encoded.head()

min is 7.242339832869081
max is 7.242339832869081


Unnamed: 0,Customer Name,uid,business start date,business duration (years),Net Profit,Transaction Volume,Net Profit Per Year,Transaction Volume Per Year,business sector_Agriculture,location_Jinja,gender_Female,education level_University,duration range_0 - 5 years,duration range_5 - 10 years,duration range_10 - 15 years,duration range_15 - 20 years,transaction volume range
0,William Bridget,aef4b9fa-5402-4967-9f1a-3921665ddf84,2009-06-15,14.36,4215271.67,104,293542.595404,7.24234,1,1,1,1,0,0,1,0,0 - 2000
