In [2]:
import pandas as pd
new_customer_data = pd.read_csv('data/single_customer_data.csv')
new_customer_data.head()

Unnamed: 0,Customer Name,uid,business sector,location,phone number,national ID,passport,tin,gender,education level,business start date
0,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,Fisheries,Entebbe,256715065219,44789563,U4950081,9346111,Female,Vocational,2020-09-09


In [3]:
new_transactions_data = pd.read_csv('data/single_customer_transactions_data.csv')
new_transactions_data.head()

Unnamed: 0,Customer Name,uid,amount,type,reason,date
0,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,416918.31,Credit,Refund,2020-11-27
1,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,926431.17,Credit,Refund,2021-12-06
2,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,321147.14,Debit,Sale,2021-03-17
3,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,145512.16,Credit,Miscellaneous,2020-12-06
4,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,303000.63,Debit,Sale,2022-01-12


In [4]:
print(f"Number of rows in transactions_data: {new_transactions_data.shape[0]}")

Number of rows in transactions_data: 124


In [5]:
customer_data_cleaned = new_customer_data.drop(columns=['phone number', 'national ID', 'passport', 'tin'])
customer_data_cleaned.head()

Unnamed: 0,Customer Name,uid,business sector,location,gender,education level,business start date
0,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,Fisheries,Entebbe,Female,Vocational,2020-09-09


In [6]:
from datetime import datetime

#convert the business start date to data time so that we can calculate year
new_customer_data['business start date'] = pd.to_datetime(new_customer_data['business start date'])

duration_in_days = (datetime.now() - new_customer_data['business start date']).dt.days

duration_in_years = (duration_in_days / 365).round(2)

customer_data_cleaned['business duration (years)'] = duration_in_years

customer_data_cleaned.head()

Unnamed: 0,Customer Name,uid,business sector,location,gender,education level,business start date,business duration (years)
0,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,Fisheries,Entebbe,Female,Vocational,2020-09-09,3.09


### Add Transaction calculation Columns

In [7]:
# Calculate Net Profit
total_credit = new_transactions_data[new_transactions_data['type'] == 'Credit']['amount'].sum()
total_debit = new_transactions_data[new_transactions_data['type'] == 'Debit']['amount'].sum()
net_profit = total_credit - total_debit

# # Calculate Transaction Volume
transaction_volume = len(new_transactions_data)

# # Merge net profit and transaction volume with customer_data
customer_data_cleaned['Net Profit'] = net_profit
customer_data_cleaned['Transaction Volume'] = transaction_volume


# Calculate net profit per year and transaction volume per year
customer_data_cleaned['Net Profit Per Year'] = customer_data_cleaned['Net Profit']/ customer_data_cleaned['business duration (years)']
customer_data_cleaned['Transaction Volume Per Year'] = customer_data_cleaned['Transaction Volume']/ customer_data_cleaned['business duration (years)']

customer_data_cleaned.head()
# customer_data_cleaned_with_txn_vol.head()

Unnamed: 0,Customer Name,uid,business sector,location,gender,education level,business start date,business duration (years),Net Profit,Transaction Volume,Net Profit Per Year,Transaction Volume Per Year
0,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,Fisheries,Entebbe,Female,Vocational,2020-09-09,3.09,-3582789.55,124,-1159479.0,40.12945


### Encoding

In [8]:
categorical_columns = ['business sector', 'location', 'gender', 'education level']
customer_data_encoded = pd.get_dummies(customer_data_cleaned, columns=categorical_columns)

categorical_dummies = [col for col in customer_data_encoded.columns if any(prefix in col for prefix in categorical_columns)]
customer_data_encoded[categorical_dummies] = customer_data_encoded[categorical_dummies].astype(int)

customer_data_encoded.head()


Unnamed: 0,Customer Name,uid,business start date,business duration (years),Net Profit,Transaction Volume,Net Profit Per Year,Transaction Volume Per Year,business sector_Fisheries,location_Entebbe,gender_Female,education level_Vocational
0,James Bridget,330e5a89-513b-4b45-bb55-71b7c3d3d080,2020-09-09,3.09,-3582789.55,124,-1159479.0,40.12945,1,1,1,1


In [9]:
# 1. Find the minimum and maximum values in the "business duration (years)" column
min_duration = new_customer_data_encoded['business duration (years)'].min()
max_duration = new_customer_data_encoded['business duration (years)'].max()

print("min is " + str(min_duration))
print("max is " + str(max_duration))
# 1. Define the specific bins
bins = [0, 5, 10, 15, 20.01]  # 20.01 is included to ensure businesses with exactly 20.01 years are also categorized

# 2. Create the labels
labels = ['0 - 5 years', 
          '5 - 10 years', 
          '10 - 15 years', 
          '15 - 20 years']  # Notice the last label is '15 - 20 years', not '15 - 20.01 years' for display purposes

# 3. Create a new column called "business duration range" in the original dataframe and assign each row its respective range
new_customer_data_encoded['business duration range'] = pd.cut(new_customer_data_encoded['business duration (years)'], bins=bins, labels=labels, right=False, include_lowest=True)
duration_dummies = pd.get_dummies(new_customer_data_encoded['business duration range'], prefix='duration range')

duration_dummies = duration_dummies.astype(int)


# Extract rows for our own analysis
business_duration_ranges_df = new_customer_data_encoded.groupby('business duration range').size().reset_index(name='Number of Businesses')
business_duration_ranges_df.head()

# Reorder the columns
desired_order = ['Customer Name', 'uid', 'business start date', 'business duration (years)', 'business duration range'] + [col for col in new_customer_data_encoded if col not in ['Customer Name', 'uid', 'business start date', 'business duration (years)', 'business duration range']]
new_customer_data_encoded = new_customer_data_encoded[desired_order]

new_customer_data_encoded.head()

NameError: name 'new_customer_data_encoded' is not defined

In [None]:
duration_dummies = pd.get_dummies(new_customer_data_encoded.get('business duration range', pd.Series()), prefix='duration range').astype(int)
new_customer_data_encoded = pd.concat([new_customer_data_encoded, duration_dummies], axis=1)
new_customer_data_encoded_range_dropped = new_customer_data_encoded.drop('business duration range', axis=1)
new_customer_data_encoded_range_dropped.head()


In [None]:
# 1. Find the minimum and maximum transactions in the "Transaction Volume Per Year" column
min_transaction = new_customer_data_encoded_range_dropped['Transaction Volume Per Year'].min()
max_transaction = new_customer_data_encoded_range_dropped['Transaction Volume Per Year'].max()

print("min is " + str(min_transaction))
print("max is " + str(max_transaction))
bins = (0, 2000,4000,6000,8000,11000)

# 2. Create the labels
labels = ['0 - 2000', 
          '2000 - 4000', 
          '4000 - 6000',
          '6000 - 8000', 
          '8000 - 11000'] 

new_customer_data_encoded_range_dropped['transaction volume range'] = pd.cut(new_customer_data_encoded_range_dropped['Transaction Volume Per Year'], bins=bins, labels=labels, right=False, include_lowest=True)


new_customer_data_encoded_range_dropped.head()