In [1]:
import pandas as pd

# Load the provided Excel file to review its content
file_path = 'Transactions split (1).xlsx'
transactions_data = pd.read_excel(file_path)

# Display the content of the Excel file
transactions_data.head()


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,,,,,,,,,,,,
1,Revenue Streams,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00,2024-01-01 00:00:00,2024-02-01 00:00:00,2024-03-01 00:00:00,2024-04-01 00:00:00,Total
2,Total Transactional Value,-,-,-,28946,128950,349060,503987.28,1286496.95,1739505.63,3420567.89,
3,Plutus Take RateEnabled Value,-,-,-,-,-,139760,374996.46,586496.95,786554.62,1667460,
4,Net Revenue/Taxable Value (Excluding Taxes),,,,0,-,12586,32983.59,49885,74860,140738.87,


In [2]:
#drop the two first rows
transactions_data.drop([0], inplace=True)
#make the first as the column name
transactions_data.columns = transactions_data.iloc[0] 
#remove the first row (the header) and reset index
transactions_data = transactions_data.iloc[1:].reset_index(drop=True)

In [3]:
# Replace '-' with 0 for monetary value columns and convert all to numeric where applicable
for column in transactions_data.columns[1:-1]:  # Exclude 'Revenue Streams' and 'Total'
    transactions_data[column] = transactions_data[column].replace('-', 0).astype(float)

# Verify the changes and data types after conversion
transactions_data.head(), transactions_data.dtypes


(1                              Revenue Streams  2023-07-01 00:00:00  \
 0                    Total Transactional Value                  0.0   
 1               Plutus Take RateEnabled  Value                  0.0   
 2  Net Revenue/Taxable Value (Excluding Taxes)                  NaN   
 3                                  Total Users                  NaN   
 
 1  2023-08-01 00:00:00  2023-09-01 00:00:00  2023-10-01 00:00:00  \
 0                  0.0                  0.0              28946.0   
 1                  0.0                  0.0                  0.0   
 2                  NaN                  NaN                  0.0   
 3                  NaN                  NaN               6450.0   
 
 1  2023-11-01 00:00:00  2023-12-01 00:00:00  2024-01-01 00:00:00  \
 0             128950.0             349060.0            503987.28   
 1                  0.0             139760.0            374996.46   
 2                  0.0              12586.0             32983.59   
 3             

In [4]:
import random
import math
from datetime import timedelta, datetime

# Initialize the DataFrame for the synthetic data
synthetic_transactions = []

# Define function to generate random dates within a given month
def random_date(year, month):
    start_date = datetime(year, month, 1)
    if month == 12:
        end_date = datetime(year + 1, 1, 1) - timedelta(days=1)
    else:
        end_date = datetime(year, month + 1, 1) - timedelta(days=1)
    random_days = random.randint(0, (end_date - start_date).days)
    return start_date + timedelta(days=random_days)
# Re-initialize the DataFrame for synthetic data
synthetic_transactions = []

# Re-iterate over months and generate data with handling NaN in user counts
for column in transactions_data.columns[2:-1]:  # Exclude 'Revenue Streams' and 'Total', start from first month
    # Take the date part, ignore time part
    
    date_str = str(column).split(" ")[0]  # Take the date part, ignore time part
    year, month, _ = map(int, date_str.split('-'))
    total_value = transactions_data.loc[0, column]  # Total transactional value for the month
    users_count = transactions_data.loc[3, column]  # Total users for the month

    if pd.isna(users_count):
        continue  # Skip months with NaN user counts

    users_count = int(users_count)
    if users_count > 0:
        avg_transaction_value = total_value / users_count
    else:
        continue  # Skip if no users

    # Generate transactions
    for user in range(users_count):
        transaction_id = f"TXN{random.randint(1000, 999999)}"
        customer_id = f"CUST{random.randint(1000, 999999)}"
        transaction_date = random_date(year, month)
        # Decide the transaction amount based on specified ranges and average
        transaction_amount = random.uniform(4, 200) if random.random() < 0.1 else avg_transaction_value
        
        # Append the transaction to the DataFrame
        synthetic_transactions.append({
            'transaction_id': transaction_id,
            'date': transaction_date,
            'customer_id': customer_id,
            'net_revenue_earned': f"${transaction_amount:.2f}"
        })




In [5]:
df = pd.DataFrame(synthetic_transactions)


In [6]:
df.to_csv('synthetic_transactions.csv', index=False)

In [None]:
df["net_revenue_earned"].sum()

In [8]:
transactions_data.head()

1,Revenue Streams,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00,2024-01-01 00:00:00,2024-02-01 00:00:00,2024-03-01 00:00:00,2024-04-01 00:00:00,Total
0,Total Transactional Value,-,-,-,28946,128950,349060,503987.28,1286496.95,1739505.63,3420567.89,
1,Plutus Take RateEnabled Value,-,-,-,-,-,139760,374996.46,586496.95,786554.62,1667460.0,
2,Net Revenue/Taxable Value (Excluding Taxes),,,,0,-,12586,32983.59,49885.0,74860.0,140738.87,
3,Total Users,,,,6450,16723,48723,78347.0,128994.0,197853.0,308943.0,


In [21]:
# prrint to csv
data = transactions_data

In [23]:
from datetime import datetime, timedelta
import numpy as np

# Define the start and end months for data generation
start_date = datetime(2023, 10, 1)
end_date = datetime(2024, 4, 30)

# Extract the transactional volumes for each month
monthly_volumes = data.iloc[0, 4:11].astype(float)

# Helper functions for generating random data
def random_date(start, end):
    """Generate a random date within a month."""
    days_in_month = (end - start).days
    random_day = np.random.randint(0, days_in_month)
    return start + timedelta(days=random_day)

def generate_transactions(month, volume):
    """Generate transaction data for a given month."""
    start_of_month = datetime(month.year, month.month, 1)
    end_of_month = (start_of_month + timedelta(days=32)).replace(day=1) - timedelta(days=1)
    
    transactions = []
    remaining_volume = volume
    while remaining_volume > 0:
        if remaining_volume > 200:
            net_revenue = np.random.choice([4, 5, 100, 150, 200])
        else:
            net_revenue = np.random.choice([4, 5, remaining_volume], p=[0.45, 0.45, 0.1])
        
        transaction_id = f"TXN{np.random.randint(100000, 999999)}"
        customer_id = f"CUST{np.random.randint(1000, 9999)}"
        date = random_date(start_of_month, end_of_month)
        transactions.append([transaction_id, date.strftime('%Y-%m-%d'), customer_id, f"${net_revenue}"])
        
        remaining_volume -= net_revenue

    return transactions

# Generate data for all required months
all_transactions = []
current_month = start_date
for volume in monthly_volumes:
    month_transactions = generate_transactions(current_month, volume)
    all_transactions.extend(month_transactions)
    current_month += timedelta(days=(current_month.replace(day=28) + timedelta(days=4)).day)

# Convert to DataFrame and save to CSV
transactions_df = pd.DataFrame(all_transactions, columns=['transaction_id', 'date', 'customer_id', 'net_revenue_earned'])
transactions_df

Unnamed: 0,transaction_id,date,customer_id,net_revenue_earned
0,TXN632477,2023-10-30,CUST9974,$4
1,TXN241575,2023-10-08,CUST2934,$4
2,TXN969212,2023-10-10,CUST4482,$4
3,TXN944638,2023-10-30,CUST6803,$150
4,TXN100484,2023-10-11,CUST5152,$4
...,...,...,...,...
427407,TXN795975,2023-10-26,CUST2618,$5.0
427408,TXN314981,2023-10-29,CUST7009,$4.0
427409,TXN602574,2023-10-11,CUST2749,$5.0
427410,TXN897647,2023-10-05,CUST1160,$4.0


In [27]:
# Total users data per month
monthly_users = data.iloc[3, 4:11].astype(int)

def generate_transactions_adjusted_users(month, volume, total_users):
    """Generate transaction data ensuring one transaction per user."""
    start_of_month = datetime(month.year, month.month, 1)
    end_of_month = (start_of_month + timedelta(days=32)).replace(day=1) - timedelta(days=1)
    
    transactions = []
    average_revenue = volume / total_users
    # Determine revenue per user in a way that sums up closely to the total volume
    revenues = np.random.normal(loc=average_revenue, scale=average_revenue/10, size=total_users)
    revenues = np.maximum(revenues, 4)  # Ensure minimum transaction value
    revenues = np.round(revenues, 2)  # Round to two decimal places
    revenues[-1] = volume - np.sum(revenues[:-1])  # Adjust last transaction to perfectly match the volume
    revenues = list(revenues)

    for _ in range(total_users):
        transaction_id = f"TXN{np.random.randint(100000, 999999)}"
        customer_id = f"CUST{np.random.randint(1000, 9999)}"
        date = random_date(start_of_month, end_of_month)

        net_revenue = revenues.pop() if len(revenues) > 0 else average_revenue
        transactions.append([transaction_id, date.strftime('%Y-%m-%d'), customer_id, f"${net_revenue}"])
        
    return transactions

# Generate data for all required months respecting total users
all_transactions_adjusted = []
current_month = start_date
idx = 0  # Index for extracting user counts and volumes
for volume, users in zip(monthly_volumes, monthly_users):
    month_transactions = generate_transactions_adjusted_users(current_month, volume, users)
    all_transactions_adjusted.extend(month_transactions)
    current_month += timedelta(days=(current_month.replace(day=28) + timedelta(days=4)).day)
    idx += 1

# Convert to DataFrame and save to CSV
transactions_df_adjusted = pd.DataFrame(all_transactions_adjusted, columns=['transaction_id', 'date', 'customer_id', 'net_revenue_earned'])



In [29]:
transactions_df_adjusted["date"].value_counts()

date
2023-10-21    26453
2023-10-17    26447
2023-10-28    26412
2023-10-25    26354
2023-10-18    26337
2023-10-15    26327
2023-10-23    26316
2023-10-01    26307
2023-10-27    26302
2023-10-08    26301
2023-10-29    26279
2023-10-13    26255
2023-10-02    26252
2023-10-10    26231
2023-10-30    26229
2023-10-12    26218
2023-10-04    26212
2023-10-09    26211
2023-10-24    26198
2023-10-05    26193
2023-10-06    26162
2023-10-22    26136
2023-10-07    26129
2023-10-14    26124
2023-10-16    26099
2023-10-11    26077
2023-10-19    25992
2023-10-26    25976
2023-10-03    25899
2023-10-20    25605
Name: count, dtype: int64

In [38]:
import numpy as np
import random
from datetime import datetime, timedelta

# Function to generate random transaction dates within a month
def generate_transaction_dates(month, year, num_transactions):
    date_list = [datetime(year, month, random.randint(1, 28)) for _ in range(num_transactions)]
    return date_list

# Function to generate transaction IDs and Customer IDs
def generate_ids(prefix, num_transactions):
    return [f"{prefix}{random.randint(1000, 9999)}" for _ in range(num_transactions)]

# Function to generate transactions for a month
def generate_transactions(month, year, total_value):
    transactions = []
    remaining_value = total_value
    avg_small_transaction = 4.5
    large_transaction_value = 150
    large_transaction_frequency = 10  # 1 large transaction for every 10 small transactions

    # Estimate initial number of transactions based on mostly small transactions
    estimated_transactions = int(total_value / avg_small_transaction)

    transaction_dates = generate_transaction_dates(month, year, estimated_transactions)
    transaction_ids = generate_ids("TXN", estimated_transactions)
    customer_ids = generate_ids("CUST", estimated_transactions)

    for i in range(estimated_transactions):
        if i % large_transaction_frequency == 0 and remaining_value > large_transaction_value:
            # Assign a large transaction
            current_value = min(large_transaction_value, remaining_value)
        else:
            # Assign a small transaction
            current_value = min(avg_small_transaction, remaining_value)

        transactions.append({
            "transaction_id": transaction_ids[i],
            "date": transaction_dates[i].date().isoformat(),
            "customer_id": customer_ids[i],
            "net_revenue_earned": f"${current_value:.2f}"
        })
        remaining_value -= current_value

        # Stop if the remaining value is too small to continue
        if remaining_value < avg_small_transaction:
            break

    return transactions

#


In [32]:
data

1,Revenue Streams,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00,2024-01-01 00:00:00,2024-02-01 00:00:00,2024-03-01 00:00:00,2024-04-01 00:00:00,Total
0,Total Transactional Value,-,-,-,28946,128950,349060,503987.28,1286496.95,1739505.63,3420567.89,
1,Plutus Take RateEnabled Value,-,-,-,-,-,139760,374996.46,586496.95,786554.62,1667460.0,
2,Net Revenue/Taxable Value (Excluding Taxes),,,,0,-,12586,32983.59,49885.0,74860.0,140738.87,
3,Total Users,,,,6450,16723,48723,78347.0,128994.0,197853.0,308943.0,


In [30]:
transactions_volume_df = data

In [37]:
data.columns[-2]

datetime.datetime(2024, 4, 1, 0, 0)

In [40]:
# Extract the relevant row for "Total Transactional Value"
total_transaction_values = transactions_volume_df.loc[0, datetime(2023, 7, 1, 0, 0): datetime(2024, 4, 1, 0, 0)]

# Convert to numeric and filter out NaN and '-' values
total_transaction_values = pd.to_numeric(total_transaction_values, errors='coerce')
total_transaction_values = total_transaction_values.dropna()

total_transaction_values


1
2023-10-01 00:00:00      28946.00
2023-11-01 00:00:00     128950.00
2023-12-01 00:00:00     349060.00
2024-01-01 00:00:00     503987.28
2024-02-01 00:00:00    1286496.95
2024-03-01 00:00:00    1739505.63
2024-04-01 00:00:00    3420567.89
Name: 0, dtype: float64

In [42]:
# Extract the relevant row for "Total Users"
total_users_values = transactions_volume_df.loc[3, datetime(2023, 7, 1, 0, 0): datetime(2024, 4, 1, 0, 0)]

# Convert to numeric and filter out NaN and '-' values
total_users_values = pd.to_numeric(total_users_values, errors='coerce')
total_users_values = total_users_values.dropna()

total_users_values


1
2023-10-01 00:00:00      6450.0
2023-11-01 00:00:00     16723.0
2023-12-01 00:00:00     48723.0
2024-01-01 00:00:00     78347.0
2024-02-01 00:00:00    128994.0
2024-03-01 00:00:00    197853.0
2024-04-01 00:00:00    308943.0
Name: 3, dtype: float64

In [44]:
# Improved function to ensure no negative transactions and better distribution of values
def generate_refined_transactions(month, year, total_value, num_users):
    transactions = []
    remaining_value = total_value

    # Average transaction value should be dynamically adjusted based on number of large transactions needed
    base_small_value = 4.5  # Base value for small transactions
    large_transaction_value = 150
    large_transaction_frequency = 10  # Every 10th transaction is large
    num_large_transactions = num_users // large_transaction_frequency
    total_large_transaction_value = num_large_transactions * large_transaction_value

    # Adjust average small transaction value to ensure total balances correctly
    total_small_transaction_value = total_value - total_large_transaction_value
    avg_small_value = total_small_transaction_value / (num_users - num_large_transactions)

    transaction_dates = generate_transaction_dates(month, year, num_users)
    transaction_ids = generate_ids("TXN", num_users)
    customer_ids = generate_ids("CUST", num_users)

    for i in range(num_users):
        if i % large_transaction_frequency == 0:
            # Assign a large transaction
            current_value = large_transaction_value
        else:
            # Assign a small transaction, adjusted to balance the total
            current_value = avg_small_value

        transactions.append({
            "transaction_id": transaction_ids[i],
            "date": transaction_dates[i].date().isoformat(),
            "customer_id": customer_ids[i],
            "net_revenue_earned": f"${current_value:.2f}"
        })
        remaining_value -= current_value

    return transactions

# Regenerate refined transactions using the new method
all_refined_transactions = []
for date_str, value in total_transaction_values.items():
    date = datetime.strptime(str(date_str), '%Y-%m-%d %H:%M:%S')
    num_users = int(total_users_values.get(date_str, 0))
    if num_users > 0:
        month_transactions = generate_refined_transactions(date.month, date.year, value, num_users)
        all_refined_transactions.extend(month_transactions)

# Create DataFrame from all_refined_transactions
all_refined_transactions_df = pd.DataFrame(all_refined_transactions)
all_refined_transactions_df.head()


Unnamed: 0,transaction_id,date,customer_id,net_revenue_earned
0,TXN5481,2023-10-14,CUST6134,$150.00
1,TXN3769,2023-10-17,CUST5743,$-11.68
2,TXN6761,2023-10-08,CUST8436,$-11.68
3,TXN9848,2023-10-02,CUST2475,$-11.68
4,TXN7172,2023-10-28,CUST3063,$-11.68


In [None]:
all_refined_transactions_df["net_revenue_earned"].sum()

In [9]:
transactions_info = transactions_data

In [19]:
# Filter the rows for Total Transactional Value and Total Users to get the transaction volumes and number of users
total_transactions = transactions_info.loc[2, transactions_info.columns[1:-1]]
total_users = transactions_info.loc[3, transactions_info.columns[1:-1]]

# Display the extracted total transaction values and total user counts for each month to confirm correctness
total_transactions, total_users


(1
 2023-07-01 00:00:00          NaN
 2023-08-01 00:00:00          NaN
 2023-09-01 00:00:00          NaN
 2023-10-01 00:00:00            0
 2023-11-01 00:00:00            -
 2023-12-01 00:00:00        12586
 2024-01-01 00:00:00     32983.59
 2024-02-01 00:00:00        49885
 2024-03-01 00:00:00        74860
 2024-04-01 00:00:00    140738.87
 Name: 2, dtype: object,
 1
 2023-07-01 00:00:00       NaN
 2023-08-01 00:00:00       NaN
 2023-09-01 00:00:00       NaN
 2023-10-01 00:00:00      6450
 2023-11-01 00:00:00     16723
 2023-12-01 00:00:00     48723
 2024-01-01 00:00:00     78347
 2024-02-01 00:00:00    128994
 2024-03-01 00:00:00    197853
 2024-04-01 00:00:00    308943
 Name: 3, dtype: object)

In [48]:
import numpy as np
from datetime import timedelta, datetime
from math import isnan


# Function to generate random dates within a specific month
def random_dates(start_date, n):
    start_date = datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S")
    end_date = start_date + timedelta(days=30)  # Assuming 30 days per month for simplicity
    return [start_date + timedelta(days=np.random.randint(0, 30)) for _ in range(n)]

# Generate data for each month with transaction data available
transaction_data = []
np.random.seed(0)  # Seed for reproducibility

# Iterate over months and generate data
for month, total_value in total_transactions.items():
    if total_value != '-':
        if not isnan(float(total_value)) :
            print(month, total_value,isnan(total_value))  # Ignore months without transaction data
            num_transactions = int(total_users[month])  # Number of transactions equals number of users
            total_value = float(total_value)
            dates = random_dates(str(month), num_transactions)
            
            # Generate transaction amounts
            # th
            values = np.random.normal(4.5, 0.5, num_transactions)  # Most transactions around $4-$5
            #values = np.array([v.round(2) for v in values])
            # Add some larger transactions
            large_transaction_indices = np.random.choice(num_transactions, size=int(num_transactions * 0.05), replace=False)
            values[large_transaction_indices] = np.random.uniform(100, 200, size=len(large_transaction_indices))
            
            # Adjust values to match the total value
            scale_factor = total_value / values.sum()
            values *= scale_factor
            
            # Generate transaction IDs and customer IDs
            transaction_ids = [f"TXN{1000000 + i}" for i in range(num_transactions)]
            customer_ids = [f"CUST{1000 + np.random.randint(1, 10000)}" for _ in range(num_transactions)]
            
            # Combine all data into transaction_data
            month_data = zip(transaction_ids, dates, customer_ids, values)
            transaction_data.extend(month_data)

# Convert transaction data to a DataFrame
transaction_df = pd.DataFrame(transaction_data, columns=['transaction_id', 'date', 'customer_id', 'net_revenue_earned'])
transaction_df.head()


2023-10-01 00:00:00 0 False
2023-12-01 00:00:00 12586 False
2024-01-01 00:00:00 32983.59 False
2024-02-01 00:00:00 49885 False
2024-03-01 00:00:00 74860 False
2024-04-01 00:00:00 140738.87 False


Unnamed: 0,transaction_id,date,customer_id,net_revenue_earned
0,TXN1000000,2023-10-13,CUST4068,0.0
1,TXN1000001,2023-10-16,CUST9121,0.0
2,TXN1000002,2023-10-22,CUST3275,0.0
3,TXN1000003,2023-10-01,CUST6410,0.0
4,TXN1000004,2023-10-04,CUST10619,0.0


In [73]:
# Correct the date handling in the transaction generation function
def generate_final_transactions(month, num_users, total_revenue):
    transactions = []
    start_date = pd.Timestamp(month)
    end_date = start_date + pd.offsets.MonthEnd()
    dates = pd.date_range(start_date, end_date, freq='D')

    # Distribute users evenly across available days
    user_dates = np.random.choice(dates, size=int(num_users), replace=True)

    # Generate high and low revenues
    high_value_ratio = 0.05  # Approximately 5% high value transactions
    high_value_count = int(num_users * high_value_ratio)
    low_value_count = num_users - high_value_count
    high_revenues = np.random.uniform(100, 200, size=high_value_count)
    low_revenues = np.random.uniform(4, 5, size=low_value_count)

    # Calculate total initial revenue and adjust if necessary
    total_initial_revenue = high_revenues.sum() + low_revenues.sum()
    revenue_difference = total_revenue - total_initial_revenue

    # Adjust low revenues to fit the total revenue requirement
    if revenue_difference < 0:
        # Scale down low revenues proportionally if total is exceeded
        scale_factor = (low_revenues.sum() + revenue_difference) / low_revenues.sum()
        low_revenues *= max(scale_factor, 0)  # Ensure scale factor is not negative
        low_revenues = np.maximum(low_revenues, 4)  # Ensure no values fall below $4
    else:
        # Proportionally increase low revenues if there is a deficit
        scale_factor = (low_revenues.sum() + revenue_difference) / low_revenues.sum()
        low_revenues *= scale_factor

    # Ensure combined revenues match exactly
    final_low_revenue_sum = low_revenues.sum()
    final_adjustment_needed = total_revenue - (high_revenues.sum() + final_low_revenue_sum)
    low_revenues[-1] += final_adjustment_needed  # Adjust the last transaction to perfectly match the total

    # Combine and shuffle revenues
    all_revenues = np.concatenate([high_revenues, low_revenues])
    np.random.shuffle(all_revenues)

    # Generate unique IDs for transactions and customers
    customer_ids = [f"CUST{np.random.randint(1000, 9999)}" for _ in range(num_users)]
    transaction_ids = [f"TXN{np.random.randint(10000, 99999)}" for _ in range(num_users)]

    for i in range(num_users):
        transactions.append({
            "transaction_id": transaction_ids[i],
            "date": user_dates[i].astype('datetime64[D]').astype(str),  # Correct date formatting
            "customer_id": customer_ids[i],
            "net_revenue_earned": np.round(all_revenues[i], 2)
        })

    return transactions
valid_months = []
valid_user_counts = []
total_revenues = []
for month, total_value in total_transactions.items():
    if total_value != '-':
        if not isnan(float(total_value)) :
            valid_months.append(month)
            valid_user_counts.append(total_users[month])
            total_revenues.append(float(total_value))
# Regenerate transaction data for all valid months with correct total revenue
all_fixed_transactions = []
for month, users, revenue in zip(valid_months, valid_user_counts, map(float, total_revenues)):
    month_transactions = generate_final_transactions(month, int(users), revenue)
    all_fixed_transactions.extend(month_transactions)

# Convert the list of transaction dictionaries to a DataFrame
df_fixed_transactions = pd.DataFrame(all_fixed_transactions)

# Preview the fixed transactions DataFrame
df_fixed_transactions.head(), df_fixed_transactions.describe()


(  transaction_id        date customer_id  net_revenue_earned
 0       TXN49224  2023-10-20    CUST3599                4.00
 1       TXN66317  2023-10-02    CUST2286              197.18
 2       TXN52551  2023-10-29    CUST1052                4.00
 3       TXN70532  2023-10-01    CUST8620                4.00
 4       TXN52102  2023-10-23    CUST4507                4.00,
        net_revenue_earned
 count        7.693100e+05
 mean         4.043285e-01
 std          4.955760e+03
 min         -3.348604e+06
 25%          4.000000e+00
 50%          4.000000e+00
 75%          4.000000e+00
 max          2.000000e+02)

: 

In [70]:
df_fixed_transactions["net_revenue_earned"].sum()

8699996.140000002

In [11]:
from datetime import timedelta
import numpy as np

# Generate random transactions respecting monthly targets
total_entries = 1202414
amount_options = [4.89, 10.18, 25.27, 50.15]

# Create empty DataFrame for new transactions
new_transactions_df = pd.DataFrame(columns=["transaction_id", "date", "customer_id", "amount"])

# Month start dates for generating transactions
months = pd.to_datetime(['2023-12-01', '2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01'])

# Data from summary
monthly_users = [16723,48723,78347,128994,197853]
monthly_values = [128950,349060,503987.28,1286496.95,1739505.63]

# Helper function to generate random dates within a month
def generate_dates(month, number_of_days, n):
    start_date = month
    end_date = month + timedelta(days=number_of_days)
    date_range = pd.date_range(start_date, end_date, freq='D')
    return np.random.choice(date_range, n)

# Generate data month by month
for month, users, total_value in zip(months, monthly_users, monthly_values):
    # Calculate how many transactions per user this month (randomly decide)
    transactions_per_user = np.random.randint(1, 4, users)
    total_transactions = transactions_per_user.sum()
    
    # Generate transaction_ids
    transaction_ids = [f"TIN{np.random.randint(100000, 999999)}" for _ in range(total_transactions)]
    
    # Generate dates for transactions within the month
    dates = generate_dates(month, 30, total_transactions)
    
    # Generate customer_ids
    customer_ids = np.repeat([f"CUST{np.random.randint(1000, 9999)}" for _ in range(users)], transactions_per_user)
    
    # Generate amounts
    amount_prob = np.random.dirichlet(np.ones(len(amount_options)) * 10, size=1).flatten()  # Slightly random probabilities
    amounts = np.random.choice(amount_options, size=total_transactions, p=amount_prob)
    
    # Make sure the total sums up to at least the monthly target value
    while amounts.sum() < total_value:
        # Top up amounts if they fall short of the expected value
        additional_amounts = np.random.choice(amount_options, size=500, p=amount_prob)  # Random top-up
        amounts = np.append(amounts, additional_amounts)
        additional_dates = generate_dates(month, 30, 500)
        dates = np.append(dates, additional_dates)
        additional_customer_ids = np.random.choice(customer_ids, 500)
        customer_ids = np.append(customer_ids, additional_customer_ids)
    
    # Create DataFrame for the month
    month_transactions_df = pd.DataFrame({
        "transaction_id": transaction_ids,
        "date": dates,
        "customer_id": customer_ids,
        "amount": amounts
    })
    
    # Append to the main DataFrame
    new_transactions_df = pd.concat([new_transactions_df, month_transactions_df], ignore_index=True)

new_transactions_df.head()


Unnamed: 0,transaction_id,date,customer_id,amount
0,TIN407235,2023-12-26,CUST4487,4.89
1,TIN440200,2023-12-22,CUST4487,50.15
2,TIN796697,2023-12-20,CUST2838,25.27
3,TIN913279,2023-12-05,CUST2838,25.27
4,TIN432200,2023-12-07,CUST3443,10.18


In [14]:
# Adjust amounts to exactly match the required total transactional value for each month
for month, total_value in zip(months, monthly_values):
    month_mask = new_transactions_df['date'].dt.month == month.month
    month_indices = new_transactions_df[month_mask].index
    
    current_sum = new_transactions_df.loc[month_indices, 'amount'].sum()
    difference = total_value - current_sum
    
    # Adjusting the amounts
    while abs(difference) > 0.01:  # Use a small threshold to avoid floating-point precision issues
        if difference > 0:
            # Need to increase the sum
            possible_indices = month_indices[new_transactions_df.loc[month_indices, 'amount'] < max(amount_options)]
        else:
            # Need to decrease the sum
            possible_indices = month_indices[new_transactions_df.loc[month_indices, 'amount'] > min(amount_options)]
        
        # If no possible indices are available to adjust, break to avoid infinite loop
        if len(possible_indices) == 0:
            break
        
        # Select random index to adjust
        random_index = np.random.choice(possible_indices)
        current_amount = new_transactions_df.loc[random_index, 'amount']
        if difference > 0:
            # Try to increase to the next higher amount if possible
            higher_amounts = [amt for amt in amount_options if amt > current_amount]
            if higher_amounts:  # There is a higher amount available
                new_amount = min(higher_amounts, key=lambda x: abs(x - current_amount - difference))
            else:
                new_amount = current_amount
        else:
            # Try to decrease to the next lower amount if possible
            lower_amounts = [amt for amt in amount_options if amt < current_amount]
            if lower_amounts:  # There is a lower amount available
                new_amount = max(lower_amounts, key=lambda x: abs(x - current_amount + difference))
            else:
                new_amount = current_amount
        
        # Update the amount
        new_transactions_df.loc[random_index, 'amount'] = new_amount
        new_difference = total_value - new_transactions_df.loc[month_indices, 'amount'].sum()
        
        # Check if the adjustment is not effective
        if abs(new_difference) >= abs(difference):
            # Revert change if it doesn't help
            new_transactions_df.loc[random_index, 'amount'] = current_amount
            break  # Break to avoid endless loop if no effective adjustment can be made
        else:
            difference = new_difference

# Verify the final adjustments
new_transactions_df.head()


KeyboardInterrupt: 

In [23]:
#grooup the new transactions by month and sum the amounts
new_transactions_df["month"] = new_transactions_df["date"].dt.to_period('M')
grouped_by_month = new_transactions_df.groupby("month")["amount"].count()
print(grouped_by_month)

month
2023-12     33562
2024-01     97650
2024-02    146351
2024-03    268284
2024-04    382720
2024-05     12680
Freq: M, Name: amount, dtype: int64


In [50]:
import pandas as pd

# Load the existing transaction data to analyze the distribution
file_path = 'transactions.csv'
existing_transactions = pd.read_csv(file_path)

# Display the first few rows of the dataframe and describe it to understand the structure and content
existing_transactions.head()


Unnamed: 0,r,2023-12-01 00:00:00,2024-01-01 00:00:00,2024-02-01 00:00:00,2024-03-01 00:00:00,2024-04-01 00:00:00,Total,totall
0,Total Transactional Value,349060.0,503987.28,1486496.95,1897401.63,3220567.89,44444,
1,Total Users,16723.0,48723.0,78347.0,128994.0,197853.0,308943,


In [124]:
# Remove non-date entries and convert the index properly to datetime
monthly_data = existing_transactions.transpose()
monthly_data = monthly_data.drop(columns='Total', errors='ignore')  # Drop 'Total' if present
monthly_data = monthly_data.drop(columns='totall', errors='ignore')  # Drop 'Total' if present
monthly_data.index = pd.to_datetime(monthly_data.index, errors='coerce')  # Coerce errors will handle any non-date entries
#drop last two rows
monthly_data = monthly_data.iloc[:-2]
# Displaying the updated data
#make the first as the column name
monthly_data.columns = monthly_data.iloc[0]
monthly_data= monthly_data.iloc[1:]
monthly_data


  monthly_data.index = pd.to_datetime(monthly_data.index, errors='coerce')  # Coerce errors will handle any non-date entries


NaT,Total Transactional Value,Total Users
2023-12-01,349060.0,16723.0
2024-01-01,503987.28,48723.0
2024-02-01,1486496.95,78347.0
2024-03-01,1897401.63,128994.0
2024-04-01,3220567.89,197853.0


In [125]:
monthly_data.index

DatetimeIndex(['2023-12-01', '2024-01-01', '2024-02-01', '2024-03-01',
               '2024-04-01'],
              dtype='datetime64[ns]', freq=None)

In [126]:
import numpy as np

# Defining parameters for data generation
total_entries = 1202414  # Total number of transactions to generate
monthly_indices = monthly_data.index.dropna()  # Only valid date indices
transaction_values = [i+0.5 for i in range(60)]  # Possible transaction amounts

# Initialize the DataFrame to store generated data
generated_transactions = pd.DataFrame(columns=['transaction_id', 'date', 'customer_id', 'amount'])

# Helper function to generate transaction IDs
def generate_transaction_id(n):
    return ["TIN{:05d}".format(i) for i in range(n)]

# Generate data for each month
for date in monthly_indices:
    num_users = int(monthly_data.loc[date, '16723'])  # Number of users for the month
    monthly_value = monthly_data.loc[date, '128950']  # Total transaction value for the month
    num_transactions = int(np.random.uniform(500, 600) + monthly_value / np.mean(transaction_values))  # Adjust number of transactions
    
    # Generate transaction data for the month
    transactions = pd.DataFrame({
        'transaction_id': generate_transaction_id(num_transactions),
        'date': [date] * num_transactions,
        'customer_id': ['CUST' + str(np.random.randint(1, num_users + 1)).zfill(5) for _ in range(num_transactions)],
        'amount': np.random.choice(transaction_values, num_transactions, p=[0.25, 0.25, 0.25, 0.25])
    })
    
    # Append to the main DataFrame
    generated_transactions = pd.concat([generated_transactions, transactions])

# Reset index of the generated DataFrame
generated_transactions.reset_index(drop=True, inplace=True)

# Display a snapshot of the generated data
generated_transactions.head(), generated_transactions.shape


KeyError: '16723'

In [127]:
# Correcting the column names for clarity and re-running the data generation with the correct keys
monthly_data.columns = ['total_value', 'total_users']  # Renaming columns for better readability

# Generate data for each month with corrected column names
generated_transactions = pd.DataFrame(columns=['transaction_id', 'date', 'customer_id', 'amount'])

# Generate data for each month
for date in monthly_indices:
    num_users = int(monthly_data.loc[date, 'total_users'])  # Number of users for the month
    monthly_value = monthly_data.loc[date, 'total_value']  # Total transaction value for the month
    num_transactions = int(np.random.uniform(500, 600) + monthly_value / np.mean(transaction_values))  # Adjust number of transactions
    
    # Generate transaction data for the month
    transactions = pd.DataFrame({
        'transaction_id': generate_transaction_id(num_transactions),
        'date': [date] * num_transactions,
        'customer_id': ['CUST' + str(np.random.randint(1, num_users + 1)).zfill(5) for _ in range(num_transactions)],
        'amount': np.random.choice(transaction_values, num_transactions)
    })
    
    # Append to the main DataFrame
    generated_transactions = pd.concat([generated_transactions, transactions])

# Reset index of the generated DataFrame
generated_transactions.reset_index(drop=True, inplace=True)

# Display a snapshot of the generated data
generated_transactions.head(), generated_transactions.shape


(  transaction_id       date customer_id  amount
 0       TIN00000 2023-12-01   CUST08188    21.5
 1       TIN00001 2023-12-01   CUST05474    36.5
 2       TIN00002 2023-12-01   CUST00017    38.5
 3       TIN00003 2023-12-01   CUST01017    56.5
 4       TIN00004 2023-12-01   CUST11372    51.5,
 (251278, 4))

In [128]:
# Adjust the amount generation to match monthly totals exactly with some variability allowed

def generate_custom_transactions(monthly_value, num_transactions, possible_values, max_custom=600):
    """ Generate transaction amounts that sum to a specific monthly value with some custom adjustments allowed """
    # Start with a random distribution of the predefined values
    amounts = np.random.choice(possible_values, num_transactions)
    current_total = amounts.sum()
    
    # Calculate the difference to match the exact total
    difference = monthly_value - current_total
    
    # Adjust the amounts to close the gap with up to 'max_custom' custom values
    adjustments = min(max_custom, num_transactions)
    adjustment_indices = np.random.choice(range(num_transactions), adjustments, replace=False)
    
    # Apply a simple increment to match the difference evenly across the adjustments
    for idx in adjustment_indices:
        # Calculate per transaction adjustment without exceeding the difference
        per_adjustment = difference / adjustments
        amounts[idx] += per_adjustment
        difference -= per_adjustment
        adjustments -= 1
    
    return amounts

# Regenerate the transactions with exact monthly totals
generated_transactions = pd.DataFrame(columns=['transaction_id', 'date', 'customer_id', 'amount'])

for date in monthly_indices:
    num_users = int(monthly_data.loc[date, 'total_users'])
    monthly_value = monthly_data.loc[date, 'total_value']
    num_transactions = int(monthly_value / np.mean(transaction_values) + np.random.uniform(500, 600))  # Number of transactions based on average amount

    # Generate amounts with the exact total matching
    amounts = generate_custom_transactions(monthly_value, num_transactions, transaction_values)
    
    transactions = pd.DataFrame({
        'transaction_id': generate_transaction_id(num_transactions),
        'date': [date] * num_transactions,
        'customer_id': ['CUST' + str(np.random.randint(1, num_users + 1)).zfill(5) for _ in range(num_transactions)],
        'amount': amounts
    })
    
    generated_transactions = pd.concat([generated_transactions, transactions])

generated_transactions.reset_index(drop=True, inplace=True)

# Display a snapshot and summary of the data to ensure the totals are correct
generated_transactions.groupby(generated_transactions['date'].dt.month).agg({'amount': ['sum', 'count']})


Unnamed: 0_level_0,amount,amount
Unnamed: 0_level_1,sum,count
date,Unnamed: 1_level_2,Unnamed: 2_level_2
1,503987.28,17308
2,1486496.95,50073
3,1897401.63,63803
4,3220567.89,107886
12,349060.0,12140


In [129]:
# Calculate the proportion of transactions for each month based on the amount and distribute the total number of entries accordingly
total_transactions = 1202414
monthly_proportions = monthly_data['total_value'] / monthly_data['total_value'].sum()
monthly_transaction_counts = (monthly_proportions * total_transactions).astype(int)

# Adjusting so that the sum of all monthly transactions matches the exact total required
difference = total_transactions - monthly_transaction_counts.sum()
monthly_transaction_counts.iloc[-1] += difference  # Adjust the last month to make up any rounding difference

# Regenerate the transactions with adjusted counts
generated_transactions = pd.DataFrame(columns=['transaction_id', 'date', 'customer_id', 'amount'])

for date, num_transactions in monthly_transaction_counts.items():
    num_users = int(monthly_data.loc[date, 'total_users'])
    monthly_value = monthly_data.loc[date, 'total_value']
    
    # Generate amounts to exactly match the monthly total value
    amounts = generate_custom_transactions(monthly_value, num_transactions, transaction_values)
    
    transactions = pd.DataFrame({
        'transaction_id': generate_transaction_id(num_transactions),
        'date': [date] * num_transactions,
        'customer_id': ['CUST' + str(np.random.randint(1, num_users + 1)).zfill(5) for _ in range(num_transactions)],
        'amount': amounts
    })
    
    generated_transactions = pd.concat([generated_transactions, transactions])

generated_transactions.reset_index(drop=True, inplace=True)

# Display final verification of amounts and counts to ensure correctness
generated_transactions.groupby(generated_transactions['date'].dt.month).agg({'amount': ['sum'], 'transaction_id': ['count']})


Unnamed: 0_level_0,amount,transaction_id
Unnamed: 0_level_1,sum,count
date,Unnamed: 1_level_2,Unnamed: 2_level_2
1,503987.28,81260
2,1486496.95,239675
3,1897401.63,305928
4,3220567.89,519271
12,349060.0,56280


In [130]:
# Check and handle any NaN or infinite values in the total_value column
if monthly_data['total_value'].isna().any() or np.isinf(monthly_data['total_value']).any():
    monthly_data = monthly_data.dropna(subset=['total_value'])  # Drop any rows where total_value is NaN or inf

# Recalculate the proportion of transactions for each month
monthly_proportions = monthly_data['total_value'] / monthly_data['total_value'].sum()
monthly_transaction_counts = (monthly_proportions * total_transactions).astype(int)

# Ensure the sum of monthly transaction counts equals the required total by adjusting the last month
difference = total_transactions - monthly_transaction_counts.sum()
monthly_transaction_counts.iloc[-1] += difference

# Verify the adjustments and the sum of transactions
monthly_transaction_counts.sum(), monthly_transaction_counts


TypeError: ufunc 'isinf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [131]:
# Regenerate the transactions with the exact transaction counts and ensure total value matches
generated_transactions = pd.DataFrame(columns=['transaction_id', 'date', 'customer_id', 'amount'])

for date, num_transactions in monthly_transaction_counts.items():
    num_users = int(monthly_data.loc[date, 'total_users'])
    monthly_value = monthly_data.loc[date, 'total_value']
    
    # Generate transaction amounts that exactly match the total monthly value
    amounts = generate_custom_transactions(monthly_value, num_transactions, transaction_values)
    
    transactions = pd.DataFrame({
        'transaction_id': generate_transaction_id(num_transactions),
        'date': [date] * num_transactions,
        'customer_id': ['CUST' + str(np.random.randint(1, num_users + 1)).zfill(5) for _ in range(num_transactions)],
        'amount': amounts
    })
    
    # Append to the main DataFrame
    generated_transactions = pd.concat([generated_transactions, transactions], ignore_index=True)

# Verify the final counts and totals
verification = generated_transactions.groupby(generated_transactions['date'].dt.month).agg({
    'amount': 'sum', 
    'transaction_id': 'count'
})
verification, generated_transactions.shape


(          amount  transaction_id
 date                            
 1      503987.28           81260
 2     1486496.95          239675
 3     1897401.63          305928
 4     3220567.89          519271
 12     349060.00           56280,
 (1202414, 4))

In [132]:
# Function to generate adjusted transactions ensuring user count and total value are matched
def generate_adjusted_transactions(date, num_users, num_transactions, monthly_value, transaction_values):
    # Generate unique customer IDs to match the exact number of users
    customer_ids = ['CUST' + str(i).zfill(5) for i in range(1, num_users + 1)]
    if num_transactions > num_users:  # If more transactions than users, repeat some customer IDs
        extra_ids = ['CUST' + str(np.random.randint(1, num_users + 1)).zfill(5) for _ in range(num_transactions - num_users)]
        customer_ids.extend(extra_ids)
    np.random.shuffle(customer_ids)  # Shuffle to randomize transaction ownership
    
    # Generate transaction amounts with exactly matching total value
    amounts = generate_custom_transactions(monthly_value, num_transactions, transaction_values)
    
    # Generate transaction IDs
    transaction_ids = generate_transaction_id(num_transactions)
    
    # Create the transaction DataFrame
    transactions = pd.DataFrame({
        'transaction_id': transaction_ids,
        'date': [date] * num_transactions,
        'customer_id': customer_ids[:num_transactions],  # Ensure list is not out of bounds
        'amount': amounts
    })
    
    return transactions

# Regenerate the entire set of transactions with adjusted user counts
adjusted_transactions = pd.DataFrame(columns=['transaction_id', 'date', 'customer_id', 'amount'])

for date, num_transactions in monthly_transaction_counts.items():
    num_users = int(monthly_data.loc[date, 'total_users'])
    monthly_value = monthly_data.loc[date, 'total_value']
    
    # Generate the adjusted transactions for the month
    transactions = generate_adjusted_transactions(date, num_users, num_transactions, monthly_value, transaction_values)
    
    # Append to the main DataFrame
    adjusted_transactions = pd.concat([adjusted_transactions, transactions], ignore_index=True)

# Verification of the adjusted transactions
verification_adjusted = adjusted_transactions.groupby(adjusted_transactions['date'].dt.to_period("M")).agg({
    'amount': 'sum', 
    'customer_id': pd.Series.nunique
})

verification_adjusted, adjusted_transactions.shape


(             amount  customer_id
 date                            
 2023-12   349060.00        16723
 2024-01   503987.28        48723
 2024-02  1486496.95        78347
 2024-03  1897401.63       128994
 2024-04  3220567.89       197853,
 (1202414, 4))

In [133]:
# Calculate unique users per month in the generated transactions
unique_users_per_month = adjusted_transactions.groupby(adjusted_transactions['date'].dt.to_period("M"))['customer_id'].nunique()

# Retrieve the original expected number of users per month from the monthly_data for comparison
original_users_per_month = monthly_data['total_users']

# Create a comparison DataFrame
user_verification = pd.DataFrame({
    'Original Users': original_users_per_month,
    'Generated Unique Users': unique_users_per_month
})

user_verification


Unnamed: 0,Original Users,Generated Unique Users
2023-12-01 00:00:00,16723.0,
2024-01-01 00:00:00,48723.0,
2024-02-01 00:00:00,78347.0,
2024-03-01 00:00:00,128994.0,
2024-04-01 00:00:00,197853.0,
2023-12,,16723.0
2024-01,,48723.0
2024-02,,78347.0
2024-03,,128994.0
2024-04,,197853.0


In [134]:
adjusted_transactions[adjusted_transactions["amount"]<0]

Unnamed: 0,transaction_id,date,customer_id,amount
42,TIN00042,2023-12-01,CUST15403,-2218.878333
98,TIN00098,2023-12-01,CUST12361,-2177.878333
259,TIN00259,2023-12-01,CUST05855,-2182.878333
269,TIN00269,2023-12-01,CUST03487,-2182.878333
322,TIN00322,2023-12-01,CUST01501,-2183.878333
...,...,...,...,...
1197451,TIN514308,2024-04-01,CUST138638,-20571.251017
1197869,TIN514726,2024-04-01,CUST116597,-20558.251017
1200040,TIN516897,2024-04-01,CUST134163,-20537.251017
1201077,TIN517934,2024-04-01,CUST189385,-20523.251017


In [135]:
verification_adjusted = adjusted_transactions.groupby(adjusted_transactions['date'].dt.to_period("M")).agg({
    'amount': 'sum', 
    'customer_id': pd.Series.nunique
})

verification_adjusted

Unnamed: 0_level_0,amount,customer_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-12,349060.0,16723
2024-01,503987.28,48723
2024-02,1486496.95,78347
2024-03,1897401.63,128994
2024-04,3220567.89,197853


In [137]:

def rebalance_transactions(transactions, monthly_value):
    # Ensure no negative amounts and prepare for rebalancing
    transactions['amount'] = transactions['amount'].apply(lambda x: max(x, 1.3))  # Replace negative values with a minimum amount
    current_total = transactions['amount'].sum()

    # Calculate the difference to be adjusted
    difference = monthly_value - current_total

    # Distribute the difference across transactions proportionally, avoiding negatives
    adjustments = (transactions['amount'] / current_total) * difference
    transactions['amount'] += adjustments

    # Ensure no negative values after adjustment
    transactions['amount'] = transactions['amount'].apply(lambda x: max(x, 1.3))
    
    return transactions

# Apply rebalancing to each month's transactions
for date in monthly_transaction_counts.index:
    # Filter transactions for the specific month
    month_mask = (adjusted_transactions['date'].dt.to_period("M") == date.to_period("M"))
    monthly_transactions = adjusted_transactions[month_mask]
    monthly_value = monthly_data.loc[date, 'total_value']

    # Rebalance the transactions for this month
    rebalanced_transactions = rebalance_transactions(monthly_transactions.copy(), monthly_value)

    # Update the transactions in the main DataFrame
    adjusted_transactions.loc[month_mask, 'amount'] = rebalanced_transactions['amount']

# Final verification of adjustments
final_verification = adjusted_transactions.groupby(adjusted_transactions['date'].dt.to_period("M")).agg({
    'amount': ['sum', 'min', 'max'],  # Check sums and that there are no negative values
    'customer_id': pd.Series.nunique  # Ensure user count remains correct
})

final_verification


Unnamed: 0_level_0,amount,amount,amount,customer_id
Unnamed: 0_level_1,sum,min,max,nunique
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2023-12,349060.0,2.220886,11.96135,16723
2024-01,503987.28,2.222665,11.940096,48723
2024-02,1486496.95,2.225118,11.911619,78347
2024-03,1897401.63,2.225592,11.903125,128994
2024-04,3220567.89,2.225847,11.910366,197853


In [110]:
# go throught the adjusted dataframe if the amount is 0.01 add 3 to it and substract from an the next amount taht is >4
for i in range(len(adjusted_transactions)):
    if adjusted_transactions.loc[i,"amount"]==0.01:
        adjusted_transactions.loc[i,"amount"]=3
        for j in range(i+1,len(adjusted_transactions)):
            if adjusted_transactions.loc[j,"amount"]>4:
                adjusted_transactions.loc[j,"amount"]-=3
                break

In [77]:
adjusted_transactions.to_csv("adjusted_transactions.csv", index=False)

In [138]:
import string
import random

# Function to generate a random alphanumeric string of a given length
def generate_random_string(length):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Function to update the customer and card IDs in the transactions
def update_ids(transactions, id_length, customer_prefix, card_prefix):
    transactions['customer_id'] = [customer_prefix + generate_random_string(id_length) for _ in range(len(transactions))]
    transactions['card_id'] = [card_prefix + generate_random_string(id_length) for _ in range(len(transactions))]
    return transactions

# Define the ID patterns from the provided examples
customer_id_prefix = 'ch_'
card_id_prefix = 'pm_'
random_string_length = 24  # Approximate length from the examples

# Apply the ID updates to the entire DataFrame of adjusted transactions
updated_transactions = update_ids(adjusted_transactions.copy(), random_string_length, customer_id_prefix, card_id_prefix)

# Display a snapshot of the updated transactions to confirm the changes
updated_transactions.head()
final_verification_updated = updated_transactions.groupby(updated_transactions['date'].dt.to_period("M")).agg({
    'amount': ['sum', 'min', 'count'], 
    'customer_id': 'nunique'
})



In [139]:
final_verification_updated

Unnamed: 0_level_0,amount,amount,amount,customer_id
Unnamed: 0_level_1,sum,min,count,nunique
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2023-12,349060.0,2.220886,56280,56280
2024-01,503987.28,2.222665,81260,81260
2024-02,1486496.95,2.225118,239675,239675
2024-03,1897401.63,2.225592,305928,305928
2024-04,3220567.89,2.225847,519271,519271


In [140]:
# add a column net transaction which is 10 percent of amount
updated_transactions["net_transaction"] = updated_transactions["amount"]*0.10 


In [144]:
#find top 10 present amount values
top_10_amounts = updated_transactions["amount"].value_counts().head(10)
top_10_amounts

amount
2.225847     95864
2.225592     56441
2.225118     44371
2.222665     15403
2.220886     10801
11.109669     8816
4.103571      8794
11.910366     8761
5.304617      8759
8.907753      8755
Name: count, dtype: int64

In [145]:
updated_transactions.to_csv("updated_transactions.csv", index=False)

In [109]:
# Set the minimum transaction amount to $2.00
minimum_amount = 5.00

# Function to rebalance the amounts ensuring each transaction is at least $2.00 and monthly totals are preserved
def enforce_minimum_and_rebalance(transactions, monthly_value):
    # Calculate the total that needs to be redistributed to ensure all transactions are at least $2.00
    shortfall = transactions[transactions['amount'] < minimum_amount]['amount'].count() * (minimum_amount - transactions['amount'].min())
    excess = monthly_value - transactions['amount'].sum() + shortfall
    
    # Transactions eligible for reduction (those greater than $2.00)
    eligible_for_reduction = transactions['amount'] > minimum_amount

    # Reduce the eligible transactions proportionally
    transactions.loc[eligible_for_reduction, 'amount'] += (transactions.loc[eligible_for_reduction, 'amount'] / transactions.loc[eligible_for_reduction, 'amount'].sum()) * excess

    # Ensure no transaction is below the minimum amount
    transactions['amount'] = transactions['amount'].clip(lower=minimum_amount)

    return transactions

# Apply the rebalancing to each month's transactions
for date in monthly_indices:
    # Filter transactions for the specific month
    month_transactions = updated_transactions[updated_transactions['date'] == date]
    monthly_value = monthly_data.loc[date, 'total_value']

    # Rebalance the transactions for this month
    updated_transactions.loc[updated_transactions['date'] == date, 'amount'] = enforce_minimum_and_rebalance(month_transactions, monthly_value)['amount']

# Final verification
final_verification_updated = updated_transactions.groupby(updated_transactions['date'].dt.to_period("M")).agg({
    'amount': ['sum', 'min', 'count'], 
    'customer_id': 'nunique'
})

final_verification_updated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions['amount'] = transactions['amount'].clip(lower=minimum_amount)


Unnamed: 0_level_0,amount,amount,amount,customer_id
Unnamed: 0_level_1,sum,min,count,nunique
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2023-12,460246.1,5.0,56280,56280
2024-01,664253.7,5.0,81260,81260
2024-02,1957051.0,5.0,239675,239675
2024-03,2496461.0,5.0,305928,305928
2024-04,4235481.0,5.0,519271,519271


In [146]:
# change the day to a random day in the date column while keeping the same month 
updated_transactions["date"] = updated_transactions["date"].apply(lambda x: x.replace(day=np.random.randint(1, 28)))

In [148]:
updated_transactions.to_csv("final_transactions.csv", index=False)