In [35]:
import pandas as pd
import os
import numpy as np

In [37]:
# The four 2023 billing extracts that together make up the full year.
# Forward slashes are used throughout, so plain (non-raw) literals are sufficient.
files = [
    "C:/Users/PC/Documents/Water datasets/2023/2023 OTHERS.csv",
    "C:/Users/PC/Documents/Water datasets/2023/2023 WA10 AND WA13.csv",
    "C:/Users/PC/Documents/Water datasets/2023/2023 WA60-WA64.csv",
    "C:/Users/PC/Documents/Water datasets/2023/2023 WA70.csv",
]

In [39]:
# Report, one line per input, whether the path is present on disk.
for f in files:
    if os.path.exists(f):
        print(f, "✅")
    else:
        print(f, "❌ File not found")

C:/Users/PC/Documents/Water datasets/2023/2023 OTHERS.csv ✅
C:/Users/PC/Documents/Water datasets/2023/2023 WA10 AND WA13.csv ✅
C:/Users/PC/Documents/Water datasets/2023/2023 WA60-WA64.csv ✅
C:/Users/PC/Documents/Water datasets/2023/2023 WA70.csv ✅


In [41]:
# Load every extract with pandas' default CSV settings, keeping the order of `files`.
dfs = list(map(pd.read_csv, files))

In [43]:
# Stack the extracts row-wise into one frame with a fresh 0..n-1 index.
combined2023 = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [45]:
# Persist the raw, un-deduplicated concatenation for reference.
combined2023.to_excel(
    r"C:/Users/PC/Documents/Water datasets/2023/combined2023.xlsx",
    index=False,
    sheet_name="Combined2023",
)

In [46]:
# Check the exact column names as loaded from the CSV headers.
# Later cells select columns by these literal labels, so spelling and case must match
# what is printed here.
print(combined2023.columns)

Index(['ACCOUNT_NO', 'ACCOUNT_NAME', 'TOWNSHIP', 'METER_NO', 'METER_STATUS',
       'TARIFF CODE', 'JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE',
       'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBR', 'DECEMBER',
       'TOTAL'],
      dtype='object')


In [47]:
# Columns to keep, in output order: identifiers first, then the twelve month
# columns, then the yearly total. Labels must match the loaded headers exactly.
new_order = [
    "ACCOUNT_NO", "TOWNSHIP", "METER_NO", "METER_STATUS", "TARIFF CODE",
    "JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE",
    "JULY", "AUGUST", "SEPTEMBER", "OCTOBER", "NOVEMBR", "DECEMBER",
    "TOTAL",
]

In [48]:
# Keep only the columns listed in new_order, in that order (all rows retained).
combined2023 = combined2023.loc[:, new_order]

In [53]:
# Every row whose METER_NO occurs more than once (keep=False marks all occurrences,
# not just the repeats).
duplicates = combined2023.loc[combined2023['METER_NO'].duplicated(keep=False)]

In [55]:
# Order by meter number so rows sharing a meter sit next to each other.
duplicates = duplicates.sort_values(by=['METER_NO'], ascending=True)

In [57]:
# How many distinct meters are affected, followed by a preview of the first 20 rows.
print(f"Total duplicated meter numbers: {duplicates['METER_NO'].nunique()}")
duplicates.iloc[:20]

Total duplicated meter numbers: 2625


Unnamed: 0,ACCOUNT_NO,TOWNSHIP,METER_NO,METER_STATUS,TARIFF CODE,JANUARY,FEBRUARY,MARCH,APRIL,MAY,JUNE,JULY,AUGUST,SEPTEMBER,OCTOBER,NOVEMBR,DECEMBER,TOTAL
1934,10283404,WINDHOEK/WINDHOEK BLOCKS,HZN596,INACTIVE,WA12,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1935,10283404,WINDHOEK/WINDHOEK BLOCKS,HZN596,INACTIVE,WA12,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1937,13130277,LAFRENZ,HZN766,INACTIVE,WA20,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1936,13130277,LAFRENZ,HZN766,INACTIVE,WA20,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1939,13006487,PROSPERITA,HZN769,INACTIVE,WAFHC,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1938,13006487,PROSPERITA,HZN769,INACTIVE,WAFHC,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1940,10256148,PIONIERSPARK,HZN770,INACTIVE,WA22,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1941,10256148,PIONIERSPARK,HZN770,INACTIVE,WA22,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1953,10839488,KLEIN WINDHOEK/KLEIN WINDHOEK BLOCKS/LUDWIGSDORF,PZN393,INACTIVE,WAFHNV,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0
1954,10839488,KLEIN WINDHOEK/KLEIN WINDHOEK BLOCKS/LUDWIGSDORF,PZN393,INACTIVE,WAFHNV,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0


In [59]:
# The twelve monthly consumption columns, in calendar order, spelled as in the data.
months_12 = [
    "JANUARY", "FEBRUARY", "MARCH", "APRIL",
    "MAY", "JUNE", "JULY", "AUGUST",
    "SEPTEMBER", "OCTOBER", "NOVEMBR", "DECEMBER",
]

In [61]:
# Restrict the month list to columns that are actually present, then force those
# columns to numbers: anything that cannot be parsed becomes NaN and is replaced by 0.
months_12 = [month for month in months_12 if month in combined2023]
combined2023[months_12] = (
    combined2023[months_12]
    .apply(lambda col: pd.to_numeric(col, errors='coerce'))
    .fillna(0)
)

In [63]:
# Rule:
# 1️⃣ If METER_NO appears more than once with the SAME TARIFF CODE → keep first
# 2️⃣ If METER_NO appears with DIFFERENT TARIFF CODEs → sum month values
#     and keep the TARIFF CODE with the most non-zero months

In [65]:
# --- Step 1: find single- vs multi-tariff meters
# Count the distinct tariff codes seen for each meter. nunique() ignores missing
# values, so a meter whose TARIFF CODE is blank on every row has a count of 0.
# Such meters are treated as single-tariff (<= 1) so they are de-duplicated in
# Step 2 rather than silently falling out of both groups and vanishing from the result.
# NOTE(review): groupby skips rows whose METER_NO itself is missing, so those rows
# belong to neither group — confirm that dropping them is intended.
tariff_counts = combined2023.groupby('METER_NO')['TARIFF CODE'].nunique()
single_meters = tariff_counts[tariff_counts <= 1].index
multi_meters = tariff_counts[tariff_counts > 1].index

In [67]:
# --- Step 2: handle single-tariff meters
# Rule 1: when a meter repeats under the same tariff, keep its first row.
# The sort must be stable: pandas' default single-column sort (quicksort) may reorder
# rows that share a METER_NO, which would make "first" an arbitrary row instead of
# the first one in file order.
single_df = (
    combined2023[combined2023['METER_NO'].isin(single_meters)]
    .sort_values(['METER_NO'], kind='stable')
    .drop_duplicates(subset=['METER_NO'], keep='first')
)

In [69]:
# --- Step 3: handle multi-tariff meters
# Work on an independent copy of the rows belonging to multi-tariff meters and
# record, per row, how many month columns hold a value greater than zero.
multi_df = combined2023.loc[combined2023['METER_NO'].isin(multi_meters)].copy()
multi_df['_nonzero_months'] = (multi_df[months_12] > 0).sum(axis=1)

In [71]:
# For each (meter, tariff) pair, add up the per-row counts of positive months.
entries_per_tariff = (
    multi_df
    .groupby(['METER_NO', 'TARIFF CODE'])['_nonzero_months']
    .sum()
    .rename('month_entries')
    .reset_index()
)

In [73]:
# Within each meter put the tariff with the highest month_entries first,
# then keep only that leading row per meter.
best_tariff = (
    entries_per_tariff
    .sort_values(by=['METER_NO', 'month_entries'], ascending=[True, False])
    .drop_duplicates(subset='METER_NO', keep='first')
)

In [75]:
# Per meter, total each month column over all of that meter's rows (all tariffs).
summed = multi_df.groupby('METER_NO')[months_12].sum().reset_index()

In [79]:
# Get one representative row from the chosen tariff per meter.
# The inner merge keeps only rows whose (METER_NO, TARIFF CODE) pair was selected in
# best_tariff. The sort is stable so rows of the same meter stay in their original
# order; the default single-column sort may shuffle them, making the representative
# arbitrary.
# Note: groupby().first() returns the first non-null value of each column, so a blank
# in the leading row is filled from a later row of the same meter.
rep_rows = (
    multi_df
    .merge(best_tariff[['METER_NO', 'TARIFF CODE']], on=['METER_NO', 'TARIFF CODE'])
    .sort_values(['METER_NO'], kind='stable')
    .groupby('METER_NO', as_index=False)
    .first()[['METER_NO', 'ACCOUNT_NO',
              'TOWNSHIP', 'METER_STATUS', 'TARIFF CODE']]
)

In [81]:
# Merge summed months back in: one row per multi-tariff meter, carrying the
# representative identifiers plus the month sums across all of its tariffs.
multi_clean = rep_rows.merge(summed, on='METER_NO', how='left')
# Write the yearly total under the source's upper-case 'TOTAL' label. Column labels
# are case-sensitive: writing to 'Total' would create a second column and leave
# 'TOTAL' empty for these rows once they are concatenated with the single-tariff rows.
multi_clean['TOTAL'] = multi_clean[months_12].sum(axis=1)

In [83]:
# --- Step 4: combine single + multi sets
# Single-tariff rows first, merged multi-tariff rows after, renumbered from 0.
cleaned = pd.concat(objs=[single_df, multi_clean], axis=0, ignore_index=True)

In [85]:
# Recompute the yearly total from the month columns to be sure it matches them.
# The total column that comes from the source files is spelled 'TOTAL'; labels are
# case-sensitive, so assigning to 'Total' would add a second column and leave the
# original one stale. Any stray mixed-case 'Total' is dropped so the output has a
# single total column.
cleaned = cleaned.drop(columns=['Total'], errors='ignore')
cleaned['TOTAL'] = cleaned[months_12].sum(axis=1)

In [87]:
# Optional flag: 'Yes' for meters that went through the multi-tariff merge, else 'No'.
cleaned['MERGED_FROM_MULTI_TARIFF'] = np.where(
    cleaned['METER_NO'].isin(multi_meters), 'Yes', 'No'
)

In [89]:
# --- Step 5: save output
# Write the de-duplicated table to its own workbook, without the index column.
output_path = r"C:/Users/PC/Documents/Water datasets/2023/combined2023_cleaned_by_rule.xlsx"
cleaned.to_excel(excel_writer=output_path, index=False)

In [90]:
# Month columns used by the redistribution step below (calendar order, data spelling).
months_12 = [
    "JANUARY", "FEBRUARY", "MARCH",
    "APRIL", "MAY", "JUNE",
    "JULY", "AUGUST", "SEPTEMBER",
    "OCTOBER", "NOVEMBR", "DECEMBER",
]

In [93]:
# Month columns must be numeric before redistribution: unparseable entries turn
# into NaN and are then replaced with 0.0.
cleaned[months_12] = (
    cleaned[months_12]
    .apply(lambda col: pd.to_numeric(col, errors='coerce'))
    .fillna(0.0)
)

In [95]:
def backward_distribution_row(row, month_cols=None):
    """Spread each positive month value evenly back over the zero months before it.

    Walking the months in order, whenever a month holds a value greater than zero and
    is immediately preceded by one or more zero months, that value is divided equally
    between the zero months and the month itself. The row's sum is unchanged. Zeros
    that are not followed by a positive value (e.g. at the end of the year) stay zero,
    and a non-positive, non-zero value ends a run of zeros without being redistributed.

    Parameters
    ----------
    row : pandas.Series
        One record containing at least the labels in ``month_cols``; the selected
        values must be convertible to float.
    month_cols : list of str, optional
        Month labels in chronological order. When omitted, the notebook-level
        ``months_12`` is looked up at call time, so re-running the cell that defines
        ``months_12`` takes effect without re-defining this function.

    Returns
    -------
    pandas.Series
        The redistributed float values, indexed by ``month_cols``.
    """
    if month_cols is None:
        month_cols = months_12

    vals = row[month_cols].astype(float).to_numpy().copy()

    # Index of the first month in the run of zeros currently being tracked.
    gap_start = 0
    for i, reading in enumerate(vals):
        if reading == 0:
            continue  # still inside (or starting) a run of zeros
        if reading > 0 and gap_start < i:
            # Equal split over the zero block plus this month.
            vals[gap_start:i + 1] = reading / (i - gap_start + 1)
        # Any non-zero month closes the run; the next one can only start after it.
        gap_start = i + 1
    return pd.Series(vals, index=month_cols)

In [97]:
# Redistribute every row. To limit this to active meters instead, build
# active_mask = cleaned['METER_STATUS'].eq('ACTIVE') and assign the result of
# cleaned.loc[active_mask].apply(backward_distribution_row, axis=1)
# to cleaned.loc[active_mask, months_12].
cleaned[months_12] = cleaned.apply(backward_distribution_row, axis="columns")

In [98]:
# Recompute the yearly total exactly from the redistributed month columns.
# The total column that comes from the source files is spelled 'TOTAL'; labels are
# case-sensitive, so assigning to 'Total' would leave the original column stale and
# add a second one. Any stray mixed-case 'Total' is dropped so the output has a
# single total column.
cleaned = cleaned.drop(columns=['Total'], errors='ignore')
cleaned['TOTAL'] = cleaned[months_12].sum(axis=1)

In [101]:
# Write the redistributed table to its own workbook, without the index column.
output_path = r"C:/Users/PC/Documents/Water datasets/2023/combined_backward_distributed_2023.xlsx"
cleaned.to_excel(excel_writer=output_path, index=False)