In [15]:
import pandas as pd
import os
import numpy as np

In [17]:
# Folder holding the 2020 billing exports. It defaults to the original
# location; set the WATER_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell. A trailing slash is trimmed so the
# joined paths never contain a doubled separator.
data_dir_2020 = os.environ.get(
    "WATER_DATA_DIR", r"C:/Users/PC/Documents/Water datasets/2020"
).rstrip("/\\")

# One CSV per tariff group. This order is also the row order after the frames
# are concatenated.
files = [
    f"{data_dir_2020}/{csv_name}"
    for csv_name in (
        "2020 OTHERS.csv",
        "2020 WA10 AND WA13.csv",
        "2020 WA60-WA64.csv",
        "2020 WA70.csv",
    )
]


In [19]:
# Report, for each input CSV, whether it is present on disk before loading.
for csv_path in files:
    if os.path.exists(csv_path):
        print(csv_path, "✅")
    else:
        print(csv_path, "❌ File not found")

C:/Users/PC/Documents/Water datasets/2020/2020 OTHERS.csv ✅
C:/Users/PC/Documents/Water datasets/2020/2020 WA10 AND WA13.csv ✅
C:/Users/PC/Documents/Water datasets/2020/2020 WA60-WA64.csv ✅
C:/Users/PC/Documents/Water datasets/2020/2020 WA70.csv ✅


In [21]:
# Load every CSV with pandas' default parsing; one DataFrame per input file,
# in the same order as `files`.
dfs = list(map(pd.read_csv, files))

In [25]:
# Stack the per-file frames vertically; the per-file row labels are discarded
# in favour of one fresh 0..n-1 index.
combined2020 = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [27]:
# Write the raw (not yet de-duplicated) merge to a single-sheet workbook.
combined2020.to_excel(
    excel_writer=r"C:/Users/PC/Documents/Water datasets/2020/combined2020.xlsx",
    index=False,
    sheet_name="Combined2020",
)

In [33]:
# Check the exact column names: later cells select columns by these labels, so
# the spelling must match the source headers character for character (the
# printed index shows the November column spelled 'NOVEMBR').
print(combined2020.columns)

Index(['ACCOUNT_NO', 'ACCOUNT_NAME', 'UNIT_NO', 'STAND_ADDRESS', 'TOWNSHIP',
       'METER_NO', 'METER_STATUS', 'TARIFF CODE', 'JANUARY', 'FEBRUARY',
       'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER',
       'OCTOBER', 'NOVEMBR', 'DECEMBER', 'Total'],
      dtype='object')


In [35]:
# Every row whose METER_NO occurs on at least two rows; keep=False marks all
# occurrences, not just the repeats after the first.
duplicates = combined2020.loc[combined2020['METER_NO'].duplicated(keep=False)]

In [37]:
# Order by meter number so the repeated rows of each meter sit together.
duplicates = duplicates.sort_values('METER_NO', ascending=True)

In [39]:
# Count of distinct meters that appear more than once, then a preview of the
# first 20 of their rows.
print(f"Total duplicated meter numbers: {duplicates['METER_NO'].nunique()}")
duplicates.iloc[:20]

Total duplicated meter numbers: 456


Unnamed: 0,ACCOUNT_NO,ACCOUNT_NAME,UNIT_NO,STAND_ADDRESS,TOWNSHIP,METER_NO,METER_STATUS,TARIFF CODE,JANUARY,FEBRUARY,...,APRIL,MAY,JUNE,JULY,AUGUST,SEPTEMBER,OCTOBER,NOVEMBR,DECEMBER,Total
2642,10640312,TJIRONDERO JV,1500000000208400000000R0000,HOFSANGER ST,KHOMASDAL ...,BLAH610,ACTIVE,WA12,10,5,...,7,0,19,7,7,8,0,0,0,73
34602,10640312,TJIRONDERO JV,1500000000208400000000R0000,HOFSANGER ST,KHOMASDAL ...,BLAH610,ACTIVE,WA10,0,0,...,0,0,0,0,0,0,26,16,0,42
1877,10361510,KAMBONDE(PEN) ANNA NDINELAGO,2600000000565400000000R0000,LAZARUS ST,KATUTURA ...,BLAM598,ACTIVE,WA12,0,50,...,0,0,0,0,0,0,0,0,0,50
27413,10361510,KAMBONDE(PEN) ANNA NDINELAGO,2600000000565400000000R0000,LAZARUS ST,KATUTURA ...,BLAM598,ACTIVE,WA10,0,0,...,0,0,0,0,36,0,10,9,0,55
27440,10375449,MBARIMUUO ELSIE BJ,260000000086630000000000000,SHANGHAI ST,KATUTURA ...,BLAM816,ACTIVE,WA10,0,0,...,0,0,0,0,0,0,0,0,0,0
1879,10375449,MBARIMUUO ELSIE BJ,260000000086630000000000000,SHANGHAI ST,KATUTURA ...,BLAM816,ACTIVE,WA12,0,17,...,0,0,16,0,0,0,0,0,0,41
837,10477357,AKOMENA ESTER,180000000014000000000000000,GREEN MOUNTAIN DAM RD,GOREANGAB ...,BLAN374,ACTIVE,WA12,0,7,...,9,0,20,7,5,10,5,8,7,88
18946,10477357,AKOMENA ESTER,180000000014000000000000000,GREEN MOUNTAIN DAM RD,GOREANGAB ...,BLAN374,ACTIVE,WA10,10,0,...,0,0,0,0,0,0,0,0,0,10
48609,10551921,KUHANGA LEONARD,140000000002550000000000000,COPENHAGEN ST,OTJOMUISE ...,CLAE996,ACTIVE,WA10,0,23,...,8,0,21,8,0,0,0,0,0,68
5746,10551921,KUHANGA LEONARD,140000000002550000000000000,COPENHAGEN ST,OTJOMUISE ...,CLAE996,ACTIVE,WA12,0,0,...,0,0,0,0,15,10,10,16,8,59


In [41]:
# Month column labels, written exactly as the combined frame's headers spell
# them (November is 'NOVEMBR' in the data). Grouped by quarter.
months_12 = [
    'JANUARY', 'FEBRUARY', 'MARCH',
    'APRIL', 'MAY', 'JUNE',
    'JULY', 'AUGUST', 'SEPTEMBER',
    'OCTOBER', 'NOVEMBR', 'DECEMBER',
]

In [45]:
# Restrict to the month columns that are actually present, then coerce them to
# numbers. Unparseable and blank cells become NaN and are then replaced by 0.
months_12 = [col for col in months_12 if col in combined2020.columns]
combined2020[months_12] = (
    combined2020[months_12]
    .apply(lambda month: pd.to_numeric(month, errors='coerce'))
    .fillna(0)
)

In [47]:
# Rule:
# 1️⃣ If METER_NO appears more than once with the SAME TARIFF CODE → keep first
# 2️⃣ If METER_NO appears with DIFFERENT TARIFF CODEs → sum month values
#     and keep the TARIFF CODE with the most non-zero months

In [49]:
# --- Step 1: find single- vs multi-tariff meters
# Distinct non-missing tariff codes per meter. nunique() ignores missing
# values, so a meter whose TARIFF CODE is blank on every row counts 0.
tariff_counts = combined2020.groupby('METER_NO')['TARIFF CODE'].nunique()
# "<= 1" rather than "== 1": a 0-count meter has nothing to merge across
# tariffs, and with "== 1" it landed in neither set and silently disappeared
# from the cleaned output.
single_meters = tariff_counts[tariff_counts <= 1].index
multi_meters = tariff_counts[tariff_counts > 1].index

In [51]:
# --- Step 2: handle single-tariff meters
# Rule 1: a meter repeated under one tariff keeps only its first row.
# The sort has to be stable. The default single-column sort (quicksort) may
# reorder rows that share a METER_NO, in which case keep='first' would not
# reliably pick the row that came first in the combined data.
single_df = (combined2020[combined2020['METER_NO'].isin(single_meters)]
             .sort_values(['METER_NO'], kind='mergesort')
             .drop_duplicates(subset=['METER_NO'], keep='first'))

In [53]:
# --- Step 3: handle multi-tariff meters
# Apply rule 1 inside this set as well: if a meter repeats the SAME tariff
# code, only the first such row is kept. Without this, the per-meter sums taken
# from multi_df would add same-tariff repeats together and double-count them.
multi_df = (combined2020[combined2020['METER_NO'].isin(multi_meters)]
            .drop_duplicates(subset=['METER_NO', 'TARIFF CODE'], keep='first')
            .copy())
# Per row: how many of the month columns hold a value greater than zero.
multi_df['_nonzero_months'] = (multi_df[months_12].gt(0)).sum(axis=1)

In [55]:
# Total number of non-zero month cells for each (meter, tariff) pair
entries_per_tariff = (
    multi_df
    .groupby(['METER_NO', 'TARIFF CODE'])['_nonzero_months']
    .agg('sum')
    .reset_index(name='month_entries')
)

In [57]:
# For each meter keep the tariff with the largest month_entries: order rows
# meter-ascending / entries-descending, then retain the top row per meter.
best_tariff = (
    entries_per_tariff
    .sort_values(by=['METER_NO', 'month_entries'], ascending=[True, False])
    .drop_duplicates(subset=['METER_NO'], keep='first')
)

In [59]:
# One row per meter holding, for each month, the sum over all of that meter's
# rows in multi_df.
summed = multi_df.groupby('METER_NO')[months_12].sum().reset_index()

In [61]:
# Get one representative row from the chosen tariff per meter.
# - ACCOUNT_NAME is carried along; it was previously left out of the column
#   list, so merged meters ended up with a blank name after the final concat.
# - drop_duplicates keeps one whole row per meter. groupby().first() instead
#   takes the first non-null value of each column independently, which can
#   stitch together fields from different rows.
# - The sort is stable so "first" means first in multi_df order.
rep_rows = (multi_df.merge(best_tariff[['METER_NO','TARIFF CODE']], on=['METER_NO','TARIFF CODE'])
                  .sort_values(['METER_NO'], kind='mergesort')
                  .drop_duplicates(subset=['METER_NO'], keep='first')
                  .loc[:, ['METER_NO','ACCOUNT_NO','ACCOUNT_NAME','UNIT_NO','STAND_ADDRESS',
                           'TOWNSHIP','METER_STATUS','TARIFF CODE']]
                  .reset_index(drop=True))

In [63]:
# Attach each meter's summed monthly figures to its representative row, then
# derive the yearly total from those months.
multi_clean = pd.merge(rep_rows, summed, how='left', on='METER_NO')
multi_clean['Total'] = multi_clean[months_12].sum(axis='columns')

In [65]:
# --- Step 4: combine single + multi sets
# Single-tariff rows first, merged multi-tariff rows after, under a new index.
cleaned = pd.concat(objs=[single_df, multi_clean], axis=0, ignore_index=True)

In [67]:
# Overwrite Total with the row-wise sum of the month columns so it cannot
# disagree with them.
cleaned['Total'] = cleaned.loc[:, months_12].sum(axis='columns')

In [69]:
# Optional flag: 'Yes' for rows built by merging a multi-tariff meter, 'No'
# for everything else.
cleaned['MERGED_FROM_MULTI_TARIFF'] = (
    cleaned['METER_NO']
    .isin(multi_meters)
    .map(lambda was_merged: 'Yes' if was_merged else 'No')
)

In [71]:
# --- Step 5: save output
# Destination workbook for the de-duplicated table (row index not written).
output_path = r"C:/Users/PC/Documents/Water datasets/2020/combined2020_cleaned_by_rule.xlsx"
cleaned.to_excel(excel_writer=output_path, index=False)

In [72]:
# Reset the month list to all twelve labels, spelled exactly as the data
# headers spell them (November is 'NOVEMBR'). Grouped by quarter.
months_12 = [
    'JANUARY', 'FEBRUARY', 'MARCH',
    'APRIL', 'MAY', 'JUNE',
    'JULY', 'AUGUST', 'SEPTEMBER',
    'OCTOBER', 'NOVEMBR', 'DECEMBER',
]

In [75]:
# Force the month columns to numbers; unparseable or missing cells become 0.0.
cleaned[months_12] = (
    cleaned[months_12]
    .apply(lambda month: pd.to_numeric(month, errors='coerce'))
    .fillna(0.0)
)

In [77]:
def backward_distribution_row(row, month_cols=None):
    """Spread each positive reading evenly back over the zero months before it.

    Walking the months in order, whenever a value greater than zero directly
    follows a run of one or more exact zeros, that value is divided equally
    between the zero months and its own month. The row total is unchanged.
    Zeros with no later positive value (e.g. trailing zeros) stay zero, and
    any non-zero value, including a negative one, ends a zero run.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain every label in ``month_cols``.
    month_cols : sequence of str, optional
        Month column labels in calendar order. When omitted, the notebook's
        current ``months_12`` list is used. It is looked up at call time, so
        re-running the cell that defines ``months_12`` is honoured without
        redefining this function.

    Returns
    -------
    pandas.Series
        Float values indexed by ``month_cols``.
    """
    if month_cols is None:
        month_cols = months_12

    vals = row[month_cols].astype(float).to_numpy().copy()

    # Index where the current run of zeros began, or None when not in a run.
    run_start = None
    for i in range(len(vals)):
        value = vals[i]
        if value == 0:
            if run_start is None:
                run_start = i
            continue
        if value > 0 and run_start is not None:
            # Equal split over the zero block plus this month.
            vals[run_start:i + 1] = value / (i - run_start + 1)
        # Any non-zero value (positive, negative or NaN) closes the run.
        run_start = None
    return pd.Series(vals, index=month_cols)


In [79]:
# Redistribute every row. To restrict this to active meters, build a boolean
# mask from METER_STATUS == 'ACTIVE' and go through .loc with that mask on both
# the assignment target and the frame being applied over.
cleaned[months_12] = cleaned.apply(backward_distribution_row, axis='columns')

In [None]:
# Refresh Total as the row-wise sum of the (now redistributed) month columns.
cleaned['Total'] = cleaned.loc[:, months_12].sum(axis='columns')

In [81]:
# Destination workbook for the redistributed table (row index not written).
output_path = r"C:/Users/PC/Documents/Water datasets/2020/combined_backward_distributed_2020.xlsx"
cleaned.to_excel(excel_writer=output_path, index=False)