# P1/P2 Mapping
     1. Use unmapped account + address from #1 and map P1 SDU
     2. Use unmapped account + address from #2 and map P2 SDU
     
Note: this notebook was created on 13.12.2022. The code below was originally copied from the AWS Glue Job amzar-address_standardization-prod-process_gapi_tm on 13.12.2022 at 11.30am, with the intention of uploading it directly to Amzar's GitHub so that a version is kept before any further edits are made on AWS Glue (version control).

In [None]:
## on 9/9/2022 - Fakhrul and I agreed for me to change the lines which could potentially give SettingWithCopyWarnings i.e replacing lines which use double bracket with .loc methods 
## I decided to add loc to lines which give both warnings and those where the LHS & RHS of the assignment step does not have the same column i.e I won't add loc to df['COL'] = df['COL'].str.upper()
## on 9/9/2022 - I also added .copy() to steps where it's just df1 = df i.e this would now be df1 = df.copy()

#!/usr/bin/env python
# coding: utf-8

# ### P1/P2 Mapping
#      1. Run similarity using fuzzy wuzzy and map P1 MDU
#      2. Use unmapped account + address from #1 and map P1 SDU
#      3. Use unmapped account + address from #2 and map P2 SDU

## import section
import pandas as pd
import numpy as np
import re
import resource
import awswrangler as wr
# For similarity
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import time
import sys
from awsglue.utils import getResolvedOptions
from datetime import datetime
# Names of the Glue job parameters that must be present on the command line.
_job_arg_names = [
    'job_name',
    'ISP_Name',
    'isp_new_std_path',
    'astro_new_std_path',
    'uams_mdu_path',
    'uams_sdu_path',
    'temp_isp_corrected_save_path',
]
args = getResolvedOptions(sys.argv, _job_arg_names)

# Unpack the resolved parameters into the module-level names used by the rest of the job.
ISP_Name = args['ISP_Name']
isp_new_std_path = args['isp_new_std_path']  # new std path (ISP input file)
astro_new_std_path = args['astro_new_std_path']  # new std path (Astro input file)
# Pipeline bucket locations - outputs saved here are to be used in UAMS generation.
uams_mdu_path = args['uams_mdu_path']
uams_sdu_path = args['uams_sdu_path']
temp_isp_corrected_save_path = args['temp_isp_corrected_save_path']

# Lift every pandas display limit so printed frames are never truncated in the job log.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)


## Run this notebook for all ISPs
# NOTE(review): this hard-coded value overwrites the ISP_Name that was resolved
# from the job arguments earlier in the file - confirm that this is intended.
ISP_Name = 'TM'
# ISP_Name = 'ALLO'
# ISP_Name = 'Maxis'
# ISP_Name = 'CTS'

## Amzar 9/9/2022 -> added a datetime key for tracking file creation
# `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
# standard-library `datetime` class (imported at the top of the file) is used.
curr_date = datetime.today().strftime('%Y%m%d')
# curr_date = '20220909' --> used this line the first night I ran this Glue job
print('Date notebook was run & files were created', curr_date)

# ### ---------------------------------------------------- P1 MDU Mapping - MDU --------------------------------------------------------------------   

import pandas as pd
# %cd /Users/zmmzohreh/OneDrive - MEASAT Broadcast Network Systems Sdn. Bhd/Shared Documents/Input Data
#get_ipython().run_line_magic('cd', '/Users/zmmzohreh/OneDrive - MEASAT Broadcast Network Systems Sdn. Bhd/Shared Documents/Fakhrul')

print('reading tm')

# ISP standardised addresses. Every column is read as str (Amzar 10/9/2022);
# the file is gzip-compressed because of its size (14/10/2022).
isp_corrected = wr.s3.read_csv(
    path=isp_new_std_path,
    usecols=['Combined_Building', 'HouseNo', 'Street_1', 'Street_2', 'AREA',
             'STD_CITY', 'STATE', 'POSTCODE', 'ServiceType',
             'Standard_Building_Name', 'match'],
    dtype=str,
    compression='gzip',
)

print('reading astro')

# Astro standardised addresses, also read entirely as str (Amzar 10/9/2022).
astro_corrected = wr.s3.read_csv(
    astro_new_std_path,
    usecols=['ACCOUNT_NO', 'service_add_objid', 'HOUSE_NO', 'Combined_Building',
             'Street_1', 'Street_2', 'Standard_Building_Name', 'AREA',
             'POSTCODE', 'STD_CITY', 'ASTRO_STATE', 'Source', 'match'],
    dtype=str,
)

print('this is tm corrected shape: ', isp_corrected.shape)
print('this is astro corrected shape: ', astro_corrected.shape)

# The notebook-style `.unique()` / `.head()` display expressions were removed:
# in a script their results are discarded, so they only cost time and memory.
# `.info()` is kept because it prints the frame summary to the job log.
isp_corrected.info()
astro_corrected.info()

# Drop rows that are exact duplicates across all columns (first occurrence kept).
astro_corrected = astro_corrected.drop_duplicates()
print('Astro Corrected Shape after first dedupe on all columns, keep first: ', astro_corrected.shape, 'Astro Unique Account No: ', astro_corrected.ACCOUNT_NO.nunique()) # Amzar 9/9/2022 --> added more words to the print statement for easier tracking

## Updated after sharing
# Upper-case the address columns of both frames. Missing values stay missing
# in these columns because `.str.upper()` leaves NaN untouched.
for _col in ('HouseNo', 'Street_1', 'STD_CITY'):
    isp_corrected[_col] = isp_corrected[_col].str.upper()
for _col in ('HOUSE_NO', 'Street_1', 'STD_CITY'):
    astro_corrected[_col] = astro_corrected[_col].str.upper()

# Combined_Building is cast to str first, so a missing value becomes the text 'NAN'.
isp_corrected['Combined_Building'] = isp_corrected['Combined_Building'].astype(str).str.upper()
astro_corrected['Combined_Building'] = astro_corrected['Combined_Building'].astype(str).str.upper()

# Fix HOUSE_NO
# The patterns below are removed one after another, in this exact order, as
# case-insensitive regular expressions. `regex=True` is passed explicitly
# because the default of `Series.str.replace` changed to False in pandas 2.0,
# which would silently turn patterns such as '[,.]' into literal text.
# NOTE(review): as a regex, 'NO.' matches "NO" followed by ANY character
# (literal dots are already gone after '[,.]'), so e.g. "NO12" becomes "2".
# Behaviour is preserved here - confirm whether a literal "NO." was intended.
_astro_house_no = astro_corrected['HOUSE_NO']
for _pattern in (
    r'nan ',
    r'[,.]',
    r'\[',
    r'\]',
    r"'",
    r'NO. ',
    r'NO.',
    r'NO',
    r'LOT',
    r'UNIT',
    r',',
    r'\.',
    r'BLOCK',
    r'BLOK',
    r'BLK',
):
    _astro_house_no = _astro_house_no.str.replace(_pattern, '', case=False, regex=True)

# Fix HOUSE_NO that are converted to date: month abbreviations next to a dash
# are turned back into their two-digit number ("JAN-" -> "01-", "-JAN" -> "-01").
# NOTE(review): "NO" has already been stripped above, so "NOV-" / "-NOV" can no
# longer be found by the time this loop runs - confirm the intended order.
for _month_no, _month in enumerate(
    ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
     'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'),
    start=1,
):
    _mm = '{:02d}'.format(_month_no)
    _astro_house_no = _astro_house_no.str.replace(_month + '-', _mm + '-', case=False, regex=True)
    _astro_house_no = _astro_house_no.str.replace('-' + _month, '-' + _mm, case=False, regex=True)

astro_corrected['HOUSE_NO'] = _astro_house_no


# Full address text: the parts joined with the same separators as before.
astro_corrected["COMBINED_ADD"] = (
    astro_corrected["HOUSE_NO"].map(str) + "  ,"
    + astro_corrected["Combined_Building"].map(str) + "  ,"
    + astro_corrected["Street_1"].map(str) + "  ,"
    + astro_corrected["AREA"].map(str) + " ,"
    + astro_corrected["POSTCODE"].map(str) + " ,"
    + astro_corrected["STD_CITY"].map(str) + " ,"
    + astro_corrected["ASTRO_STATE"].map(str)
)

# House number padded to a width of 10 characters.
astro_corrected['ASTRO_HOUSE_NO1'] = astro_corrected['HOUSE_NO'].str.pad(width=10)

# Matching key HNUM_STRT = Combined_Building + STD_CITY, then cleaned.
# The patterns are applied in order as case-insensitive regular expressions;
# `regex=True` is explicit because the pandas default changed to False in 2.0.
_astro_key = astro_corrected["Combined_Building"].map(str) + " ," + astro_corrected["STD_CITY"].map(str)
for _pattern in (r'nan ', r'[,.]', r' ', r'\.', r','):
    _astro_key = _astro_key.str.replace(_pattern, '', case=False, regex=True)

# Capitalize HNUM_STRT
astro_corrected["HNUM_STRT"] = _astro_key.str.upper()

print('Astro corrected shape after cleaning address columns & creating HNUM_STRT: ', astro_corrected.shape) # Amzar 9/9/2022 --> added more text to print statement 

# astro_corrected.head()


# Fix house no for isp_corrected
# The patterns below are removed one after another, in this exact order, as
# case-insensitive regular expressions. `regex=True` is passed explicitly
# because the default of `Series.str.replace` changed to False in pandas 2.0,
# which would silently turn patterns such as '[,.]' into literal text.
# NOTE(review): as a regex, 'NO.' matches "NO" followed by ANY character
# (literal dots are already gone after '[,.]'), so e.g. "NO12" becomes "2".
# Behaviour is preserved here - confirm whether a literal "NO." was intended.
_isp_house_no = isp_corrected['HouseNo']
for _pattern in (
    r'nan ',
    r'[,.]',
    r'\[',
    r'\]',
    r"'",
    r'NO. ',
    r'NO.',
    r'NO',
    r'LOT',
    r'UNIT',
    r',',
    r'\.',
    r'BLOCK',
    r'BLOK',
    r'BLK',
):
    _isp_house_no = _isp_house_no.str.replace(_pattern, '', case=False, regex=True)

# Fix HouseNo that are converted to date: month abbreviations next to a dash
# are turned back into their two-digit number ("JAN-" -> "01-", "-JAN" -> "-01").
# NOTE(review): "NO" has already been stripped above, so "NOV-" / "-NOV" can no
# longer be found by the time this loop runs - confirm the intended order.
for _month_no, _month in enumerate(
    ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
     'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'),
    start=1,
):
    _mm = '{:02d}'.format(_month_no)
    _isp_house_no = _isp_house_no.str.replace(_month + '-', _mm + '-', case=False, regex=True)
    _isp_house_no = _isp_house_no.str.replace('-' + _month, '-' + _mm, case=False, regex=True)

isp_corrected['HouseNo'] = _isp_house_no

# House number padded to a width of 10 characters.
isp_corrected['TM_HOUSE_NO1'] = isp_corrected['HouseNo'].str.pad(width=10)

# Matching key HNUM_STRT_TM = Combined_Building + STD_CITY, then cleaned with
# the patterns below (in order) and upper-cased.
_isp_key = isp_corrected["Combined_Building"].map(str) + " ," + isp_corrected["STD_CITY"].map(str)
for _pattern in (r'nan ', r'[,.]', r' ', r',', r'\.'):
    _isp_key = _isp_key.str.replace(_pattern, '', case=False, regex=True)

# Capitalize HNUM_STRT_TM
isp_corrected["HNUM_STRT_TM"] = _isp_key.str.upper()

print('TM corrected shape after cleaning address columns & creating HNUM_STRT_TM: ', isp_corrected.shape) # Amzar 9/9/2022 --> added more text to print statement

# Remove nulls in HNUM_STRT and HNUM_STRT_TM
# Amzar 9/9/2022 --> this is the last transformation to astro_corrected before the
# end of the P1 MDU step, so some addresses may be missing when astro_corrected is
# later carried over to astro_unmapped.
astro_corrected = astro_corrected[astro_corrected.HNUM_STRT.notnull()]
isp_corrected = isp_corrected[isp_corrected.HNUM_STRT_TM.notnull()]

print('Astro corrected shape after filtering out null HNUM_STRT: ', astro_corrected.shape) # Amzar 9/9/2022 --> added more text to print statement
print('TM corrected shape after filtering out null HNUM_STRT_TM: ', isp_corrected.shape) # Amzar 9/9/2022 --> added more text to print statement

# REMOVE DUPLICATES: collect the distinct whitespace-separated tokens found in
# HNUM_STRT_TM. Each distinct value is split on its own instead of joining every
# row into one giant string first; the resulting token set is the same, but the
# peak memory use is far lower on a large ISP file.
selection = set()
for _key_value in isp_corrected['HNUM_STRT_TM'].unique():
    selection.update(_key_value.split())  # default splits on whitespace
selection1 = list(selection)
# Passing `columns=` also yields a valid (empty) frame when no token exists,
# where assigning `.columns` afterwards would raise a length-mismatch error.
selection2 = pd.DataFrame(selection1, columns=['MAPPED_HNUM_STRT_TM'])

# JOIN astro_corrected to isp_corrected but only those with matching HNUM_STRT
MAPPED_STRT_HNUM_Df = astro_corrected.merge(
    selection2,
    left_on='HNUM_STRT',
    right_on='MAPPED_HNUM_STRT_TM',
    how='inner',
)

# After the merge, the Combined_Building in MAPPED_STRT_HNUM_Df is from astro_corrected.
# (The notebook-style `.head()` / `.unique()` / `.shape` / `.nunique()` display
# expressions were removed: their results are discarded when run as a script.)
MAPPED_STRT_HNUM_Df.info()

# Making sure that Combined_Building has a valid value.
# 28.9.2022 - the original comment said G_Condo, but the code below checks Combined_Building.
# `regex=True` is explicit because the pandas default changed to False in 2.0.
# Once '[,.]' has been removed, the later "," replace and the second '[,.]'
# replace could not match anything, so they are not repeated.
MAPPED_STRT_HNUM_Df["Combined_Building"] = MAPPED_STRT_HNUM_Df['Combined_Building'].str.replace(r'[,.]', '', case=False, regex=True)

# Removing those with null / blank / 'nan' Combined_Building (for P1 MDU).
# `.copy()` makes the filtered frame independent, so the assignments below do
# not act on a slice of the merge result.
_building = MAPPED_STRT_HNUM_Df['Combined_Building']
_has_building = _building.notnull() & ~_building.isin(["", " ", "NAN", "nan"])
MAPPED_STRT_HNUM_Df = MAPPED_STRT_HNUM_Df[_has_building].copy()

## Cleaning of Street Name: the text 'nan' / 'NAN' becomes an empty string.
MAPPED_STRT_HNUM_Df.loc[MAPPED_STRT_HNUM_Df["Street_1"].isin(['nan', 'NAN']), "Street_1"] = ''

# this step is to keep only ONE record for 1 address even if there are 2 Sources (Vendor or GAPI)
MAPPED_STRT_HNUM_Df = MAPPED_STRT_HNUM_Df.sort_values(by='Source', ascending=False)
MAPPED_STRT_HNUM_Df = MAPPED_STRT_HNUM_Df.drop_duplicates(subset=['ACCOUNT_NO','HOUSE_NO'],keep='first')
MAPPED_STRT_HNUM_Df = MAPPED_STRT_HNUM_Df.drop_duplicates(subset='ACCOUNT_NO',keep='last')

print('P1 MDU MAPPED_STRT_HNUM_Df - Shape: ', MAPPED_STRT_HNUM_Df.shape, ' & Unique Acc_No: ', MAPPED_STRT_HNUM_Df.ACCOUNT_NO.nunique()) # Amzar 9/9/2022 --> added more text to print statement 

MAPPED_STRT_HNUM_Df.info()

## Amzar 10/9/2022 --> Found that these 2 lines below ONLY happen for the P1 MDU mapping step. It may be why we had issues of 1 acc_no having more than 1 P1P2 MDU/SDU tag (fixed by reading in all files as dtype=str)
# NOTE(review): the pattern is not anchored, so ".0" is removed wherever it
# occurs in the account number, not only at the end - behaviour preserved.
MAPPED_STRT_HNUM_Df['ACCOUNT_NO'] = MAPPED_STRT_HNUM_Df['ACCOUNT_NO'].astype(str)
MAPPED_STRT_HNUM_Df['ACCOUNT_NO'] = MAPPED_STRT_HNUM_Df['ACCOUNT_NO'].str.replace(r'\.0', '', case=False, regex=True)

P1_MDU = MAPPED_STRT_HNUM_Df.copy() # Amzar 9/9/2022 --> added copy() to create an explicit copy

print('P1_MDU dataframe shape: ', P1_MDU.shape) # Amzar 9/9/2022 --> added new print statement to see shape of P1_MDU variable

isp_corrected.info() # not sure why this is here


# ### Generating UAMS format and getting ServiceType


# Work on an explicit copy so MAPPED_STRT_HNUM_Df itself is left untouched.
STRT_P1 = MAPPED_STRT_HNUM_Df.copy() # Amzar 9/9/2022 --> added copy() to create an explicit copy
print('P1 MDU STRT_P1 dataframe shape: ', STRT_P1.shape) # Amzar 9/9/2022 --> added new print statement to see shape of STRT_P1 variable

STRT_P1['Street_1'] = STRT_P1['Street_1'].astype(str)

STRT_P1.reset_index(inplace=True, drop=True)

# Blank out every Street_1 that starts with 'AA'. The vectorised string
# accessor replaces the per-row Python lambda; Street_1 is all str at this point.
test = list(STRT_P1.loc[STRT_P1['Street_1'].str.startswith('AA'), :].index)
STRT_P1.loc[test, 'Street_1'] = ''

# Rows flagged 'Match' get an empty Street_2.
STRT_P1.loc[STRT_P1['match'] == 'Match', 'Street_2'] = ''
import re

# Street-type keywords searched for in a street name. LEBUHRAYA is listed before
# LEBUH: regex alternation takes the first alternative that matches at a given
# position, so with LEBUH first the longer keyword could never be returned.
_STREET_TYPE_PATTERN = re.compile(
    "JALAN|LORONG|CHANGKAT|LAMAN|LAHAT|LEBUHRAYA|LEBUH|LENGKOK|LINGKARAN|PERSIARAN"
)


def extract_street(item):
    """Return the first street-type keyword found in *item*.

    Args:
        item: Street name text to search (must be a str). The search is
            case-sensitive and matches the keyword anywhere in the text.

    Returns:
        The matched keyword (e.g. "JALAN", "LEBUHRAYA"), or an empty string
        when the text contains none of the keywords.
    """
    m = _STREET_TYPE_PATTERN.search(item)
    return m.group() if m else ""
        
# Street type keyword found in each street column (Street_2 is cast to str first).
STRT_P1["Street_Type_1"] = STRT_P1["Street_1"].apply(extract_street)
STRT_P1["Street_Type_2"] = STRT_P1["Street_2"].map(str).apply(extract_street)

# Street names with the street-type keyword (and its trailing space) removed.
# The joined list is a regex alternation, so `regex=True` must be explicit:
# since pandas 2.0 the default is a literal match, which would remove nothing.
street_type_list = ['JALAN ', 'LORONG ','CHANGKAT ', 'LAMAN ', 'LAHAT ', 'LEBUH ', 'LEBUHRAYA ', 'LENGKOK ','LINGKARAN ', 'PERSIARAN ' ]
_street_type_regex = '|'.join(street_type_list)
STRT_P1["Street_1_New"] = STRT_P1["Street_1"].str.replace(_street_type_regex, '', regex=True)

STRT_P1["Street_2"] = STRT_P1["Street_2"].str.upper()
STRT_P1["Street_2_New"] = STRT_P1["Street_2"].str.replace(_street_type_regex, '', regex=True)

## Getting the ServiceType---- 22Jun  >> remove ServiceType ERROR
# Distinct (ServiceType, HNUM_STRT_TM) pairs from the ISP file, upper-cased,
# without the rows whose ServiceType is 'ERROR'.
service_list = isp_corrected.loc[:, ['ServiceType','HNUM_STRT_TM']].drop_duplicates() # Amzar 9/9/2022 --> added loc 
service_list["ServiceType"] = service_list["ServiceType"].str.upper()
service_list = service_list[service_list['ServiceType']!='ERROR']

# Left join: every STRT_P1 row is kept, with ServiceType filled where the key matches.
New_fields1 = pd.merge(STRT_P1,service_list,left_on ='HNUM_STRT',right_on='HNUM_STRT_TM', how = 'left')
New_fields1.info()

#MDU NEW
# `.copy()` makes the column subset an independent frame, so adding the
# 'Servicable' column below does not act on a slice of New_fields1.
New_fields2 = New_fields1[['ACCOUNT_NO','service_add_objid','ASTRO_HOUSE_NO1',
                           'Combined_Building','Street_Type_1','Street_1_New','Standard_Building_Name', 
                           'Street_Type_2','Street_2_New','AREA','POSTCODE','STD_CITY','ASTRO_STATE', 'ServiceType', 'HNUM_STRT_TM']].copy()

New_fields2.loc[:, 'Servicable']= str(ISP_Name) # Amzar 9/9/2022 --> added loc 

## ---- 22Jun  >> keep both ServiceType FTTH/VDSL
## ---- 22Jun  >> New_fields3 = New_fields2.drop_duplicates(subset= ['ACCOUNT_NO','ServiceType'], keep = 'first')

# One row per account: the first row after sorting by ServiceType is kept.
New_fields3 = New_fields2.sort_values(['ServiceType']).drop_duplicates(subset= 'ACCOUNT_NO', keep = 'first')
astro_cleaned = New_fields3.copy() # Amzar 9/9/2022 --> added copy()
print('P1 MDU astro_cleaned shape: ', astro_cleaned.shape, '& New_fields3 shape: ', New_fields3.shape) # Amzar 9/9/2022 --> added new print statement to compare

# Fix HOUSE_NO that are converted to date
# HOUSE_NO starts as a copy of ASTRO_HOUSE_NO1; month abbreviations next to a
# dash are then turned into their two-digit number ("JAN-" -> "01-",
# "-JAN" -> "-01"). The replacements are case-insensitive, so the separate
# "Jan-" / "-Jan" variants were exact repeats and are covered by the same call.
# `regex=True` is explicit so behaviour does not depend on the pandas version.
_cleaned_house_no = astro_cleaned['ASTRO_HOUSE_NO1']
for _month_no, _month in enumerate(
    ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
     'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'),
    start=1,
):
    _mm = '{:02d}'.format(_month_no)
    _cleaned_house_no = _cleaned_house_no.str.replace(_month + '-', _mm + '-', case=False, regex=True)
    _cleaned_house_no = _cleaned_house_no.str.replace('-' + _month, '-' + _mm, case=False, regex=True)

astro_cleaned['HOUSE_NO'] = _cleaned_house_no

# Fix HOUSE_NO that are converted to date (DD/MM/YYYY format)
# The pattern is a raw string (the plain string relied on invalid escape
# sequences) and the mask is computed once. `na=False` sends missing house
# numbers to the "not a date" group, as the previous `== True` test did.
_is_date_house_no = astro_cleaned['HOUSE_NO'].str.match(
    r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$', na=False
)

# Filter date HOUSE_NO; `.copy()` so the new columns are not set on a slice.
date_house = astro_cleaned[_is_date_house_no].copy()
# Splitting the HOUSE_NO: characters 0-1, 3-4 and 8-9 of the DD/MM/YYYY text.
date_house['block'] = date_house['HOUSE_NO'].str[0:2]
date_house['floor'] = date_house['HOUSE_NO'].str[3:5]
date_house['unit'] = date_house['HOUSE_NO'].str[8:10]
# Combine the split HOUSE_NO with -
date_house['HOUSE_NO_ASTRO'] = date_house['block'] + "-" + date_house['floor'] + "-" + date_house['unit']

# Filter not date HOUSE_NO: these keep their HOUSE_NO unchanged.
not_date_house = astro_cleaned[~_is_date_house_no].copy()
not_date_house['HOUSE_NO_ASTRO'] = not_date_house['HOUSE_NO']

# Append the 2 df again (date rows first, then the rest).
frame = [date_house, not_date_house]
astro_cleaned = pd.concat(frame)

# Remove additional column created to combine HOUSE_NO
astro_cleaned = astro_cleaned.drop(['block','floor','unit'],axis=1)
astro_cleaned.info()

# Rebuild the padded house number (width 10) from the corrected value.
astro_cleaned['ASTRO_HOUSE_NO1']= astro_cleaned['HOUSE_NO_ASTRO'].str.pad(width=10)


#MDU NEW
# Columns carried into the UAMS MDU output, in output order.
astro_cleaned2 = astro_cleaned[['ACCOUNT_NO','service_add_objid', 'ASTRO_HOUSE_NO1', 
                                   'Combined_Building','Street_Type_1','Street_1_New','Street_Type_2',
                                   'Street_2_New', 'AREA','STD_CITY' ,'POSTCODE', 'ASTRO_STATE',
                                   'Standard_Building_Name',
                                   'ServiceType', 'Servicable', 'HNUM_STRT_TM']]

UAMS_MDU_Base = astro_cleaned2.copy() # Amzar 9/9/2022 --> added copy() 
print('P1 MDU UAMS_MDU_BASE df shape: ', UAMS_MDU_Base.shape, ' & astro_cleaned2 df shape: ', astro_cleaned2.shape) # Amzar 9/9/2022 --> added new print statement to compare

# Account numbers as text with ".0" removed. The pattern is a raw string with
# `regex=True` explicit: the plain '\.0' is an invalid escape sequence, and as a
# literal (the pandas >= 2.0 default) it would never match.
# NOTE(review): the pattern is not anchored, so ".0" is removed wherever it
# occurs in the account number, not only at the end - behaviour preserved.
UAMS_MDU_Base.loc[:, 'ACCOUNT_NO'] = UAMS_MDU_Base.loc[:, 'ACCOUNT_NO'].astype(str) # Amzar 9/9/2022 --> added loc 
UAMS_MDU_Base.loc[:, 'ACCOUNT_NO'] = UAMS_MDU_Base.loc[:, 'ACCOUNT_NO'].str.replace(r'\.0', '', case=False, regex=True) # Amzar 9/9/2022 --> added loc 

print('P1 MDU UAMS_MDU_Base shape after converting ACC_NO col to str type: ', UAMS_MDU_Base.shape) # Amzar 9/9/2022 --> added more text to the print statement

UAMS_MDU_Base = UAMS_MDU_Base.drop_duplicates(subset=['ACCOUNT_NO'], keep='first')
print('P1 MDU UAMS_MDU_Base shape AFTER dedupe on ACC_NO, keep first: ', UAMS_MDU_Base.shape) # Amzar 9/9/2022 --> added more text to the print statement

# Final output column names.
UAMS_MDU_Base=UAMS_MDU_Base.rename({'ASTRO_HOUSE_NO1':'House_No', 
                                      'ACCOUNT_NO': 'Account_No'}, axis=1)


#UAMS_MDU_Base.to_csv('UAMS_Format_stndrd_'+str(ISP_Name)+'_P1_MDU.csv') # Save in pipeline bucket - to be used in UAMS generation

print('this is p1 mdu: :', UAMS_MDU_Base.shape)
wr.s3.to_csv(df = UAMS_MDU_Base, path = uams_mdu_path + 'UAMS_Format_stndrd_' + str(ISP_Name)+ '_P1_MDU.csv.gz', compression='gzip', index=False) # 18/11/22: AFTER running the job, decided to add this line to generate a file for easier automation. 5/12/22: added gz
wr.s3.to_csv(df = UAMS_MDU_Base, path = uams_mdu_path + 'historical_folder/UAMS_Format_stndrd_' + str(ISP_Name)+ '_P1_MDU_' + str(curr_date) + '.csv.gz', compression='gzip', index=False) # 18/11/22: added underscore after 'MDU' and AFTER running it to generate the files, decided to add the dated one to historical_folder 

## revision - 31/5/22 - fakhrul - need to separate this process as it consumes too much memory
# Persist isp_corrected twice: a fixed-name file (5/12/22: the automated
# version) and a dated copy under historical_folder/ (both gzip-compressed).
temp_isp_latest_path = temp_isp_corrected_save_path + 'temp_isp_corrected.csv.gz'
temp_isp_dated_path = (
    temp_isp_corrected_save_path
    + 'historical_folder/temp_isp_corrected_'
    + str(curr_date)
    + '.csv.gz'
)
for temp_isp_path in (temp_isp_latest_path, temp_isp_dated_path):
    wr.s3.to_csv(df=isp_corrected, path=temp_isp_path, index=False, compression='gzip')


print('P1 MDU done')

# Drop every account that was matched in the P1 MDU step; what remains is the
# base for the next mapping step.
MAPPED_STRT_HNUM_Df_list = list(MAPPED_STRT_HNUM_Df['ACCOUNT_NO'])  # values are expected to be strings already
is_already_mapped = astro_corrected['ACCOUNT_NO'].isin(MAPPED_STRT_HNUM_Df_list)
astro_unmapped = astro_corrected.loc[~is_already_mapped, :]
print('Astro Corrected shape right before end of P1 MDU mapping step and generating astro_unmapped: ', astro_corrected.shape)
print('After P1 MDU mapping step --> Astro Unmapped Shape : ', astro_unmapped.shape, ' & no of Unique Account No: ', astro_unmapped['ACCOUNT_NO'].nunique())


# ### ---------------------------------------------------------- P1 SDU Mapping ----------------------------------------------------------

## Using the unmapped base from P1 MDU mapping as the new astro_corrected
astro_corrected = astro_unmapped.copy()

isp_corrected.info()

print('Start of P1 SDU Mapping. First, check TM Corrected shape: ', isp_corrected.shape)
print('astro_corrected shape for P1 SDU Mapping (unmapped base after P1 MDU Mapping): ', astro_corrected.shape)

# Astro-side match key: HOUSE_NO + Street_1 + STD_CITY.
# map(str) turns missing values into the text "nan".
astro_corrected["HNUM_STRT"] = (
    astro_corrected["HOUSE_NO"].map(str) + " ,"
    + astro_corrected["Street_1"].map(str) + " ,"
    + astro_corrected["STD_CITY"].map(str)
)

# Normalise the key: drop "nan " (any case), then commas, dots and spaces,
# then upper-case. regex=True is explicit because the default of
# Series.str.replace changed in pandas 2.0; the single character class replaces
# the former four separate passes and removes exactly the same characters.
astro_corrected["HNUM_STRT"] = (
    astro_corrected["HNUM_STRT"]
    .str.replace('nan ', '', case=False, regex=True)
    .str.replace(r'[,. ]', '', regex=True)
    .str.upper()
)

print('Astro corrected shape after cleaning address columns & creating HNUM_STRT: ', astro_corrected.shape)

pd.set_option('display.max_columns', 500)

# ISP-side match key, built and normalised exactly like the Astro key:
# HouseNo + Street_1 + STD_CITY.
isp_corrected["HNUM_STRT_TM"] = (
    isp_corrected["HouseNo"].map(str) + " ,"
    + isp_corrected["Street_1"].map(str) + " ,"
    + isp_corrected["STD_CITY"].map(str)
)
isp_corrected["HNUM_STRT_TM"] = (
    isp_corrected["HNUM_STRT_TM"]
    .str.replace('nan ', '', case=False, regex=True)
    .str.replace(r'[,. ]', '', regex=True)
    .str.upper()
)

print('TM corrected shape after cleaning address columns & creating HNUM_STRT_TM: ', isp_corrected.shape)

# Defensive filter on null keys. Both keys are built from str() values, so
# nulls are not expected at this point.
astro_corrected = astro_corrected[astro_corrected.HNUM_STRT.notnull()]
isp_corrected = isp_corrected[isp_corrected.HNUM_STRT_TM.notnull()]

print('Astro corrected shape after filtering out null HNUM_STRT: ', astro_corrected.shape)
print('TM corrected shape after filtering out null HNUM_STRT_TM: ', isp_corrected.shape)

# Distinct ISP keys: all HNUM_STRT_TM values are joined, split on whitespace,
# and reduced to a set, so each key appears once in the lookup frame.
words = " ".join(isp_corrected['HNUM_STRT_TM']).split()
selection = set(words)
selection1 = list(selection)
selection2 = pd.DataFrame(selection1)
selection2.columns = ['MAPPED_HNUM_STRT_TM']

# Keep only Astro rows whose key exists on the ISP side
MAPPED_STRT_HNUM_Df = pd.merge(
    astro_corrected,
    selection2,
    how='inner',
    left_on='HNUM_STRT',
    right_on='MAPPED_HNUM_STRT_TM',
)

# HOUSE_NO must be present and neither empty nor a single blank
house_no_col = MAPPED_STRT_HNUM_Df['HOUSE_NO']
has_house_no = house_no_col.notnull() & (house_no_col != "") & (house_no_col != " ")
MAPPED_STRT_HNUM_Df = MAPPED_STRT_HNUM_Df[has_house_no]

# Street_1 must be present and not empty, blank, or the text "NAN"/"nan"
street_col = MAPPED_STRT_HNUM_Df['Street_1']
has_street = (
    street_col.notnull()
    & (street_col != "")
    & (street_col != " ")
    & (street_col != "NAN")
    & (street_col != "nan")
)
MAPPED_STRT_HNUM_Df = MAPPED_STRT_HNUM_Df[has_street]

# this step is to keep only ONE record for 1 address even if there are 2 Sources (Vendor or GAPI):
# sort by Source descending, keep the first row per (ACCOUNT_NO, HOUSE_NO),
# then the last remaining row per ACCOUNT_NO.
MAPPED_STRT_HNUM_Df = (
    MAPPED_STRT_HNUM_Df
    .sort_values(by='Source', ascending=False)
    .drop_duplicates(subset=['ACCOUNT_NO', 'HOUSE_NO'], keep='first')
    .drop_duplicates(subset='ACCOUNT_NO', keep='last')
)

print('P1 SDU MAPPED_STRT_HNUM_Df - Shape: ', MAPPED_STRT_HNUM_Df.shape, ' & Unique Acc_No: ', MAPPED_STRT_HNUM_Df['ACCOUNT_NO'].nunique())

P1_SDU = MAPPED_STRT_HNUM_Df.copy()  # explicit copy of the P1 SDU matches
print('P1_SDU dataframe shape: ', P1_SDU.shape)

# ### Generating UAMS format and getting ServiceType

# Work on an explicit copy of the matched rows
STRT_P1 = MAPPED_STRT_HNUM_Df.copy()
print('P1 SDU STRT_P1 dataframe shape: ', STRT_P1.shape)

STRT_P1['Street_1'] = STRT_P1['Street_1'].astype(str)

STRT_P1.reset_index(inplace=True, drop=True)
# Blank out Street_1 values that start with "AA". Vectorised str.startswith
# replaces the former per-row lambda; `test` keeps its original meaning
# (list of affected row labels).
starts_with_aa = STRT_P1['Street_1'].str.startswith('AA')
test = list(STRT_P1.loc[starts_with_aa, :].index)
STRT_P1.loc[test, 'Street_1'] = ''

# Rows flagged 'Match' get an empty Street_2
STRT_P1.loc[STRT_P1['match'] == 'Match', 'Street_2'] = ''

import re

# Street-type keywords. Alternatives are tried left to right at each position,
# so LEBUHRAYA must come before its prefix LEBUH; otherwise "LEBUHRAYA ..."
# is reported as "LEBUH".
_STREET_TYPE_RE = re.compile(
    "JALAN|LORONG|CHANGKAT|LAMAN|LAHAT|LEBUHRAYA|LEBUH|LENGKOK|LINGKARAN|PERSIARAN"
)


def extract_street(item):
    """Return the first street-type keyword found in *item*.

    The search is case-sensitive and matches the keyword anywhere in the
    string. Returns an empty string when no keyword is present.
    """
    m = _STREET_TYPE_RE.search(item)
    return m.group() if m else ""
        
# Street type of the primary street
STRT_P1["Street_Type_1"] = STRT_P1["Street_1"].apply(extract_street)

# Upper-case Street_2 BEFORE extracting its type: extract_street is
# case-sensitive, so extracting first returned "" for mixed-case values while
# the type was still stripped from Street_2_New below.
STRT_P1["Street_2"] = STRT_P1["Street_2"].str.upper()
STRT_P1["Street_Type_2"] = STRT_P1["Street_2"].map(str).apply(extract_street)

# Street names with the street-type word (and its trailing space) removed.
# regex=True is explicit: the pattern is an alternation and the default of
# Series.str.replace changed in pandas 2.0.
street_type_list = ['JALAN ', 'LORONG ','CHANGKAT ', 'LAMAN ', 'LAHAT ', 'LEBUH ', 'LEBUHRAYA ', 'LENGKOK ','LINGKARAN ', 'PERSIARAN ' ]
street_type_pattern = '|'.join(street_type_list)
STRT_P1["Street_1_New"] = STRT_P1["Street_1"].str.replace(street_type_pattern, '', regex=True)
STRT_P1["Street_2_New"] = STRT_P1["Street_2"].str.replace(street_type_pattern, '', regex=True)

# Getting the ServiceType: distinct (ServiceType, key) pairs from the ISP side,
# upper-cased, without 'ERROR' entries
service_list = isp_corrected.loc[:, ['ServiceType','HNUM_STRT_TM']].drop_duplicates()
service_list["ServiceType"] = service_list["ServiceType"].str.upper()
service_list = service_list[service_list['ServiceType']!='ERROR']

# Left join keeps every matched Astro row even when no ServiceType is found
New_fields1 = pd.merge(STRT_P1,service_list,left_on ='HNUM_STRT',right_on='HNUM_STRT_TM', how = 'left')
New_fields1.info()

# MDU NEW: columns carried into the UAMS layout. The explicit copy avoids
# SettingWithCopyWarning when 'Servicable' is added.
New_fields2 = New_fields1[['ACCOUNT_NO','service_add_objid','ASTRO_HOUSE_NO1',
                           'Combined_Building','Street_Type_1','Street_1_New','Standard_Building_Name',
                           'Street_Type_2','Street_2_New','AREA','POSTCODE','STD_CITY','ASTRO_STATE', 'ServiceType','HNUM_STRT_TM']].copy()
New_fields2['Servicable'] = str(ISP_Name)

# One row per account: after sorting by ServiceType the first row wins
New_fields3 = New_fields2.sort_values(['ServiceType']).drop_duplicates(subset= 'ACCOUNT_NO', keep = 'first')
astro_cleaned = New_fields3.copy()
print('P1 SDU astro_cleaned shape: ', astro_cleaned.shape, '& New_fields3 shape: ', New_fields3.shape)


# Fix HOUSE_NO that are converted to date: a month abbreviation next to a
# hyphen is replaced by its two-digit month number ("JAN-" -> "01-",
# "-JAN" -> "-01", ...), in calendar order and ignoring case.
# The former upper-case and title-case variants of each call were identical
# because case=False was set, so one pair of calls per month is sufficient.
# regex=True is explicit because the default of Series.str.replace changed in
# pandas 2.0.
month_numbers = (
    ("JAN", "01"), ("FEB", "02"), ("MAR", "03"), ("APR", "04"),
    ("MAY", "05"), ("JUN", "06"), ("JUL", "07"), ("AUG", "08"),
    ("SEP", "09"), ("OCT", "10"), ("NOV", "11"), ("DEC", "12"),
)
house_no_fixed = astro_cleaned['ASTRO_HOUSE_NO1']
for month_abbr, month_number in month_numbers:
    house_no_fixed = house_no_fixed.str.replace(month_abbr + "-", month_number + "-", case=False, regex=True)
    house_no_fixed = house_no_fixed.str.replace("-" + month_abbr, "-" + month_number, case=False, regex=True)
astro_cleaned['HOUSE_NO'] = house_no_fixed

# Fix HOUSE_NO that are converted to date (DD/MM/YYYY format)
# Raw string so `\/` and `\d` reach the regex engine unchanged; the mask is
# computed once and reused for both halves. na=False treats missing values as
# "not a date", as the previous `== True` comparison did.
date_like_pattern = r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$'
is_date_like = astro_cleaned['HOUSE_NO'].str.match(date_like_pattern, na=False)

# Date-like rows: rebuild HOUSE_NO as "<chars 0-1>-<chars 3-4>-<chars 8-9>"
# (explicit copies avoid SettingWithCopyWarning on the filtered slices)
date_house = astro_cleaned[is_date_like].copy()
date_house['block'] = date_house['HOUSE_NO'].str[0:2]
date_house['floor'] = date_house['HOUSE_NO'].str[3:5]
date_house['unit'] = date_house['HOUSE_NO'].str[8:10]
date_house['HOUSE_NO_ASTRO'] = date_house['block'] + "-" + date_house['floor'] + "-" + date_house['unit']

# All other rows keep HOUSE_NO unchanged
not_date_house = astro_cleaned[~is_date_like].copy()
not_date_house['HOUSE_NO_ASTRO'] = not_date_house['HOUSE_NO']

# Append the 2 df again (date-like rows first, then the rest)
frame = [date_house, not_date_house]
astro_cleaned = pd.concat(frame)

# Remove the helper columns that were only needed to rebuild HOUSE_NO
astro_cleaned = astro_cleaned.drop(['block', 'floor', 'unit'], axis=1)
astro_cleaned.info()

# Pad the rebuilt house number to a width of 10 characters
astro_cleaned['ASTRO_HOUSE_NO1'] = astro_cleaned['HOUSE_NO_ASTRO'].str.pad(width=10)


# MDU NEW: select the columns of the UAMS layout for the P1 SDU output
astro_cleaned2 = astro_cleaned[['ACCOUNT_NO', 'service_add_objid', 'ASTRO_HOUSE_NO1',
                                'Combined_Building', 'Street_Type_1', 'Street_1_New', 'Street_Type_2',
                                'Street_2_New', 'AREA', 'STD_CITY', 'POSTCODE', 'ASTRO_STATE',
                                'Standard_Building_Name',
                                'ServiceType', 'Servicable', 'HNUM_STRT_TM']]

UAMS_SDU_Base = astro_cleaned2.copy()  # explicit copy so the edits below never touch astro_cleaned2
print('P1 SDU UAMS_SDU_BASE df shape: ', UAMS_SDU_Base.shape, ' & astro_cleaned2 df shape: ', astro_cleaned2.shape)

# Account numbers as text, without the ".0" suffix that str() gives a float.
# - plain column assignment (not .loc[:, col]) so the column dtype is replaced
#   rather than written into the existing values;
# - regex=True is explicit because the default of Series.str.replace changed in
#   pandas 2.0;
# - the pattern is anchored so that only a trailing ".0" is removed.
UAMS_SDU_Base['ACCOUNT_NO'] = (
    UAMS_SDU_Base['ACCOUNT_NO'].astype(str).str.replace(r'\.0$', '', regex=True)
)

print('P1 SDU UAMS_SDU_Base shape after converting ACC_NO col to str type: ', UAMS_SDU_Base.shape)

# One row per account; the first occurrence wins
UAMS_SDU_Base = UAMS_SDU_Base.drop_duplicates(subset=['ACCOUNT_NO'], keep='first')
print('P1 SDU UAMS_SDU_Base shape AFTER dedupe on ACC_NO, keep first: ', UAMS_SDU_Base.shape)

UAMS_SDU_Base = UAMS_SDU_Base.rename({'ASTRO_HOUSE_NO1': 'House_No',
                                      'ACCOUNT_NO': 'Account_No'}, axis=1)

# Two copies are written: a fixed-name file (18/11/22: added for easier
# automation; 5/12/22: gzip) and a dated file under historical_folder/.
print('this is p1 sdu: ', UAMS_SDU_Base.shape)
wr.s3.to_csv(df = UAMS_SDU_Base, path = uams_sdu_path + 'UAMS_Format_stndrd_' + str(ISP_Name)+ '_P1_SDU.csv.gz', compression='gzip', index=False)
wr.s3.to_csv(df = UAMS_SDU_Base, path = uams_sdu_path + 'historical_folder/UAMS_Format_stndrd_' + str(ISP_Name)+ '_P1_SDU_' + str(curr_date) + '.csv.gz', compression='gzip', index=False)


# Drop every account that was matched in the P1 SDU step; what remains is the
# base for the P2 mapping step.
MAPPED_STRT_HNUM_Df_list = list(MAPPED_STRT_HNUM_Df['ACCOUNT_NO'])
astro_unmapped = astro_corrected.loc[~astro_corrected['ACCOUNT_NO'].isin(MAPPED_STRT_HNUM_Df_list), :]
# Log label corrected: this is the end of the P1 SDU step (it previously said "P1 MDU").
print('Astro Corrected shape right before end of P1 SDU mapping step and generating astro_unmapped: ', astro_corrected.shape)
print('After P1 SDU mapping step --> Astro Unmapped Shape : ', astro_unmapped.shape, ' & no of Unique Account No: ', astro_unmapped.ACCOUNT_NO.nunique())

# NOTE(review): purpose of MDU_IN_SDU is unclear; no use is visible later in
# this job. It may be a leftover from a local notebook; confirm before removing.
MDU_IN_SDU = UAMS_SDU_Base[UAMS_SDU_Base['Standard_Building_Name']!='']


# ### ---------------------------------------------------------- P2 Mapping - SDU ----------------------------------------------------------

## Using the unmapped base from P1 SDU mapping as the new astro_corrected
astro_corrected = astro_unmapped.copy()

isp_corrected.info()

print('Start of P2 SDU Mapping. First, check TM Corrected shape: ', isp_corrected.shape)
print('astro_corrected shape for P2 SDU Mapping (unmapped base after P1 SDU Mapping): ', astro_corrected.shape)

# Astro-side match key for P2: Street_1 + STD_CITY only (no house number).
# map(str) turns missing values into the text "nan".
astro_corrected["HNUM_STRT"] = (
    astro_corrected["Street_1"].map(str) + " ,"
    + astro_corrected["STD_CITY"].map(str)
)

# Normalise the key: drop "nan " (any case), then commas, dots and spaces,
# then upper-case. regex=True is explicit because the default of
# Series.str.replace changed in pandas 2.0; the single character class replaces
# the former four separate passes and removes exactly the same characters.
astro_corrected["HNUM_STRT"] = (
    astro_corrected["HNUM_STRT"]
    .str.replace('nan ', '', case=False, regex=True)
    .str.replace(r'[,. ]', '', regex=True)
    .str.upper()
)

print('Astro corrected shape after cleaning address columns & creating HNUM_STRT: ', astro_corrected.shape)

pd.set_option('display.max_columns', 500)

# ISP-side match key for P2, built and normalised exactly like the Astro key:
# Street_1 + STD_CITY only. This overwrites the P1 SDU key in HNUM_STRT_TM.
isp_corrected["HNUM_STRT_TM"] = (
    isp_corrected["Street_1"].map(str) + " ,"
    + isp_corrected["STD_CITY"].map(str)
)
isp_corrected["HNUM_STRT_TM"] = (
    isp_corrected["HNUM_STRT_TM"]
    .str.replace('nan ', '', case=False, regex=True)
    .str.replace(r'[,. ]', '', regex=True)
    .str.upper()
)

print('TM corrected shape after cleaning address columns & creating HNUM_STRT_TM: ', isp_corrected.shape)

# Defensive filter on null keys. Both keys are built from str() values, so
# nulls are not expected at this point.
astro_corrected = astro_corrected[astro_corrected.HNUM_STRT.notnull()]
isp_corrected = isp_corrected[isp_corrected.HNUM_STRT_TM.notnull()]

print('Astro corrected shape after filtering out null HNUM_STRT: ', astro_corrected.shape)
print('TM corrected shape after filtering out null HNUM_STRT_TM: ', isp_corrected.shape)


# Distinct ISP keys: all HNUM_STRT_TM values are joined, split on whitespace,
# and reduced to a set, so each key appears once in the lookup frame.
words = " ".join(isp_corrected['HNUM_STRT_TM']).split()
selection = set(words)
selection1 = list(selection)
selection2 = pd.DataFrame(selection1)
selection2.columns = ['MAPPED_HNUM_STRT_TM']

# Keep only Astro rows whose key exists on the ISP side
MAPPED_STRT_HNUM_Df = pd.merge(
    astro_corrected,
    selection2,
    how='inner',
    left_on='HNUM_STRT',
    right_on='MAPPED_HNUM_STRT_TM',
)

# Street_1 must be present and not empty, blank, or the text "NAN"/"nan"
street_col = MAPPED_STRT_HNUM_Df['Street_1']
has_street = (
    street_col.notnull()
    & (street_col != "")
    & (street_col != " ")
    & (street_col != "NAN")
    & (street_col != "nan")
)
MAPPED_STRT_HNUM_Df = MAPPED_STRT_HNUM_Df[has_street]

# this step is to keep only ONE record for 1 address even if there are 2 Sources (Vendor or GAPI):
# sort by Source descending, keep the first row per (ACCOUNT_NO, HOUSE_NO),
# then the last remaining row per ACCOUNT_NO.
MAPPED_STRT_HNUM_Df = (
    MAPPED_STRT_HNUM_Df
    .sort_values(by='Source', ascending=False)
    .drop_duplicates(subset=['ACCOUNT_NO', 'HOUSE_NO'], keep='first')
    .drop_duplicates(subset='ACCOUNT_NO', keep='last')
)


print('P2 SDU MAPPED_STRT_HNUM_Df - Shape: ', MAPPED_STRT_HNUM_Df.shape, ' & Unique Acc_No: ', MAPPED_STRT_HNUM_Df['ACCOUNT_NO'].nunique())

P2_SDU = MAPPED_STRT_HNUM_Df.copy()  # explicit copy of the P2 SDU matches
print('P2_SDU dataframe shape: ', P2_SDU.shape)


# ### Generating UAMS format and getting ServiceType

# Work on an explicit copy of the matched rows
STRT_P1 = MAPPED_STRT_HNUM_Df.copy()
print('P2 SDU STRT_P1 dataframe shape: ', STRT_P1.shape)

STRT_P1['Street_1'] = STRT_P1['Street_1'].astype(str)

STRT_P1.reset_index(inplace=True, drop=True)
# Blank out Street_1 values that start with "AA". Vectorised str.startswith
# replaces the former per-row lambda; `test` keeps its original meaning
# (list of affected row labels).
starts_with_aa = STRT_P1['Street_1'].str.startswith('AA')
test = list(STRT_P1.loc[starts_with_aa, :].index)
STRT_P1.loc[test, 'Street_1'] = ''

# Rows flagged 'Match' get an empty Street_2
STRT_P1.loc[STRT_P1['match'] == 'Match', 'Street_2'] = ''

import re

# Street-type keywords. Alternatives are tried left to right at each position,
# so LEBUHRAYA must come before its prefix LEBUH; otherwise "LEBUHRAYA ..."
# is reported as "LEBUH".
_STREET_TYPE_RE = re.compile(
    "JALAN|LORONG|CHANGKAT|LAMAN|LAHAT|LEBUHRAYA|LEBUH|LENGKOK|LINGKARAN|PERSIARAN"
)


def extract_street(item):
    """Return the first street-type keyword found in *item*.

    The search is case-sensitive and matches the keyword anywhere in the
    string. Returns an empty string when no keyword is present.
    """
    m = _STREET_TYPE_RE.search(item)
    return m.group() if m else ""
        
# Street type of the primary street
STRT_P1["Street_Type_1"] = STRT_P1["Street_1"].apply(extract_street)

# Upper-case Street_2 BEFORE extracting its type: extract_street is
# case-sensitive, so extracting first returned "" for mixed-case values while
# the type was still stripped from Street_2_New below.
STRT_P1["Street_2"] = STRT_P1["Street_2"].str.upper()
STRT_P1["Street_Type_2"] = STRT_P1["Street_2"].map(str).apply(extract_street)

# Street names with the street-type word (and its trailing space) removed.
# regex=True is explicit: the pattern is an alternation and the default of
# Series.str.replace changed in pandas 2.0.
street_type_list = ['JALAN ', 'LORONG ','CHANGKAT ', 'LAMAN ', 'LAHAT ', 'LEBUH ', 'LEBUHRAYA ', 'LENGKOK ','LINGKARAN ', 'PERSIARAN ' ]
street_type_pattern = '|'.join(street_type_list)
STRT_P1["Street_1_New"] = STRT_P1["Street_1"].str.replace(street_type_pattern, '', regex=True)
STRT_P1["Street_2_New"] = STRT_P1["Street_2"].str.replace(street_type_pattern, '', regex=True)

# Getting the ServiceType: distinct (ServiceType, key) pairs from the ISP side,
# upper-cased, without 'ERROR' entries
service_list = isp_corrected.loc[:, ['ServiceType','HNUM_STRT_TM']].drop_duplicates()
service_list["ServiceType"] = service_list["ServiceType"].str.upper()
service_list = service_list[service_list['ServiceType']!='ERROR']

# Left join keeps every matched Astro row even when no ServiceType is found
New_fields1 = pd.merge(STRT_P1,service_list,left_on ='HNUM_STRT',right_on='HNUM_STRT_TM', how = 'left')
New_fields1.info()

# MDU NEW: columns carried into the UAMS layout. The explicit copy avoids
# SettingWithCopyWarning when 'Servicable' is added.
New_fields2 = New_fields1[['ACCOUNT_NO','service_add_objid','ASTRO_HOUSE_NO1',
                           'Combined_Building','Street_Type_1','Street_1_New','Standard_Building_Name',
                           'Street_Type_2','Street_2_New','AREA','POSTCODE','STD_CITY','ASTRO_STATE', 'ServiceType','HNUM_STRT_TM']].copy()
New_fields2['Servicable'] = str(ISP_Name)

# One row per account: after sorting by ServiceType the first row wins
New_fields3 = New_fields2.sort_values(['ServiceType']).drop_duplicates(subset= 'ACCOUNT_NO', keep = 'first')


import pandas as pd

astro_cleaned = New_fields3.copy()
print('P2 SDU astro_cleaned shape: ', astro_cleaned.shape, '& New_fields3 shape: ', New_fields3.shape)


# Fix HOUSE_NO that are converted to date: a month abbreviation next to a
# hyphen is replaced by its two-digit month number ("JAN-" -> "01-",
# "-JAN" -> "-01", ...), in calendar order and ignoring case.
# The former upper-case and title-case variants of each call were identical
# because case=False was set, so one pair of calls per month is sufficient.
# regex=True is explicit because the default of Series.str.replace changed in
# pandas 2.0.
month_numbers = (
    ("JAN", "01"), ("FEB", "02"), ("MAR", "03"), ("APR", "04"),
    ("MAY", "05"), ("JUN", "06"), ("JUL", "07"), ("AUG", "08"),
    ("SEP", "09"), ("OCT", "10"), ("NOV", "11"), ("DEC", "12"),
)
house_no_fixed = astro_cleaned['ASTRO_HOUSE_NO1']
for month_abbr, month_number in month_numbers:
    house_no_fixed = house_no_fixed.str.replace(month_abbr + "-", month_number + "-", case=False, regex=True)
    house_no_fixed = house_no_fixed.str.replace("-" + month_abbr, "-" + month_number, case=False, regex=True)
astro_cleaned['HOUSE_NO'] = house_no_fixed

# Fix HOUSE_NO that are converted to date (DD/MM/YYYY format)
# Raw string so `\/` and `\d` reach the regex engine unchanged; the mask is
# computed once and reused for both halves. na=False treats missing values as
# "not a date", as the previous `== True` comparison did.
date_like_pattern = r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$'
is_date_like = astro_cleaned['HOUSE_NO'].str.match(date_like_pattern, na=False)

# Date-like rows: rebuild HOUSE_NO as "<chars 0-1>-<chars 3-4>-<chars 8-9>"
# (explicit copies avoid SettingWithCopyWarning on the filtered slices)
date_house = astro_cleaned[is_date_like].copy()
date_house['block'] = date_house['HOUSE_NO'].str[0:2]
date_house['floor'] = date_house['HOUSE_NO'].str[3:5]
date_house['unit'] = date_house['HOUSE_NO'].str[8:10]
date_house['HOUSE_NO_ASTRO'] = date_house['block'] + "-" + date_house['floor'] + "-" + date_house['unit']

# All other rows keep HOUSE_NO unchanged
not_date_house = astro_cleaned[~is_date_like].copy()
not_date_house['HOUSE_NO_ASTRO'] = not_date_house['HOUSE_NO']

# Append the 2 df again (date-like rows first, then the rest)
frame = [date_house, not_date_house]
astro_cleaned = pd.concat(frame)

# Remove the helper columns that were only needed to rebuild HOUSE_NO
astro_cleaned = astro_cleaned.drop(['block', 'floor', 'unit'], axis=1)
astro_cleaned.info()

# Pad the rebuilt house number to a width of 10 characters
astro_cleaned['ASTRO_HOUSE_NO1'] = astro_cleaned['HOUSE_NO_ASTRO'].str.pad(width=10)


#MDU NEW
astro_cleaned2 = astro_cleaned[['ACCOUNT_NO','service_add_objid', 'ASTRO_HOUSE_NO1', 
                                   'Combined_Building','Street_Type_1','Street_1_New','Street_Type_2',
                                   'Street_2_New',  'AREA','STD_CITY' ,'POSTCODE', 'ASTRO_STATE',
                                   'Standard_Building_Name',
                                   'ServiceType', 'Servicable', 'HNUM_STRT_TM']]


# Build the P2 SDU output frame on an independent copy so the edits below
# never touch astro_cleaned2 (Amzar 9/9/2022 --> added copy()).
UAMS_MDU_Base = astro_cleaned2.copy()
print('P2 SDU UAMS_MDU_BASE (might need to change df name) shape: ', UAMS_MDU_Base.shape, ' & astro_cleaned2 df shape: ', astro_cleaned2.shape) # Amzar 9/9/2022 --> added new print statement to compare

# Convert ACCOUNT_NO to text and remove the ".0" left over from float formatting.
# - regex=True is spelled out because the default of Series.str.replace changed
#   to regex=False in pandas 2.0; without it '\.0' would be searched literally
#   (backslash included) and nothing would be stripped.
# - The raw string avoids the invalid "\." escape of the previous literal.
# - UAMS_MDU_Base is an explicit copy, so plain column assignment is safe here
#   and simply replaces the column with the new string values.
# NOTE(review): the pattern is unanchored, so every ".0" occurrence is removed,
# not only a trailing one -- consider r'\.0$' if only the suffix is intended.
UAMS_MDU_Base['ACCOUNT_NO'] = (
    UAMS_MDU_Base['ACCOUNT_NO']
    .astype(str)
    .str.replace(r'\.0', '', regex=True)
)

print('P2 SDU UAMS_MDU_BASE shape after converting ACC_NO col to str type: ', UAMS_MDU_Base.shape) # Amzar 9/9/2022 --> added more text to the print statement

# Unlike the earlier mapping steps, ACCOUNT_NO is deliberately NOT de-duplicated here.
print('For P2 SDU, there is no dedupe on ACC_NO as it is the final mapping step') # Amzar 9/9/2022 --> added new print statement to describe why no de-dupe

# Rename to the column names used in the exported file
UAMS_MDU_Base = UAMS_MDU_Base.rename(
    columns={'ASTRO_HOUSE_NO1': 'House_No', 'ACCOUNT_NO': 'Account_No'}
)

print('this is p2 sdu: ', UAMS_MDU_Base.shape)

# Write the result twice to S3 as gzip-compressed CSV:
#   1. a fixed-name file (added 18/11/22 for easier automation; .gz added 5/12/22)
#   2. a dated copy under historical_folder/
p2_sdu_file_stem = 'UAMS_Format_stndrd_' + str(ISP_Name) + '_P2_SDU'
wr.s3.to_csv(
    df=UAMS_MDU_Base,
    path=uams_sdu_path + p2_sdu_file_stem + '.csv.gz',
    compression='gzip',
    index=False,
)
wr.s3.to_csv(
    df=UAMS_MDU_Base,
    path=uams_sdu_path + 'historical_folder/' + p2_sdu_file_stem + '_' + str(curr_date) + '.csv.gz',
    compression='gzip',
    index=False,
)

# Amzar 9/9/2022 --> Added this paragraph to see how many acc no left unmapped (copied from P1 SDU)
# Keep only the astro_corrected rows whose ACCOUNT_NO is absent from the mapped set.
# isin() accepts the column directly, so no intermediate Python list is built.
mapped_account_nos = MAPPED_STRT_HNUM_Df['ACCOUNT_NO']
astro_unmapped = astro_corrected.loc[~astro_corrected['ACCOUNT_NO'].isin(mapped_account_nos)]
# NOTE(review): the first message still says "P1 SDU" although the next one
# reports the P2 SDU step -- presumably a leftover from the copied paragraph; confirm.
print('Astro Corrected shape right before end of P1 SDU mapping step and generating astro_unmapped: ', astro_corrected.shape) 
print('After P2 SDU mapping step --> Final leftover Astro Unmapped Shape : ', astro_unmapped.shape, ' & no of Unique Account No: ', astro_unmapped.ACCOUNT_NO.nunique())

# Peak resident set size of this process. Linux reports ru_maxrss in kilobytes,
# so divide by 1024 to print the megabytes that the label below announces.
usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
print('[debug] memory usage is (Megabytes):')
print(usage)

print('end')

