# 1. Initialization

In [1]:
import os
import gc
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import xml.dom.minidom as dom
import cx_Oracle as ora
import pyodbc as sql
import re
from datetime import datetime as dt
from sqlalchemy import create_engine

# Report the library versions in use so a run can be traced back to its environment.
print(f'Pandas version: {pd.__version__}')
# `ora` is the cx_Oracle module (see the imports above); the label now matches the library name.
print(f'cx_Oracle version: {ora.__version__}')

Pandas version: 1.5.2
cs-Oracle version: 8.3.0


### 1.1 Set Parameters

In [2]:
# Set Parameters

# Target environment: keep exactly one of the assignments below active.
#env = 'QA'
#env = 'STG'
env = 'PRD'

# Project root and the sub-folders (relative to it) used for CSV extracts and generated scripts.
path = 'C:\\Users\\aolguin\\Projects\\CVE\\'
extract_path = 'Extracts\\'
xml_path = 'Build\\OI Scripts\\'
# UNC share used as the root of the target file names.
# A UNC path starts with two backslashes, i.e. four in a regular string literal; the previous
# spelling ('\\\w...') relied on the invalid escape sequence '\w', which newer Python versions warn about.
upload_path = '\\\\wsidm009pd\\CS10\\'
mapping_file = 'Build\\Mappings\\Mapping Tables.xlsx'

# Root path prefix of each source collection, as it appears in the extracted `path` column.
source_path = {
    'FinancialImages_AD': ':Financial Images_Migrated to CDMS & redirecting:',
    'CorporateFinance': ':Corporate Finance_NOMIGRATE:',
    'JDEFinancialPDFs': ':JDE Financial PDFs and Reports_TO BE ARCHIVED:',
    'FinancialImages_UO': ':Financial Images:',
    'IncomeAccounting': ':Royalty Income Accounting:',
    'JDEAttachments': ':JDE Attachments:',
}

# Path prefix that replaces the matching `source_path` entry when target locations are built.
target_path = {
    'FinancialImages_AD': 'Enterprise:Department Administration:Finance:z-Archives:(Application Data) Financial Images:',
    'CorporateFinance': 'Enterprise:Department Administration:Finance:z-Archives:(Corporate Finance) Comptrollers Corporate Finance:',
    'JDEFinancialPDFs': 'Enterprise:Department Administration:Finance:z-Archives:(Corporate Finance) JDE Financial PDFs:',
    'FinancialImages_UO': 'Enterprise:Department Administration:Finance:z-Archives:(Upstream Operations) Central Services Financial Images:',
    'IncomeAccounting': 'Enterprise:Department Administration:Finance:z-Archives:(Upstream Operations) Central Services Royalty Income Accounting:',
    'JDEAttachments': 'Enterprise:Department Administration:Finance:z-Archives:(Upstream Operations) Supply Mgmt JDE Attachments:',
}

# DataID of each collection's root node; used as the START WITH value of the hierarchical queries.
source_parentid = {
    'FinancialImages_AD': 67272049,
    'CorporateFinance': 1221831,
    'JDEFinancialPDFs': 257997,
    'FinancialImages_UO': 1234487,
    'IncomeAccounting': 177197,
    'JDEAttachments': 171378752,
}

# Module-level containers shared (as globals) by the functions defined below.
# df_extract:     one DataFrame per collection name, filled by extract_data()
# df_cat_extract: one DataFrame per category name, filled by extract_category_data()
# df_report:      validation reports, filled by the validation cell
# df_xml:         one DataFrame per subtype, filled by map_to_xml()
df_extract = {}
df_cat_extract = {}
df_report = {}
df_xml = {}

target_folder = 'Enterprise:Department Administration:Finance:z-Archives:'
# Rows per batch; apply_transformation() numbers batches within each subtype using this size.
batch_size = 20000

# NOTE(security): the connection strings below embed user names and passwords in clear text.
# They should be read from the environment or a secrets store, and rotated if this notebook
# has been shared or committed anywhere.
cnxn_str = {'cdms': 'Driver={SQL Server Native Client 11.0}; Server=csagprd01list; Database=ContentServer; UID=csReadOnly; PWD=Ry62sW781010@@',
#            'cve': 'cs10/cs10tqa55@ORAPRD24:1521/ORAPRD24?encoding=UTF-8&nencoding=UTF-8'
            'cve': 'cs10:cs10tqa55@ORAPRD24:1521/ORAPRD24?encoding=UTF-8&nencoding=UTF-8'
           }
# Scratch dictionary in which the extract functions store the SQL text they build.
query = {}

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

Oct 31 2023 12:42:57 PM: Done


In [3]:
# Create Database Engines
# Earlier variant, kept for reference:
#ora_engine = create_engine('oracle://' + cnxn_str['cve'], use_nchar_for_unicode=True, coerce_to_unicode=False, text_encoding_errors='replace')
# Oracle engine built from the 'cve' connection string.
ora_engine = create_engine(f"oracle://{cnxn_str['cve']}", encoding_errors='replace')
print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

Oct 31 2023 12:42:57 PM: Done


### 1.2 Define Functions

#### extract_data()

In [4]:
def extract_data (collection:str, folder_name: str, dataid: int, save_to_file: bool):
    """Load one collection of a folder tree from the database into ``df_extract``.

    Both hierarchical queries (``'containers'`` and ``'contents'``) are built and stored in the
    global ``query`` dictionary; the one named by ``collection`` is executed through the global
    ``ora_engine`` and its result is stored in ``df_extract[collection]``.

    Parameters
    ----------
    collection : str
        ``'containers'`` or ``'contents'``; selects the query to run and the key in ``df_extract``.
    folder_name : str
        Label used in the log lines and in the CSV file name.
    dataid : int
        DataID of the root node the hierarchical query starts from.
    save_to_file : bool
        When true, the result is also written to
        ``path + extract_path + 'Extracted_<folder_name>_<collection>.csv'``.

    Raises
    ------
    KeyError
        If ``collection`` is not one of the two supported names.
    TypeError, ValueError
        If ``dataid`` cannot be converted to an integer.
    """
    # The id is interpolated into the SQL text, so force it to be a plain integer first.
    dataid = int(dataid)

    # Build query strings
    query['containers'] = f"""
            SELECT  dt.DataId
                    ,dt.VersionNum
                    ,dt.ParentId
                    ,dt.Name
                    ,nn.NickName
                    ,dt.dcomment
                    ,dt.createdate
                    ,dt.modifydate
                    ,cb.name        createdby
                    ,CASE SubType   WHEN 136 THEN 'compounddoc'    WHEN 557 THEN 'compoundemail'
                                    WHEN 0 THEN 'folder'           WHEN 202 THEN 'project' 
                                    WHEN 751 THEN 'emailfolder'    END SubType
            --                ,REGEXP_REPLACE(Name, '[^[:print:]]', '') Name
                    ,LEVEL FolderLevel
                    ,SYS_CONNECT_BY_PATH(REPLACE(dt.Name, ':', ' '), ':') path
            FROM    DTree dt
            LEFT OUTER JOIN NickName nn ON nn.Id = dt.DataId
            LEFT OUTER JOIN Kuaf cb ON dt.createdby = cb.Id
            WHERE   SubType IN (0, 751, 202, 136, 557)
            START WITH DataID = {dataid}
            CONNECT BY NOCYCLE PRIOR DataID = ParentID
            """
    query['contents'] = f"""
            SELECT  dt.DataId
                    ,dt.VersionNum
                    ,dt.ParentId
                    ,CASE dt.SubType WHEN 1 THEN 'alias'              WHEN 140 THEN 'url'
                                     WHEN 144 THEN 'document'            WHEN 749 THEN 'email'      END SubType
                    ,REPLACE(REGEXP_REPLACE(dt.Name, '[^[:print:]]', ''), '?', '') Name
                    ,nn.NickName
                    ,dt.dcomment
                    ,dt.createdate
                    ,dt.modifydate
                    ,cb.name        createdby
                    ,dt.exatt1      url
                    ,dt.origindataid
                    ,LEVEL FolderLevel
                    ,SYS_CONNECT_BY_PATH(REPLACE(dt.Name, ':', ' '), ':') path
                    ,vd.VersionId
                    ,vd.Version
                    ,REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(filename, '[^[:print:]]', ''), '?', ''), '’', ''), '%', '') FileName
                    ,vd.MimeType
                    ,vd.DataSize
                    ,vd.VercDate
                    ,vd.VermDate
                    ,pd.ProviderData
            FROM    DTree dt
            LEFT OUTER JOIN NickName nn ON nn.Id = dt.DataId
            LEFT OUTER JOIN Kuaf cb ON dt.createdby = cb.Id
            LEFT OUTER JOIN (SELECT * FROM DVersData WHERE VerType IS NULL) vd ON vd.DocId = dt.DataId AND vd.Version = dt.VersionNum
            LEFT OUTER JOIN ProviderData pd ON pd.ProviderId = vd.ProviderId
            WHERE   dt.SubType IN (1, 144, 140, 749)
            START WITH dt.DataID = {dataid}
            CONNECT BY NOCYCLE PRIOR dt.DataID = dt.ParentID
            """

    # Upload Data from Database
    # The context manager returns the connection to the pool even when the query fails.
    with ora_engine.connect() as cnxn:
        df_extract[collection] = pd.read_sql(query[collection], cnxn)
    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     Loaded {folder_name} = {df_extract[collection].shape[0]} rows')
    if save_to_file:
        df_extract[collection].to_csv(path + extract_path + 'Extracted_' + folder_name + '_' + collection + '.csv', index=False)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     Saving to csv {folder_name}= {df_extract[collection].shape[0]} rows')

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

Oct 31 2023 12:42:59 PM: Done


#### extract_category_data()

In [5]:
def extract_category_data (category: dict):
    catid = category['catid']
    catname = category['catname']
    df_cat_extract[catname] = pd.DataFrame()
    for attribute in [i for i in category if i not in ['catid', 'catname']]:
        attid = category[attribute][0]
        colname = category[attribute][1]
        query['cat_att'] = f"""
                SELECT  Id
                        ,VerNum
                        ,{colname} "{attribute}"
                FROM    LLAttrData 
                WHERE   defid = {catid}
                AND     AttrId = {attid}
                AND     {colname} IS NOT NULL
                """        
        # Upload Data from Database
        cnxn = ora_engine.connect()
        df = pd.read_sql(query['cat_att'], cnxn)
        if df_cat_extract[catname].empty:
            df_cat_extract[catname] = df
        else:
            df_cat_extract[catname] = df_cat_extract[catname].merge(df, how='outer', on=['id', 'vernum'], copy=False) 
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     Loaded {attribute} = {df.shape[0]} rows')
        cnxn.close()

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

Oct 31 2023 12:43:00 PM: Done


#### apply_transformation()

In [6]:
def _compact_timestamp(value) -> str:
    """Return ``str(value)`` with every '-', ' ' and ':' removed."""
    return str(value).replace('-', '').replace(' ', '').replace(':', '')


def apply_transformation(collection:str):
    """Add the derived migration columns to ``df_extract[collection]``.

    Reads the globals ``df_extract``, ``batch_size``, ``folder``, ``source_path``,
    ``target_path`` and ``upload_path``; ``folder`` must therefore be set before the call.
    The transformed frame replaces ``df_extract[collection]`` once every column is computed.

    Columns added for every collection:
      * ``batch``      - 1-based batch number, counted per ``subtype`` in chunks of ``batch_size``
      * ``c_location`` - ``path`` with the leading source root replaced by the target root and
                         the last path segment removed
      * ``c_metadata`` - migration date, modified date, creator and (when present) nickname

    ``'containers'`` additionally gets ``c_createdate``/``c_modifydate``; ``'contents'`` gets
    ``c_url``, ``c_createdate``, ``c_modifydate``, ``c_source_filepath``, ``c_providername``,
    ``c_sourcefile`` and ``c_targetfile``.
    """
    # Work on a fresh frame; dropping a previous 'batch' column keeps re-runs idempotent.
    frame = df_extract[collection].drop(columns=['batch'], errors='ignore')
    row_count = frame.shape[0]

    # batch
    frame['batch'] = frame.groupby(by='subtype').cumcount() // batch_size + 1
    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     batch number = {row_count} rows affected')

    # c_location
    source_root = source_path[folder]
    target_root = target_path[folder]
    # Only the first occurrence (the root prefix) is rewritten, so a sub-folder that happens to
    # share the root's name is left alone; rsplit then drops the item's own name.
    frame['c_location'] = frame['path'].apply(
        lambda p: p.replace(source_root, target_root, 1).rsplit(':', 1)[0])
    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_location... {row_count} rows affected')

    # c_metadata
    migration_date = dt.now().strftime("%Y-%b-%d %H:%M:%S")

    def _metadata(row):
        text = f'<MigrationDate={migration_date}><ModifiedDate={row["modifydate"]}><CreatedBy={row["createdby"]}>'
        if pd.notnull(row["nickname"]):
            text += f'<NickName={row["nickname"]}>'
        return text

    frame['c_metadata'] = frame[['modifydate', 'createdby', 'nickname']].apply(_metadata, axis=1)
    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_metadata... {row_count} rows affected')

    if collection == 'containers':
        # c_createdate
        frame['c_createdate'] = frame['createdate'].apply(_compact_timestamp)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_createdate... {row_count} rows affected')

        # c_modifydate
        frame['c_modifydate'] = frame['modifydate'].apply(_compact_timestamp)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_modifydate... {row_count} rows affected')

    if collection == 'contents':
        # c_url (aliases only; other rows stay null). Series.apply copes with "no matching rows".
        is_alias = frame['subtype'] == 'alias'
        frame['c_url'] = frame.loc[is_alias, 'dataid'].apply(
            lambda d: 'https://contentserver.cenovus.com/otcs2/cs.exe/open/' + str(d))
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_url... {row_count} rows affected')

        def _versioned_date(node_column, version_column):
            # Versions after the first use the version's own date, otherwise the node's date.
            def pick(row):
                chosen = row[version_column] if row['version'] > 1 else row[node_column]
                return _compact_timestamp(chosen)
            return pick

        # c_createdate
        frame['c_createdate'] = frame[['createdate', 'vercdate', 'version']].apply(
            _versioned_date('createdate', 'vercdate'), axis=1)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_createdate... {row_count} rows affected')

        # c_modifydate
        frame['c_modifydate'] = frame[['modifydate', 'vermdate', 'version']].apply(
            _versioned_date('modifydate', 'vermdate'), axis=1)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_modifydate... {row_count} rows affected')

        # c_sourcefile
        def _provider_file(text):
            # Without a providerInfo entry the raw provider data is used as the file path.
            if text.find('providerInfo') == -1:
                return text
            found = re.findall("(?<=providerInfo'=')[0-9\\\\.dat]*", text)
            return found[0] if found else text

        def _provider_name(text):
            if text.find('subProviderName') == -1:
                return 'Default'
            found = re.findall("(?<=subProviderName'=')[a-zA-Z0-9]*", text)
            return found[0] if found else 'Default'

        def _source_file(row):
            if row['c_providername'] == 'Default':
                return '\\\\n01svmnas1\\contentserver_prd02\\' + row['c_source_filepath']
            if row['c_providername'] == 'Vol01':
                return '\\\\n01svmnas1\\contentserver_prd01\\' + row['c_source_filepath']
            return ''

        has_provider = frame['providerdata'].notnull()
        provider_text = frame.loc[has_provider, 'providerdata']
        frame['c_source_filepath'] = provider_text.apply(_provider_file)
        frame['c_providername'] = provider_text.apply(_provider_name)
        frame['c_sourcefile'] = frame[['c_source_filepath', 'c_providername']].apply(_source_file, axis=1)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_sourcefile = {row_count} rows affected')

        # c_targetfile (only for rows that have provider data; other rows stay null)
        target_rows = frame.loc[has_provider, ['dataid', 'batch', 'version', 'filename']]
        frame['c_targetfile'] = pd.Series(
            [f"{upload_path}{folder}\\{str(batch)}\\({str(int(dataid))}-{str(int(version))}) - {filename}"
             for dataid, batch, version, filename in target_rows.itertuples(index=False, name=None)],
            index=target_rows.index, dtype='object')
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     c_targetfile = {row_count} rows affected')

    df_extract[collection] = frame


print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

Oct 31 2023 12:43:02 PM: Done


#### map_to_xml()

In [7]:
def map_to_xml(collecion:str, subtype:str, category:dict):
    """Build the XML-ready frame for one subtype and store it in ``df_xml[subtype]``.

    Parameters
    ----------
    collecion : str
        Key of the source frame in ``df_extract``. The parameter keeps its historical
        spelling so existing keyword callers continue to work; the body now actually uses
        it instead of reading whatever global ``collection`` happened to be set.
    subtype : str
        Value of the ``subtype`` column to select (e.g. ``'document'``, ``'url'``).
    category : dict
        Category definition (``'catid'``, ``'catname'`` plus ``attribute -> (id, column)``);
        each attribute becomes a text column. An empty dict adds no attribute columns.
    """
    def _text(value):
        # Nulls become empty strings; everything else is stringified and stripped.
        return str(value).strip() if pd.notnull(value) else ''

    def _int_text(value):
        # Integer attributes may arrive as floats after a left join; drop the fractional part.
        return str(int(value)).strip() if pd.notnull(value) else ''

    source = df_extract[collecion]
    df = source[(source['subtype'] == subtype)]
    nodes = len(df)
    xml = {'title': df['name'].apply(_text),
           'location': df['c_location'].apply(_text),
           'description': df['dcomment'].apply(_text),
           'created': df['c_createdate'].apply(_text),
           'createdby': ['Admin'] * nodes,
           'modified': df['c_modifydate'].apply(_text),
           # Id's
           'batch': df['batch'],
           'dataid': df['dataid'],
           'level': df['folderlevel'],
           # Migration System Properties
           'Source System': ['CS 10.5'] * nodes,
           'System ID': df['dataid'].apply(_text),
           'Metadata': df['c_metadata'].apply(_text)
          }
    if subtype == 'url':
        xml['url'] = df['url'].apply(_text)
    if subtype == 'alias':
        xml['url'] = df['c_url'].apply(_text)
    if subtype in ['document', 'email']:
        # Migration System Properties
        xml['Version'] = df['version'].apply(_text)
        # Id's
        xml['version'] = df['version']
        # Version
        xml['filename'] = df['filename'].apply(_text)
        xml['file'] = df['c_targetfile'].apply(_text)
        xml['mime'] = df['mimetype'].apply(_text)
    if category:
        for attribute in [i for i in category if i not in ['catid', 'catname']]:
            if category[attribute][1] == 'ValInt':
                xml[attribute] = df[attribute].apply(_int_text)
            else:
                xml[attribute] = df[attribute].apply(_text)

    df_xml[subtype] = pd.DataFrame(xml)
    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     {subtype} = {df_xml[subtype].shape[0]} rows')

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')


Oct 31 2023 12:43:04 PM: Done


#### build_create_oi()

In [8]:
def build_create_oi(subtype: str, output_file: str, sort_list: list, legacy_id: str, categories: dict):
    """Write one ``create-<subtype>-<output_file>-<batch>.xml`` import file per batch.

    Rows come from ``df_xml[subtype]``; each batch is sorted by ``sort_list`` and every row
    becomes a ``<node>`` under an ``<import>`` root. Files are written to ``path + xml_path``.

    Parameters
    ----------
    subtype : str
        Key in ``df_xml`` and value of the node ``type`` attribute (aliases are emitted as
        ``url`` nodes).
    output_file : str
        Label embedded in the generated file names.
    sort_list : list
        Column names used to order the rows of a batch.
    legacy_id : str
        Column used to detect consecutive rows of the same item: for document-like subtypes
        the first row is a ``create`` and the following rows with the same value are
        ``addversion`` nodes. This relies on ``sort_list`` keeping those rows adjacent.
    categories : dict
        ``category name -> list of attribute columns``; a ``<category>`` element is appended
        only when at least one of its attributes is non-empty.
    """
    df = df_xml[subtype]
    for batch in df['batch'][(df['batch'].notnull())].sort_values().unique():
        LegacyID_ant = ''
        xml_file_name = 'create-' + subtype + '-' + output_file + '-' + str(batch).zfill(2) + '.xml'
        df_batch = df[(df['batch'] == batch)].sort_values(by=sort_list)
        root = ET.Element('import')
        for key, value in df_batch.iterrows():
            node = ET.Element('node')
            node.set('type', subtype)
            location = ET.SubElement(node, 'location')
            location.text = value['location']
            title = ET.SubElement(node, 'title')
            title.text = value['title']
            ET.SubElement(node, 'description').text = value['description']
            ET.SubElement(node, 'created').text = value['created']
            ET.SubElement(node, 'createdby').text = value['createdby']
            ET.SubElement(node, 'modified').text = value['modified']
            if subtype in ['document', 'email', 'caddocument']:
                ET.SubElement(node, 'file').text = value['file']
                ET.SubElement(node, 'filename').text = value['filename']
                ET.SubElement(node, 'mime').text = value['mime']
                if value[legacy_id] == LegacyID_ant:
                    # Same item as the previous row: add a version to the existing node, whose
                    # location is the parent location plus the item title.
                    node.set('action', 'addversion')
                    node.remove(location)
                    node.remove(title)
                    ET.SubElement(node, 'location').text = value['location'] + ':' + value['title']
                else:
                    node.set('action', 'create')
                LegacyID_ant = value[legacy_id]
            if subtype == 'url':
                node.set('action', 'create')
                ET.SubElement(node, 'url').text = value['url']
            if subtype in ('folder', 'emailfolder'):
                node.set('action', 'create')
            if subtype == 'alias':
                node.set('type', 'url')
                node.set('action', 'create')
                ET.SubElement(node, 'url').text = value['url']
    #            ET.SubElement(node, 'alias').text = value['alias']
            if subtype in ['compounddoc', 'compoundemail']:
                node.set('action', 'create')
            for category_name, cat_atts in categories.items():
                category = ET.Element('category')
                category.set('name', category_name)
                att_count = 0
                for att in cat_atts:
                    if len(value[att]) > 0:
                        ET.SubElement(category, 'attribute', attrib={'name': att}).text = value[att]
                        att_count += 1
                if att_count > 0:
                    node.append(category)
            root.append(node)
        xml = ET.tostring(root, encoding='utf-8', method='xml')
        xml_parsed = dom.parseString(xml).toprettyxml()
        # The context manager closes the file even when the write fails.
        with open(path + xml_path + xml_file_name, 'w', encoding='utf-8') as xml_file:
            xml_file.write(xml_parsed)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     {xml_file_name} = {len(root)} nodes out of {df_batch.shape[0]}')
        root.clear()

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

Oct 31 2023 12:43:05 PM: Done


#### create_batch_files() 

In [9]:
def create_batch_files (folder: str, df: pd.core.frame.DataFrame):
    """Write one ``copy-<folder>-<batch>.bat`` script per batch into ``path + xml_path``.

    For every row the script logs the item, then copies ``c_sourcefile`` to ``c_targetfile``
    with XCOPY when the source exists, or logs that it does not. Output of the script is
    appended to ``copy-<folder>-<batch>.log``.

    Parameters
    ----------
    folder : str
        Label embedded in the script and log file names.
    df : pandas.DataFrame
        Must provide the columns ``batch``, ``dataid``, ``filename``, ``c_sourcefile`` and
        ``c_targetfile``. Rows with a null ``batch`` are skipped.

    NOTE(review): values are written into the commands unescaped. A file name containing a
    double quote or a cmd metacharacter would break (or alter) the generated command; the
    extract query strips '%' and '?' from file names, but nothing is validated here.
    """
    batches = df['batch'][(df['batch'].notnull())].sort_values().unique()
    for batch in batches:
        df_batch = df[df['batch'] == batch]
        bat_file_name = 'copy-' + folder + '-' + str(batch).zfill(2)
        log_name = f'{bat_file_name}.log'
        # The context manager closes the script even if a row cannot be formatted or written.
        with open(path + xml_path + bat_file_name + '.bat', 'w', encoding='utf-8') as bat_file:
            bat_file.write(f'ECHO start %time% >> "{log_name}"' + '\n')
            for key, val in df_batch.iterrows():
                cmd = f'ECHO "{val["dataid"]} - {val["filename"]}" >> "{log_name}"'
                bat_file.write(cmd + '\n')
                cmd = f'IF EXIST "{val["c_sourcefile"]}" (ECHO F | XCOPY /Y /Q /F "{val["c_sourcefile"]}" "{val["c_targetfile"]}" >> "{log_name}") ELSE (ECHO "{val["c_sourcefile"]}" does not exist) >> "{log_name}"'
                bat_file.write(cmd + '\n')
            bat_file.write(f'ECHO end %time% >> "{log_name}"' + '\n')
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     {bat_file_name} = {df_batch.shape[0]}')

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

Oct 31 2023 12:43:07 PM: Done


#### release_extract_memory()

In [10]:
def release_extract_memory(names:list):
    """Replace the named frames in ``df_extract`` with empty DataFrames.

    For each name the current memory usage (in MB) is printed, the frame is removed and an
    empty one is stored under the same key, and a garbage collection is triggered.
    The value printed as "Memory Freed" is the number returned by ``gc.collect()``.
    """
    print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Releasing Memory...')

    for name in names:
        print(f'Extract - {name}')
        # Calculate used memory
        megabytes = df_extract[name].memory_usage().sum() / 1024 / 1024
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}:     Memory Usage = {megabytes} MB')
        # Drop the frame, keep the key alive with an empty placeholder, then collect
        df_extract.pop(name)
        df_extract[name] = pd.DataFrame()
        collected = gc.collect()
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}:     Memory Freed = {collected}')

print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

Oct 31 2023 12:43:09 PM: Done


#### release_cat_extract_memory()

In [11]:
def release_cat_extract_memory(names:list):
    """Replace the named frames in ``df_cat_extract`` with empty DataFrames.

    For each name the current memory usage (in MB) is printed, the frame is removed and an
    empty one is stored under the same key, and a garbage collection is triggered.
    The value printed as "Memory Freed" is the number returned by ``gc.collect()``.
    """
    print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Releasing Memory...')

    for name in names:
        print(f'Extract - {name}')
        # Calculate used memory
        megabytes = df_cat_extract[name].memory_usage().sum() / 1024 / 1024
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}:     Memory Usage = {megabytes} MB')
        # Drop the frame, keep the key alive with an empty placeholder, then collect
        df_cat_extract.pop(name)
        df_cat_extract[name] = pd.DataFrame()
        collected = gc.collect()
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}:     Memory Freed = {collected}')

print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

Oct 31 2023 12:43:11 PM: Done


#### release_xml_memory()

In [12]:
def release_xml_memory():
    """Replace every frame in ``df_xml`` with an empty DataFrame.

    For each key the current memory usage (in MB) is printed, the frame is deleted, a garbage
    collection is triggered and an empty frame is stored under the same key.
    The value printed as "Memory Freed" is the number returned by ``gc.collect()``.
    """
    # Iterate over a snapshot of the keys: entries are deleted and re-added inside the loop,
    # which is not allowed while iterating the dictionary itself.
    for collection in list(df_xml):
        print(f'XML - {collection}')
        # Calculate used memory
        memory_used = df_xml[collection].memory_usage().sum()
        memory_used = memory_used/1024/1024
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     Memory Usage = {memory_used} MB')
        # Delete DataFrames
        del df_xml[collection]
        memory_freed = gc.collect()
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     Memory Freed = {memory_freed}')
        df_xml[collection] = pd.DataFrame()

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

Oct 31 2023 12:43:13 PM: Done


# 2. Extraction

### 2.1 Set Folder parameters

In [13]:
# Select the collection to process: keep exactly one `folder = ...` assignment active.
folder = ''
category = {}
##folder = 'FinancialImages_AD'
##folder = 'CorporateFinance'
##folder = 'JDEFinancialPDFs'
folder = 'FinancialImages_UO'
##folder = 'IncomeAccounting'
##folder = 'JDEAttachments'

# Category definition: 'catid'/'catname' plus attribute name -> (attribute id, SQL expression).
# Date attributes are rendered with HH24 (24-hour clock). Oracle's plain HH is the 12-hour
# clock, which turns midnight into "12" and cannot distinguish AM from PM.
if folder == 'JDEFinancialPDFs':
    # JDE Financial Reports
    category = {'catid': 257759,
                'catname': 'JDE Financial Reports',
                'Report Type': (2, 'ValStr'),
                'User Id': (3, 'ValStr'),
                'Version': (4, 'ValStr'),
                'Queue': (5, 'ValStr'),
                'Date': (6, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')"),
                'Host': (7, 'ValStr')
               }
if folder in ['FinancialImages_AD', 'CorporateFinance', 'FinancialImages_UO']:
    # Accounts Payable
    category = {'catid': 152558,
                'catname': 'Accounts Payable',
                'EnCana Company Number': (2, 'ValStr'),
                'EnCana Company Name': (3, 'ValStr'),
                'Invoice Number': (5, 'ValStr'),
                'Invoice Date': (6, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')"),
                'Supplier Number': (7, 'ValStr'),
                'Supplier Name': (8, 'ValStr'),
                'JDE Document ID': (9, 'ValStr'),
                'JDE Document Type': (10, 'ValStr'),
                'JDE Batch Number': (11, 'ValStr'),
                'Gross Amount': (12, 'ValStr'),
                'Invoice Type': (13, 'ValStr'),
                'Pay Item': (14, 'ValStr'),
                'G/L Class Code': (15, 'ValStr'),
                'Currency': (16, 'ValStr'),
                'Suspense Account': (17, 'ValStr'),
                'G/L Date': (18, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')"),
                'Department Code': (19, 'ValStr'),
                'Department Description': (20, 'ValStr'),
                'Scan Date': (21, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')"),
                'Supporting': (22, 'ValInt'),
                'Box ID': (23, 'ValStr'),
                'Created By': (24, 'ValStr'),
                'Manual Payments': (25, 'ValInt'),
                'Service Date': (26, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')")
               }
print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Folder: {folder}')
if any(category): print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Category: {category["catname"]}')
print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')    

Oct 31 2023 12:43:17 PM: Folder: FinancialImages_UO
Oct 31 2023 12:43:17 PM: Category: Accounts Payable
Oct 31 2023 12:43:17 PM: Done


### 2.2 Release Memory if needed

In [None]:
#release_extract_memory(['containers'])
#release_extract_memory(['containers', 'contents'])

In [None]:
#release_cat_extract_memory(['Accounts Payable1', 'Accounts Payable2', 'Accounts Payable3'])

In [None]:
#release_xml_memory()
#del df_xml
#df_xml = {}

### 2.2 Container and Content data

In [None]:
# Container and Content data
# Alternative collection lists used on other runs:
#for collection in ['containers', 'contents']:
#for collection in ['containers']:
for collection in ['contents']:
    print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Loading {collection} for {folder}...')
    extract_data(collection, folder, source_parentid[folder], True)
    # Order the extract by subtype, then by path.
    df_extract[collection] = df_extract[collection].sort_values(by=['subtype', 'path'])

print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

Oct 31 2023 14:58:31 PM: Loading contents for FinancialImages_UO...


In [None]:
# Container data from flat files
# Reads the CSV written by extract_data(), which names the file after the selected `folder`
# (the undefined `folder_name` used here previously raised a NameError on a fresh kernel).
print('Loading Containers...')
df_extract['containers'] = pd.read_csv(
    path + extract_path + 'Extracted_' + folder + '_containers.csv',
    dtype={'dataid': 'Int64', 'versionnum': 'Int64', 'parentid': 'Int64',
           'name': 'string', 'nickname': 'string', 'dcomment': 'string',
           'createdate': 'string', 'modifydate': 'string', 'createdby': 'string',
           'subtype': 'string', 'folderlevel': 'Int64', 'path': 'string'})
# Double quotes inside the replacement field: reusing the outer quote character in an f-string
# is a syntax error before Python 3.12.
print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     Loaded {folder} = {df_extract["containers"].shape[0]} rows')

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

### 2.3 Category data

#### 2.3.a Load Accounts Payable category from CSV

In [None]:
# Category data from flat files
# Each part is read with explicit dtypes: nullable integers for ids and integer attributes,
# strings for everything else.
print('Loading Accounts Payable1...')
df_cat_extract['Accounts Payable1'] = pd.read_csv(
    path + extract_path + 'Extracted_category_Accounts Payable1.csv',
    dtype={
        'id': 'Int64',
        'vernum': 'Int64',
        'EnCana Company Number': 'string',
        'EnCana Company Name': 'string',
        'Invoice Number': 'string',
        'Invoice Date': 'string',
        'Supplier Number': 'string',
        'Supplier Name': 'string',
        'JDE Document ID': 'string',
        'JDE Document Type': 'string',
    },
)
print('Loading Accounts Payable2...')
df_cat_extract['Accounts Payable2'] = pd.read_csv(
    path + extract_path + 'Extracted_category_Accounts Payable2.csv',
    dtype={
        'id': 'Int64',
        'vernum': 'Int64',
        'JDE Batch Number': 'string',
        'Gross Amount': 'string',
        'Invoice Type': 'string',
        'Pay Item': 'string',
        'G/L Class Code': 'string',
        'Currency': 'string',
        'Suspense Account': 'string',
        'G/L Date': 'string',
    },
)
print('Loading Accounts Payable3...')
df_cat_extract['Accounts Payable3'] = pd.read_csv(
    path + extract_path + 'Extracted_category_Accounts Payable3.csv',
    dtype={
        'id': 'Int64',
        'vernum': 'Int64',
        'Department Code': 'string',
        'Department Description': 'string',
        'Scan Date': 'string',
        'Supporting': 'Int64',
        'Box ID': 'string',
        'Created By': 'string',
        'Manual Payments': 'Int64',
        'Service Date': 'string',
    },
)

print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

#### 2.3.b Load Accounts Payable category in 3 pieces or JDE Financial Reports from database and save to csv

In [None]:
# Accounts Payable, part 1 of 3 (attributes 2-10).
# Date attributes use HH24 (24-hour clock); Oracle's plain HH is the 12-hour clock, which
# turns midnight into "12" and cannot distinguish AM from PM.
category = {'catid': 152558,
            'catname': 'Accounts Payable1',
            'EnCana Company Number': (2, 'ValStr'),
            'EnCana Company Name': (3, 'ValStr'),
            'Invoice Number': (5, 'ValStr'),
            'Invoice Date': (6, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')"),
            'Supplier Number': (7, 'ValStr'),
            'Supplier Name': (8, 'ValStr'),
            'JDE Document ID': (9, 'ValStr'),
            'JDE Document Type': (10, 'ValStr')
           }

In [None]:
# Accounts Payable, part 2 of 3 (attributes 11-18).
# Date attributes use HH24 (24-hour clock); Oracle's plain HH is the 12-hour clock, which
# turns midnight into "12" and cannot distinguish AM from PM.
category = {'catid': 152558,
            'catname': 'Accounts Payable2',
            'JDE Batch Number': (11, 'ValStr'),
            'Gross Amount': (12, 'ValStr'),
            'Invoice Type': (13, 'ValStr'),
            'Pay Item': (14, 'ValStr'),
            'G/L Class Code': (15, 'ValStr'),
            'Currency': (16, 'ValStr'),
            'Suspense Account': (17, 'ValStr'),
            'G/L Date': (18, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')")
           }

In [None]:
# Accounts Payable, part 3 of 3 (attributes 19-26).
# Date attributes use HH24 (24-hour clock); Oracle's plain HH is the 12-hour clock, which
# turns midnight into "12" and cannot distinguish AM from PM.
category = {'catid': 152558,
            'catname': 'Accounts Payable3',
            'Department Code': (19, 'ValStr'),
            'Department Description': (20, 'ValStr'),
            'Scan Date': (21, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')"),
            'Supporting': (22, 'ValInt'),
            'Box ID': (23, 'ValStr'),
            'Created By': (24, 'ValStr'),
            'Manual Payments': (25, 'ValInt'),
            'Service Date': (26, "TO_CHAR(ValDate, 'YYYYMMDDHH24MISS')")
           }

In [None]:
# Category data from Database
# Runs only when a category definition is active; the result is also saved as a CSV extract.
if any(category):
    print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Loading category {category["catname"]}...')
    extract_category_data(category)

    #Save to file if needed
    df_cat_extract[category["catname"]].to_csv(
        f'{path}{extract_path}Extracted_category_{category["catname"]}.csv',
        index=False,
    )

print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

# 3. Transformation

In [None]:
# Join to metadata - For JDE Attachments
# Left-joins the active category's attributes onto both collections by (dataid, versionnum).
if any(category):
    for collection in ['containers', 'contents']:
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Joining {collection} with {category["catname"]} for {folder}...')
        df_extract[collection] = pd.merge(
            df_extract[collection],
            df_cat_extract[category["catname"]],
            how='left',
            left_on=['dataid', 'versionnum'],
            right_on=['id', 'vernum'],
        )
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}:     {collection}-{category["catname"]}... {df_extract[collection].shape[0]} rows affected')
print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

In [None]:
# Join to metadata - For Accounts Payable split in 3 dataframes
# Alternative collection lists used on other runs:
#for collection in ['containers', 'contents']:
#for collection in ['contents']:
for collection in ['containers']:
    # Each part is left-joined in turn by (dataid, versionnum).
    for catname in ['Accounts Payable1', 'Accounts Payable2', 'Accounts Payable3']:
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Joining {collection} with {catname} for {folder}...')
        df_extract[collection] = pd.merge(
            df_extract[collection],
            df_cat_extract[catname],
            how='left',
            left_on=['dataid', 'versionnum'],
            right_on=['id', 'vernum'],
        )
        print(f'{dt.now():%b %d %Y %H:%M:%S %p}:     {collection}-{catname}... {df_extract[collection].shape[0]} rows affected')
print(f'{dt.now():%b %d %Y %H:%M:%S %p}: Done')

### 3.1 Transformation

In [None]:
# Apply Transformation
# Calls apply_transformation() once per selected extract name.
# Switch the selection by (un)commenting one of the `for collection` lines.
#for collection in ['containers', 'contents']:
#for collection in ['contents']:
for collection in ('containers',):
    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Transforming {collection} for {folder}...')
    apply_transformation(collection)

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

### 3.2 Validation & Counts

In [None]:
# Validation reports per object type: two duplicate checks and two count tables.
# Every report is stored in df_report under '<report>-<objtype>' and all entries of
# df_report are written to one Excel workbook at the end.

# Column labels used in the count reports. 'subtype' is a group key (index) in those
# frames, so its entry does not change any column label.
count_labels = {'subtype': 'Type', 'dataid': '# Items', 'nickname': 'With Nickname'}

#for objtype in ['containers', 'contents']:
for objtype in ('containers',):
    # 3.3.1.a Duplicated DataIds in EXTRACT
    # (cartesian product)
    report_name = f'dup_dataid-{objtype}'
    names_per_dataid = df_extract[objtype][['dataid', 'name']].groupby(by='dataid').count()
    df_report[report_name] = names_per_dataid[names_per_dataid['name'] > 1]
    print(f'Duplicated DataIds: \n {df_report[report_name]}')

    # 3.3.2 Duplicated Document Names in same Folder
    report_name = f'dup_docs-{objtype}'
    rows_per_target = (
        df_extract[objtype][['c_location', 'name', 'subtype']]
        .groupby(by=['c_location', 'name'])
        .count()
    )
    df_report[report_name] = rows_per_target[rows_per_target['subtype'] > 1]
    print(f'\n Duplicated File Names within Target Folders: \n {df_report[report_name]}')

    # Duplicate details
    #df_extract['containers'].merge(df_report['dup_docs-containers'], how='right', on=['c_location', 'name']).to_excel(path+'dups.xlsx')

    # 3.3.6 Counts per SUBTYPE
    # contents: distinct items, distinct versions, nicknames present, total size / 1024^3
    # containers: distinct items and nicknames present
    report_name = f'count_per_subtype-{objtype}'
    if objtype == 'contents':
        df_report[report_name] = (
            df_extract[objtype][['subtype', 'dataid', 'versionid', 'nickname', 'datasize']]
            .groupby(by=['subtype'], dropna=False)
            .agg({'dataid': pd.Series.nunique,
                  'versionid': pd.Series.nunique,
                  'nickname': pd.Series.count,
                  'datasize': lambda x: np.sum(x)/1024/1024/1024})
        )
    elif objtype == 'containers':
        df_report[report_name] = (
            df_extract[objtype][['subtype', 'dataid', 'nickname']]
            .groupby(by=['subtype'], dropna=False)
            .agg({'dataid': pd.Series.nunique,
                  'nickname': pd.Series.count})
        )
    df_report[report_name] = df_report[report_name].rename(columns=count_labels)
    print(f'\n Counts per SubType: \n {df_report[report_name]}')

    # 3.3.6 Counts per SUBTYPE and BATCH (same measures, additionally grouped by batch)
    report_name = f'count_per_batch-{objtype}'
    if objtype == 'contents':
        df_report[report_name] = (
            df_extract[objtype][['subtype', 'batch', 'dataid', 'versionid', 'nickname', 'datasize']]
            .groupby(by=['subtype', 'batch'], dropna=False)
            .agg({'dataid': pd.Series.nunique,
                  'versionid': pd.Series.nunique,
                  'nickname': pd.Series.count,
                  'datasize': lambda x: np.sum(x)/1024/1024/1024})
        )
    elif objtype == 'containers':
        df_report[report_name] = (
            df_extract[objtype][['subtype', 'batch', 'dataid', 'nickname']]
            .groupby(by=['subtype', 'batch'], dropna=False)
            .agg({'dataid': pd.Series.nunique,
                  'nickname': pd.Series.count})
        )
    df_report[report_name] = df_report[report_name].rename(columns=count_labels)

# df_report is not reset in this cell, so every entry it currently holds gets a sheet.
with pd.ExcelWriter(path + f'CS 10.5 - Counts - {folder}.xlsx') as writer:
    for report in df_report:
        df_report[report].to_excel(writer, sheet_name=report)

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')


# 4. Output Files

### 4.1 Batch Files - XCopy

In [None]:
# Build the copy batch files from the 'contents' rows whose subtype is
# 'document' or 'email'; the filtered frame is kept in the notebook-level `df`.
print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Creating copy batch files {folder}...')

is_copy_candidate = df_extract['contents']['subtype'].isin(['document', 'email'])
df = df_extract['contents'].loc[is_copy_candidate]
create_batch_files(folder, df)

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

### 4.2 Object Importer Files

In [None]:
# Create XML DataFrames
# Calls map_to_xml() once per distinct subtype found in each selected extract,
# passing along the current `category` definition.
# Switch the selection by (un)commenting one of the `for collection` lines.

#for collection in ['containers', 'contents']:
#for collection in ['contents']:
for collection in ('containers',):
    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Creating {collection} xmls dataframes for {folder}...')
    subtypes_present = df_extract[collection]['subtype'].unique()
    for subtype in subtypes_present:
        map_to_xml(collection, subtype, category)

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

In [None]:
# Generate OI Files
# One build_create_oi() call per frame held in df_xml. Frames that carry a
# 'version' column are sorted by location, dataid and version; all others by
# location and title. 'Done' is printed after every frame.

print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Creating OI files for {folder}...')
for subtype in df_xml.keys():
    print(f'\n{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Processing {subtype} = {df_xml[subtype].shape[0]} rows')
    sortby = ['location', 'dataid', 'version'] if 'version' in df_xml[subtype] else ['location', 'title']
    build_create_oi(
        subtype=subtype,
        output_file=folder,
        sort_list=sortby,
        legacy_id='dataid',
        # category path -> attribute names (presumably the attributes written per category - confirm in build_create_oi)
        categories={
            'Content Server Categories:Migration System Properties': [
                'Source System', 'System ID', 'Metadata',
            ],
            'Content Server Categories:CS10 Migration:Accounts Payable': [
                'EnCana Company Number', 'EnCana Company Name', 'Invoice Number', 'Invoice Date',
                'Supplier Number', 'Supplier Name', 'JDE Document ID', 'JDE Document Type',
                'JDE Batch Number', 'Gross Amount', 'Invoice Type', 'Pay Item', 'G/L Class Code',
                'Currency', 'Suspense Account', 'G/L Date', 'Department Code',
                'Department Description', 'Scan Date', 'Supporting', 'Box ID', 'Created By',
                'Manual Payments', 'Service Date',
            ],
#            'Content Server Categories:CS10 Migration:JDE Financial Reports': ['Report Type', 'User Id', 'Version', 'Queue', 'Date', 'Host'],
        },
    )

    print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

# ----- SAMPLE CODE -----

In [None]:
def _category_attribute_text(raw) -> str:
    """Return the text to write for one category attribute cell; '' means skip it."""
    if isinstance(raw, str):
        return raw
    # Missing cells (None / NaN / NaT / pd.NA) carry no value to write.
    if raw is None or (pd.api.types.is_scalar(raw) and pd.isna(raw)):
        return ''
    # ElementTree can only serialise str text, so other values are converted.
    return str(raw)


def build_update_cats_oi(df: pd.DataFrame, type: str, output_file: str, sort_list: list, category_name: str, cat_atts: list):
    """Write one 'update' XML file per batch found in ``df``.

    Rows whose ``batch`` is null are ignored. For every remaining batch value the
    rows are sorted by ``sort_list`` and each row becomes a ``node`` element with
    ``type`` and ``action="update"`` attributes and a ``location`` child holding
    ``location + ':' + title``. When both ``category_name`` and ``cat_atts`` are
    non-empty, a ``category`` child is added with one ``attribute`` element per
    listed column that has a value in that row; missing cells (None / NaN) and
    empty strings are skipped, other non-string values are written via ``str()``.

    The file is written to ``path + xml_path`` (notebook-level settings) as
    ``update-<type>-<output_file>-<batch>.xml``, where the batch is ``str(batch)``
    left-padded with zeros to two characters.

    Args:
        df: Frame with at least 'batch', 'location', 'title' and every column
            named in ``cat_atts`` / ``sort_list``.
        type: Value of the node ``type`` attribute; also part of the file name.
            (Shadows the builtin, kept because callers pass it by keyword.)
        output_file: Middle part of the generated file name.
        sort_list: Columns to sort each batch by.
        category_name: Value of the category ``name`` attribute; '' disables categories.
        cat_atts: Column names to emit as category attributes; empty disables categories.

    Returns:
        None. Prints one progress line per file written.
    """
    with_category = len(cat_atts) != 0 and category_name != ''
    batches = df['batch'][df['batch'].notnull()].sort_values().unique()
    for batch in batches:
        xml_file_name = 'update-' + type + '-' + output_file + '-' + str(batch).zfill(2) + '.xml'
        df_batch = df[df['batch'] == batch].sort_values(by=sort_list)
        root = ET.Element('import')
        for _, value in df_batch.iterrows():
            node = ET.SubElement(root, 'node')
            node.set('type', type)
            node.set('action', 'update')
            ET.SubElement(node, 'location').text = value['location'] + ':' + value['title']
            if with_category:
                # The category element is emitted even when no attribute has a value.
                category_node = ET.SubElement(node, 'category')
                category_node.set('name', category_name)
                for att in cat_atts:
                    att_text = _category_attribute_text(value[att])
                    if att_text:
                        ET.SubElement(category_node, 'attribute', attrib={'name': att}).text = att_text
        xml = ET.tostring(root, encoding='utf-8', method='xml')
        xml_parsed = dom.parseString(xml).toprettyxml()
        # The context manager closes the handle even if the write fails.
        with open(path + xml_path + xml_file_name, 'w', encoding='utf-8') as xml_file:
            xml_file.write(xml_parsed)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     {xml_file_name} = {len(root)} nodes out of {df_batch.shape[0]}')


print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

In [None]:
# 3.4.3 Update DOCUMENTS to add Legacy eB Web category
# Sample cell: for every listed df_xml key whose frame is not empty, request
# 'update' files that attach the 'Content Server Categories:Legacy eB Web'
# category with the six listed attributes.
# NOTE(review): the call below targets build_update_oi, which is not defined in the
# sample cells visible here; build_update_cats_oi, defined just above, accepts
# exactly these keyword arguments - confirm which function is intended.
# NOTE(review): the empty DataFrame assigned first is replaced on the first loop
# iteration, and df_xml[ft] raises KeyError if a listed key is missing.
df = pd.DataFrame()
for ft in ['Y', 'N - SPEC', 'N - REFERENCE', 'N - BORE', 'N - PROCEDURE', 'UNKNOWN']:
    df = df_xml[ft]
    if not df.empty:
        print(f'\n{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Processing {ft} = {df.shape[0]} rows')
        build_update_oi(df = df,
                        type = 'document',
                        output_file = ft,
                        sort_list = ['location', 'title'],
                        category_name = 'Content Server Categories:Legacy eB Web',
                        cat_atts = ['Document Id', 'File Id', 'Repository', 'Path', 'File Name', 'File Size']
                       )

In [None]:
def build_delete_oi(df: pd.DataFrame, type: str, output_file: str, sort_list: list):
    """Write one 'delete' XML file per batch found in ``df``.

    Rows whose ``batch`` is null are ignored. For every remaining batch value the
    rows are sorted by ``sort_list`` in descending order and each row becomes a
    ``node`` element with ``type`` and ``action="delete"`` attributes and a
    ``location`` child holding ``location + ':' + title``.

    The file is written to ``path + xml_path`` (notebook-level settings) as
    ``delete-<type>-<output_file>-<batch>.xml``, where the batch is ``str(batch)``
    left-padded with zeros to two characters.

    Args:
        df: Frame with at least 'batch', 'location', 'title' and the ``sort_list`` columns.
        type: Value of the node ``type`` attribute; also part of the file name.
            (Shadows the builtin, kept because callers pass it by keyword.)
        output_file: Middle part of the generated file name.
        sort_list: Columns to sort each batch by (descending).

    Returns:
        None. Prints one progress line per file written.
    """
    batches = df['batch'][df['batch'].notnull()].sort_values().unique()
    for batch in batches:
        xml_file_name = 'delete-' + type + '-' + output_file + '-' + str(batch).zfill(2) + '.xml'
        df_batch = df[df['batch'] == batch].sort_values(by=sort_list, ascending=False)
        root = ET.Element('import')
        for _, value in df_batch.iterrows():
            node = ET.SubElement(root, 'node')
            node.set('type', type)
            node.set('action', 'delete')
            ET.SubElement(node, 'location').text = value['location'] + ':' + value['title']
        xml = ET.tostring(root, encoding='utf-8', method='xml')
        xml_parsed = dom.parseString(xml).toprettyxml()
        # The context manager closes the handle even if the write fails.
        with open(path + xml_path + xml_file_name, 'w', encoding='utf-8') as xml_file:
            xml_file.write(xml_parsed)
        print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}:     {xml_file_name} = {len(root)} nodes out of {df_batch.shape[0]}')


print(f'{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Done')

In [None]:
# 3.4.3 Delete DOCUMENTS
# Sample cell: for every listed df_xml key, reduce the frame to its distinct
# (location, title, batch) rows and, unless that is empty, write the 'delete'
# files through build_delete_oi().
for ft in ('Y', 'N - SPEC', 'N - REFERENCE', 'N - BORE', 'N - PROCEDURE', 'UNKNOWN'):
    df = df_xml[ft][['location', 'title', 'batch']].drop_duplicates()
    if df.empty:
        continue
    print(f'\n{dt.now().strftime("%b %d %Y %H:%M:%S %p")}: Processing {ft} = {df.shape[0]} rows')
    build_delete_oi(df=df, type='document', output_file=ft, sort_list=['location', 'title'])
print('Done')