In [1]:
import shared.config as config
import pymongo
import numpy as np
import pandas as pd
import json
import re
import ast

import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import reflection
from sqlalchemy import text

In [2]:
# Create the MongoDB client; URI, database and collection names all come from shared.config.
myclient = pymongo.MongoClient(str(config.MONGO_URI))
stagingDb = myclient[str(config.DB_STAGING)]
analyseDb = myclient[str(config.DB_ANALYSE)]
# Collection handles in the staging database.
stagingCol = stagingDb[config.COLL_PLAATJES]
stagingOud = stagingDb[config.COLL_STAGING_OUD]
stagingNieuw = stagingDb[config.COLL_STAGING_NIEUW]
stagingMonster = stagingDb[config.COLL_STAGING_MONSTER]
# Collection handles in the analyse database.
analyseCol = analyseDb[config.COLL_ANALYSE]
analyseColClean = analyseDb[config.COLL_ANALYSE_CLEAN]

# 'Kolominformatie' is the collection the 'Brondata' counts are aggregated from.
metaCollection = stagingDb['Kolominformatie']
# Excel workbook that is read for the 'Objecten' sheet (ignore patterns).
AIRFLOW_WASSTRAAT_CONFIG = "./wasstraat_config/Wasstraat_Config_HarmonizeV3.xlsx"

In [3]:
def AggregateData(df, fase):
    """Aggregate per-project record counts into one row per source table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (project, table) combination with the columns
        'projectcd', 'table' and 'teller' (a record count, numeric or a
        numeric string). The frame is not modified.
    fase : str
        Label written to the 'Stage' column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Table', 'Projecten' (list of project codes per table),
        'Aantal_Records' (sum of 'teller'), 'Aantal_tabellen' (number of
        input rows per table) and 'Stage', ordered by 'Aantal_Records'
        descending with a fresh 0..n-1 index. An empty input yields an
        empty frame with these columns.
    """
    out_columns = ['Table', 'Projecten', 'Aantal_Records', 'Aantal_tabellen', 'Stage']
    if df.empty:
        # A query that returned nothing gives a frame without a 'teller' column.
        return pd.DataFrame(columns=out_columns)

    # assign() builds a new frame, so the caller's DataFrame keeps its dtypes.
    counts = df.assign(teller=pd.to_numeric(df['teller']))
    # The builtin sum (via lambda) keeps NaN propagation instead of skipping NaN.
    grouped = counts.groupby('table').agg({'projectcd': list, 'teller': lambda s: sum(s)})
    grouped['Count'] = grouped['projectcd'].map(len)
    grouped['Stage'] = fase
    grouped = grouped.sort_values('teller', ascending=False).reset_index()
    return grouped.rename(columns={'table': 'Table', 'projectcd': 'Projecten', 'teller': 'Aantal_Records', 'Count': 'Aantal_tabellen'})

In [4]:
# Read every sheet of the configuration workbook (sheet_name=None gives a dict of frames).
xl = pd.read_excel(AIRFLOW_WASSTRAAT_CONFIG, sheet_name=None)
df_table = xl['Objecten']
# The first 'Ignore' row keeps its patterns as a Python list literal in 'Tabellen'.
ignore_kolommen = df_table.loc[df_table['Object'] == 'Ignore', 'Tabellen'].iloc[0]
ignore_lst = ast.literal_eval(ignore_kolommen)

ignore_lst

['.*backup.*', '.*kopie.*', '.*tijdelijk.*', '^Soorten', '^Ref_']

In [5]:
# Brondata: distinct (project, table, teller) triples taken from the column metadata.
grp_aggr = [
    {"$match": {
        'projectcd': {'$nin': ['MAGAZIJN', 'DELF-IT', 'Digifotos', 'D', 'DELF']},
        'table': {'$not': {'$regex': "^SYS.*"}},
    }},
    # Grouping on the whole triple collapses documents that share all three values.
    {"$group": {"_id": {'projectcd': "$projectcd", 'table': "$table", 'teller': '$teller'}}},
    {'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$_id.teller"}}},
]

brondata_docs = list(metaCollection.aggregate(grp_aggr))
df_brondata = AggregateData(pd.DataFrame(brondata_docs), 'Brondata')
df_brondata.head(5)

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,plattegrond_Polyline,[DB027],34071,1,Brondata
1,Periodisering,[DC116],32400,1,Brondata
2,plattegrond_Polyline_Shape_Index,[DB027],27408,1,Brondata
3,AARDEWERK 2,"[DC005, DB001, DC116, DC060, DC116, DC033, DC0...",26557,34,Brondata
4,Monster_botanie_determinatie,[M],18083,1,Brondata


In [6]:
# Staging: number of documents per (project, table) in each staging collection.
# NOTE(review): the Brondata pipeline also excludes the projects 'D' and 'DELF';
# this $nin list does not - confirm the difference is intended.
grp_aggr = [
    {"$match": {
        'projectcd': {'$nin': ['MAGAZIJN', 'DELF-IT', 'Digifotos']},
        'table': {'$not': {'$regex': "^SYS.*"}},
    }},
    {"$group": {"_id": {'projectcd': "$projectcd", 'table': "$table"}, 'teller': {"$sum": 1}}},
    {'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}},
]

# Run the same pipeline on all three staging collections and stack the results.
staging_frames = [
    pd.DataFrame(list(collection.aggregate(grp_aggr)))
    for collection in (stagingOud, stagingNieuw, stagingMonster)
]
df_staging = AggregateData(pd.concat(staging_frames), 'Staging')
# Only the last expression of a cell is rendered, so this is the single display statement.
df_staging[df_staging.Table.str.contains('VONDST')]

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
5,VONDSTENLIJST,"[DB112, DC030, DC039, DC087, DB114, DC082, DC0...",23102,59,Staging
39,VAK_VONDST,"[DB210, DC267, DC232, DC170, DC229, DC268, DB2...",3876,16,Staging
50,VONDSTEN,"[DB006, DB195]",2765,2,Staging
55,VONDSTINHD,"[DC171, DC172, DC166, DB157, DB210, DC164, DC2...",2182,42,Staging
77,VONDST,"[DC166, DB197, DB152, DC164, DB157, DB193, DC1...",1386,39,Staging
85,VONDSTEN_NIEUW_UNIEK,[DB113],1254,1,Staging
88,VONDSTCONTAINER,[DB195],1226,1,Staging
111,VONDSTENLIJST NIEUW,[DB113],836,1,Staging
153,VONDSTEN_NIEUW,[DB113],418,1,Staging
154,VONDST_COUPE_CEGM_UNIEK_2010,[DB113],418,1,Staging


In [7]:
# SingleStore: number of documents per (project, table) found under each document's 'brondata'.
grp_aggr = [
    {'$group': {
        '_id': {'projectcd': "$brondata.projectcd", 'table': "$brondata.table"},
        'teller': {"$sum": 1},
    }},
    {'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}},
]

singlestore_docs = list(analyseCol.aggregate(grp_aggr))
df_singlestore = AggregateData(pd.DataFrame(singlestore_docs), 'SingleStore')
df_singlestore.head(5)


Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,AARDEWERK 2,"[DC114, DB036, DC027, DC005, DC062, DB006, DC0...",43734,32,SingleStore
1,Foto Totaal Tabel,[D],31469,1,SingleStore
2,BOT,"[DB113, DC024, DC057, DC060, DB114, DC069, DB0...",31060,10,SingleStore
3,VONDSTENLIJST,"[DC018, DC062, DC069, DC082, DB115, DC039, DC0...",23102,59,SingleStore
4,Monster_botanie_determinatie,[M],18083,1,SingleStore


In [8]:
#grp_aggr = [{'$group': {'_id': {'projectcd': "$brondata.projectcd", 'table': "$brondata.table"},'teller': {"$sum": 1}}}
#           ,{'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}}]

#df_singlestoreclean = AggregateData(pd.DataFrame(list(analyseColClean.aggregate(grp_aggr))), 'SingleStoreClean')
#df_singlestoreclean.head(5)

In [9]:
# SingleStoreClean: one count per 'wasstraat' array entry, keyed by project and that entry's table.
grp_aggr = [
    # Documents without a (non-empty) 'wasstraat' array are dropped by the unwind.
    {"$unwind": {"path": "$wasstraat", "preserveNullAndEmptyArrays": False}},
    {"$addFields": {"table": "$wasstraat.table"}},
    {'$group': {'_id': {'projectcd': "$projectcd", 'table': "$table"}, 'teller': {"$sum": 1}}},
    {'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}},
]

singlestoreclean_docs = list(analyseColClean.aggregate(grp_aggr))
df_singlestoreclean = AggregateData(pd.DataFrame(singlestoreclean_docs), 'SingleStoreClean')
df_singlestoreclean.head(5)

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,Foto Totaal Tabel,"[DB120, DC005, DB113, DB119, DC018, DB121, DB1...",51727,99,SingleStoreClean
1,Monster_botanie_determinatie,[M],18083,1,SingleStoreClean
2,AARDEWERK 1,"[DB036, DC026, DB113, DB004, DB034, DB033, DC0...",15145,14,SingleStoreClean
3,magazijnlijst,"[DC103, DC028, DC179, DC114, DC064, DB127, DC0...",15113,209,SingleStoreClean
4,VONDSTENLIJST,"[DB006, DC116, DB035, DC061, DC065, DC042, DC1...",14060,59,SingleStoreClean


In [10]:
# Target-system tables whose 'brondata' column is inspected. Every table is
# listed exactly once: a repeated name would be queried twice and double its counts.
lst_tables = ['Def_Project', 'Def_Vondst', 'Def_Stelling', 'Def_Plaatsing', 'Def_Vindplaats', 'Def_Artefact', 'Def_Spoor', 'Def_Doos', 'Def_Monster', 'Def_Monster_Schelp', 'Def_Monster_Botanie', 'Def_Vulling']
regexTable = re.compile(r'\'table\': \'(.*?)\'')  # captures the value that follows 'table':
regexProject = re.compile(r'\'projectcd\': \'(.*?)\'')  # captures the value that follows 'projectcd':


def _searchBrondata(pattern, brondata):
    """Return capture group 1 of `pattern` in `brondata`.

    Returns "" when `brondata` is empty or None, and None when the pattern
    does not occur in a non-empty `brondata`.
    """
    if not brondata:
        return ""
    found = pattern.search(brondata)
    return found.group(1) if found else None


def getTable(brondata):
    """Extract the source table name from a brondata text ("" if empty, None if absent)."""
    return _searchBrondata(regexTable, brondata)


def getProject(brondata):
    """Extract the project code from a brondata text ("" if empty, None if absent)."""
    return _searchBrondata(regexProject, brondata)

# Doelsysteem: count target-system rows per (project, source table), both read
# from the text in each row's 'brondata' column.
engine = create_engine(config.SQLALCHEMY_DATABASE_URI)
lst_counts = []
with engine.connect() as connection:
    for table in lst_tables:
        # The table name comes from the fixed lst_tables list, not from external input.
        q = f'SELECT brondata from public."{table}";'
        df = pd.read_sql(text(q), connection)

        # Series.map keeps returning a Series for an empty table, whereas a
        # row-wise DataFrame.apply may hand back a DataFrame in that case.
        df_out = pd.DataFrame({
            'table': df['brondata'].map(getTable),
            'projectcd': df['brondata'].map(getProject),
        })
        lst_counts.append(df_out.groupby(['projectcd', 'table']).size().reset_index(name='teller'))

# Concatenate once after the loop instead of re-copying the frame on every iteration.
df_tables_projects = pd.concat(lst_counts)

df_doelsysteem = AggregateData(df_tables_projects, 'Doelsysteem')
df_doelsysteem.head(5)

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,Monster_botanie_determinatie,[M],18083,1,Doelsysteem
1,AARDEWERK 1,"[DB004, DB032, DB033, DB034, DB036, DB112, DB1...",13981,14,Doelsysteem
2,VONDSTENLIJST,"[DB004, DB006, DB031, DB032, DB033, DB034, DB0...",13621,59,Doelsysteem
3,BOT,"[DB006, DB113, DB114, DB119, DB122, DC024, DC0...",10940,10,Doelsysteem
4,AARDEWERK 2,"[DB001, DB004, DB006, DB036, DC005, DC020, DC0...",10202,32,Doelsysteem


## Analyse van alle gegevens

Hieronder volgt een overzicht van alle gegevens die in de vijf fases (brondata, staging, singlestore, singlestoreclean en doelsysteem) beschikbaar zijn. Zo kunnen fouten in de conversie worden opgespoord.

In [38]:
def inIgnore(tablename):
    """Tell whether `tablename` matches one of the ignore patterns.

    Every entry of the module-level `ignore_lst` is tried with `re.match`
    (anchored at the start of the name) and case-insensitively.

    Returns True on the first matching pattern, otherwise False.
    """
    return any(re.match(pattern, tablename, re.IGNORECASE) for pattern in ignore_lst)


def getOverview(OnWhat, table_filter='DIA', vmax=6000):
    """Build a styled side-by-side comparison of one count column over all stages.

    Parameters
    ----------
    OnWhat : str
        Count column to compare, e.g. 'Aantal_Records' or 'Aantal_tabellen'.
    table_filter : str or None, default 'DIA'
        Case-insensitive pattern a table name must contain to be shown.
        Pass None to show every table.
        NOTE(review): the default preserves the previously hard-coded filter,
        which looks like a debugging leftover - confirm.
    vmax : int, default 6000
        Value at which the in-cell bars reach full width.

    Returns
    -------
    pandas.io.formats.style.Styler
        One row per table that is not on the ignore list, with the columns
        'Table', 'Projecten' (from the brondata stage) and one
        '<OnWhat>_<stage>' count column per stage, sorted on the brondata
        count in descending order. Missing counts are shown as 0.
    """
    stages = [
        ('brondata', df_brondata),
        ('staging', df_staging),
        ('singlestore', df_singlestore),
        ('singlestoreclean', df_singlestoreclean),
        ('doelsysteem', df_doelsysteem),
    ]
    lst_countColumns = [OnWhat + '_' + stage for stage, _ in stages]

    # Rename each stage's count column before merging. Merge suffixes are only
    # applied to overlapping column names, so relying on them needed a
    # hard-coded fix-up that worked for 'Aantal_Records' only.
    df = df_brondata[['Table', 'Projecten', OnWhat]].rename(columns={OnWhat: lst_countColumns[0]})
    for (_, frame), count_column in zip(stages[1:], lst_countColumns[1:]):
        stage_counts = frame[['Table', OnWhat]].rename(columns={OnWhat: count_column})
        df = df.merge(stage_counts, on=['Table'], how='outer')

    # Drop the tables that match an ignore pattern.
    is_ignored = df['Table'].map(inIgnore).astype(bool)
    df = df[~is_ignored]

    df = df.sort_values(lst_countColumns[0], ascending=False)
    # Tables absent from a stage get NaN from the outer merge; present them as 0.
    df[lst_countColumns] = df[lst_countColumns].fillna(0)
    df[lst_countColumns] = df[lst_countColumns].astype(int, errors='ignore')
    if table_filter is not None:
        df = df[df.Table.str.contains(table_filter, case=False)]

    return df.style.bar(subset=lst_countColumns, color='#5fba7d', vmax=vmax)

# Render the styled stage-by-stage overview for the record counts.
getOverview('Aantal_Records')

Unnamed: 0,Table,Projecten,Aantal_Records_brondata,Aantal_Records_staging,Aantal_Records_singlestore,Aantal_Records_singlestoreclean,Aantal_Records_doelsysteem
73,DIAVOORWERP,"['DC100', 'DB004']",735,4714,4714,7054,0
166,DIAOPGRAVING,['DB109'],122,7246,7246,12012,0


## Detailanalyse Brondata naar Staging

In [12]:
# Outer join on table name plus both counts: a table only pairs up when the
# name and both counts agree between Brondata and Staging.
df = pd.merge(
    df_brondata,
    df_staging,
    how='outer',
    on=['Table', 'Aantal_Records', 'Aantal_tabellen'],
    suffixes=("_brondata", "_staging"),
)
# Show the rows that have no partner in one of the two stages.
df[df['Stage_brondata'].isna() | df['Stage_staging'].isna()]

Unnamed: 0,Table,Projecten_brondata,Aantal_Records,Aantal_tabellen,Stage_brondata,Projecten_staging,Stage_staging
0,plattegrond_Polyline,[DB027],34071,1,Brondata,,
1,Periodisering,[DC116],32400,1,Brondata,,
3,AARDEWERK 2,"[DC005, DB001, DC116, DC060, DC116, DC033, DC0...",26557,34,Brondata,,
5,VONDSTENLIJST,"[DC117, DC087, DC101, DC064, DB114, DC039, DC0...",15204,63,Brondata,,
6,AARDEWERK 1,"[DB114, DB036, DB033, DB113, DC115, DB032, DB0...",15103,15,Brondata,,
...,...,...,...,...,...,...,...
669,History_Relations,,2,1,,[DB195],Staging
670,Bestanden,,2,1,,[DB195],Staging
671,Spoorrelaties,,2,1,,[DB195],Staging
672,Spijkeranalyse KV I,,2,1,,[DB003],Staging


## Detailanalyse Staging naar SingleStore

In [13]:
# Outer join on table name plus both counts: a table only pairs up when the
# name and both counts agree between Staging and SingleStore.
df = pd.merge(
    df_staging,
    df_singlestore,
    how='outer',
    on=['Table', 'Aantal_Records', 'Aantal_tabellen'],
    suffixes=("_staging", "_singlestore"),
)
# Show the rows that have no partner in one of the two stages.
df[df['Stage_singlestore'].isna() | df['Stage_staging'].isna()]

Unnamed: 0,Table,Projecten_staging,Aantal_Records,Aantal_tabellen,Stage_staging,Projecten_singlestore,Stage_singlestore
0,Periodisering,[DC116],64800,1,Staging,,
3,plattegrond_Polyline,[DB027],27462,1,Staging,,
4,plattegrond_Polyline_Shape_Index,[DB027],27408,1,Staging,,
9,kopie aardewerk2,[DC116],11238,1,Staging,,
10,TIJDELIJK AW2 NINA 07 07 2009,[DC116],11238,1,Staging,,
...,...,...,...,...,...,...,...
500,OPGRAVINGEN,,469,1,,[DELF],SingleStore
501,DANlijst,,257,1,,[DAN],SingleStore
502,DARlijst,,142,1,,[DAR],SingleStore
503,VINDPLAATSEN,,37,1,,[DELF],SingleStore


## Detailanalyse SingleStore naar Doelsysteem

In [14]:
# Outer join on table name plus both counts: a table only pairs up when the
# name and both counts agree between SingleStore and Doelsysteem.
df = pd.merge(
    df_singlestore,
    df_doelsysteem,
    how='outer',
    on=['Table', 'Aantal_Records', 'Aantal_tabellen'],
    suffixes=("_singlestore", "_doelsysteem"),
)
# Show the rows that have no partner in one of the two stages.
df[df['Stage_singlestore'].isna() | df['Stage_doelsysteem'].isna()]

Unnamed: 0,Table,Projecten_singlestore,Aantal_Records,Aantal_tabellen,Stage_singlestore,Projecten_doelsysteem,Stage_doelsysteem
0,AARDEWERK 2,"[DC114, DB036, DC027, DC005, DC062, DB006, DC0...",43734,32,SingleStore,,
1,Foto Totaal Tabel,[D],31469,1,SingleStore,,
2,BOT,"[DB113, DC024, DC057, DC060, DB114, DC069, DB0...",31060,10,SingleStore,,
3,VONDSTENLIJST,"[DC018, DC062, DC069, DC082, DB115, DC039, DC0...",23102,59,SingleStore,,
5,AARDEWERK 1,"[DB033, DC101, DB114, DC032, DB032, DB120, DC1...",15376,14,SingleStore,,
...,...,...,...,...,...,...,...
208,BOT (MENSELIJK),,5,1,,[DC116],Doelsysteem
209,Glas determinatietabel,,2,1,,[DC170],Doelsysteem
210,AARDEWERK_LME,,1,1,,[DC166],Doelsysteem
211,Veldvondsten,,1,1,,[DB195],Doelsysteem


In [45]:
# Scratch check of a DAN/DAR number pattern: this evaluates to None, because
# the 'T' between the prefix and the digits is matched by neither \s* nor [0-9]+.
re.match( r'^(DAN|DAR)\s*([0-9]+)$', 'DAN T 001', re.M|re.I)