In [1]:
import shared.config as config
import pymongo
import numpy as np
import pandas as pd
import json
import re
import ast

import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import reflection

In [16]:
myclient = pymongo.MongoClient(str(config.MONGO_URI))
stagingDb = myclient[str(config.DB_STAGING)]
analyseDb = myclient[str(config.DB_ANALYSE)]
stagingCol = stagingDb[config.COLL_PLAATJES]
stagingOud = stagingDb[config.COLL_STAGING_OUD]
stagingNieuw = stagingDb[config.COLL_STAGING_NIEUW]
stagingMonster = stagingDb[config.COLL_STAGING_MONSTER]
analyseCol = analyseDb[config.COLL_ANALYSE]
analyseColClean = analyseDb[config.COLL_ANALYSE_CLEAN]

metaCollection = stagingDb['Kolominformatie']
AIRFLOW_WASSTRAAT_CONFIG = "./wasstraat_config/Wasstraat_Config_Harmonize.xlsx"

In [17]:
def AggregateData(df, fase): 
    df['teller'] = pd.to_numeric(df['teller'])
    df = df.groupby('table').agg({'projectcd':lambda x: list(x), 'teller':lambda x: sum(x)})
    df['Count'] = df.apply(lambda x: len(x.projectcd), axis=1)
    df['Stage'] = fase
    df = df.sort_values('teller', ascending=False).reset_index()
    df = df.rename(columns={'table': 'Table', 'projectcd': 'Projecten', 'teller': 'Aantal_Records', 'Count': 'Aantal_tabellen'}).sort_index()
    return df

In [18]:
xl = pd.read_excel(AIRFLOW_WASSTRAAT_CONFIG, None);
df_table = xl['Objecten']
ignore_kolommen = df_table[df_table['Object'] == 'Ignore']['Tabellen'].values[0]
ignore_lst = ast.literal_eval(ignore_kolommen)

ignore_lst

['.*backup.*', '.*kopie.*']

In [19]:
grp_aggr = [{"$match" : {'projectcd': {'$nin': ['MAGAZIJN', 'DELF-IT', 'Digifotos', 'D', 'DELF']}, 'table': { '$not': {'$regex':"^SYS.*"}}}},
            {"$group": { "_id": { 'projectcd': "$projectcd", 'table': "$table", 'teller': '$teller'}}},
            {'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$_id.teller"}}}]

df_brondata = AggregateData(pd.DataFrame(list(metaCollection.aggregate(grp_aggr))), 'Brondata')
df_brondata.head(5)

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,Monster_botanie_determinatie,[M],18083,1,Brondata
1,AARDEWERK 1,"[DB034, DC026, DC097, DC097]",6254,4,Brondata
2,VONDSTENLIJST,"[DC093, DC026, DC020, DC022, DC018, DB034, DC0...",4845,9,Brondata
3,Aardewerk,[DB034],4486,1,Brondata
4,Aardewerk 1 backup,[DB034],4465,1,Brondata


In [20]:
grp_aggr = [{"$match" : {'projectcd': {'$nin': ['MAGAZIJN', 'DELF-IT', 'Digifotos']}, 'table': { '$not': {'$regex':"^SYS.*"}}}}
           ,{"$group": { "_id": { 'projectcd': "$projectcd", 'table': "$table"}, 'teller': {"$sum": 1}}}
           ,{'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}}]

df_staging = AggregateData(pd.concat([pd.DataFrame(list(stagingOud.aggregate(grp_aggr))), pd.DataFrame(list(stagingNieuw.aggregate(grp_aggr))), pd.DataFrame(list(stagingMonster.aggregate(grp_aggr)))]), 'Staging')
df_staging.head(5)
df_staging[df_staging.Table.str.contains('VONDST')]

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
2,VONDSTENLIJST,"[DB034, DC018, DC093, DC020, DC011, DC097, DC1...",5547,9,Staging
46,VONDSTINHD,"[DC024, DC112]",329,2,Staging
51,VONDST,"[DC112, DC024]",270,2,Staging
65,LIJST VONDSTEN URGENTE CONSERVERING,[DB034],129,1,Staging


In [21]:
grp_aggr = [{'$group': {'_id': {'projectcd': "$brondata.projectcd", 'table': "$brondata.table"},'teller': {"$sum": 1}}}
           ,{'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}}]

df_singlestore = AggregateData(pd.DataFrame(list(analyseCol.aggregate(grp_aggr))), 'SingleStore')
df_singlestore.head(5)


Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,Foto Totaal Tabel,[D],31469,1,SingleStore
1,Monster_botanie_determinatie,[M],18083,1,SingleStore
2,magazijnlijst,[MAGAZIJN],15113,1,SingleStore
3,AARDEWERK 1,"[DC097, DB034, DC026]",6368,3,SingleStore
4,VONDSTENLIJST,"[DC112, DC093, DB034, DC018, DC020, DC011, DC0...",5547,9,SingleStore


In [22]:
#grp_aggr = [{'$group': {'_id': {'projectcd': "$brondata.projectcd", 'table': "$brondata.table"},'teller': {"$sum": 1}}}
#           ,{'$replaceRoot': {'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}}]

#df_singlestoreclean = AggregateData(pd.DataFrame(list(analyseColClean.aggregate(grp_aggr))), 'SingleStoreClean')
#df_singlestoreclean.head(5)

In [23]:
grp_aggr = [{"$unwind" : {
                "path" : "$wasstraat",
                "preserveNullAndEmptyArrays" : False
            }}, 
            {"$addFields" : {
                "table" : "$wasstraat.table"
            }},
            {'$group': {
                '_id': {'projectcd': "$projectcd", 'table': "$table"},
                'teller': {"$sum": 1}}},
            {'$replaceRoot': {
                'newRoot': {'projectcd': "$_id.projectcd", 'table': "$_id.table", 'teller': "$teller"}}}]

df_singlestoreclean = AggregateData(pd.DataFrame(list(analyseColClean.aggregate(grp_aggr))), 'SingleStoreClean')
df_singlestoreclean.head(5)

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,Foto Totaal Tabel,"[DC093, DC072, DC004, DC022, DC005, DC003, DC0...",36552,18,SingleStoreClean
1,Monster_botanie_determinatie,[M],18083,1,SingleStoreClean
2,magazijnlijst,"[DC034, DC039, MD018, DC092, DB001, DC062, DC1...",15113,209,SingleStoreClean
3,AARDEWERK 1,"[DC026, DC097, DB034]",6240,3,SingleStoreClean
4,DIAOPGRAVING,"[DC021, DC023, DC093, DC024, DC026, DC003, DC0...",5861,11,SingleStoreClean


In [24]:
lst_tables = ['Def_Project', 'Def_Vondst', 'Def_Stelling', 'Def_Plaatsing', 'Def_Vindplaats', 'Def_Artefact', 'Def_Spoor', 'Def_Doos', 'Def_Monster', 'Def_Monster', 'Def_Monster_Schelp', 'Def_Monster_Botanie']
regexTable = re.compile(r'\'table\': \'(.*?)\'') # regex to replace Object
regexProject = re.compile(r'\'projectcd\': \'(.*?)\'') # regex to replace Object

def getTable(brondata):    
    #print(brondata)
    if brondata and brondata != "":
        srch = regexTable.search(brondata)
        return srch.group(1) if srch else None
    else: 
        return "" 
def getProject(brondata):    
    #print(brondata)
    if brondata and brondata != "":
        srch = regexProject.search(brondata)
        return srch.group(1) if srch else None
    else: 
        return "" 

df_tables_projects = pd.DataFrame()
    
engine = create_engine(config.SQLALCHEMY_DATABASE_URI)
with engine.connect() as connection:
    for table in lst_tables:     
        df = pd.read_sql_query('SELECT brondata from "' + table + '"', connection)
        
        df_out = pd.DataFrame()
        df_out['table'] = df.apply(lambda x: getTable(x['brondata']), axis=1)
        df_out['projectcd'] = df.apply(lambda x: getProject(x['brondata']) if x['brondata'] else "", axis=1)
        df_tables_projects = pd.concat([df_tables_projects, pd.DataFrame(df_out).groupby(['projectcd', 'table']).size().reset_index(name='teller')])
    
df_doelsysteem = AggregateData(df_tables_projects, 'Doelsysteem')
df_doelsysteem.head(5)

Unnamed: 0,Table,Projecten,Aantal_Records,Aantal_tabellen,Stage
0,Monster_botanie_determinatie,[M],18083,1,Doelsysteem
1,magazijnlijst,"[MAGAZIJN, MAGAZIJN]",9479,2,Doelsysteem
2,AARDEWERK 1,"[DB034, DC026, DC097]",5427,3,Doelsysteem
3,VONDSTENLIJST,"[DB034, DC011, DC018, DC020, DC022, DC026, DC0...",4779,9,Doelsysteem
4,Aardewerk,[DB034],4280,1,Doelsysteem


## Analyse van alle gegevens

Hieronder volgt een overzicht van alle gegevens die in de vier fases beschikbaar zijn. Zo kunnen fouten in de conversie worden opgespoord. 

In [25]:
def getOverview(OnWhat): 
    lst_countColumns = [OnWhat + '_brondata', OnWhat + '_staging', OnWhat + '_singlestore', OnWhat + '_singlestoreclean', OnWhat + '_doelsysteem']
    
    df = df_brondata[['Table', 'Projecten', OnWhat]].merge(df_staging[['Table', OnWhat]], on=['Table'], how='outer', suffixes=("_brondata", "_staging"))
    df = df.merge(df_singlestore[['Table', OnWhat]], on=['Table'], how='outer', suffixes=("_staging", "_singlestore"))
    df = df.merge(df_singlestoreclean[['Table', OnWhat]], on=['Table'], how='outer', suffixes=("_singlestore", "_singlestoreclean"))
    df = df.merge(df_doelsysteem[['Table', OnWhat]], on=['Table'], how='outer', suffixes=("_singlestoreclean", "_doelsysteem"))

    df = df.sort_values(OnWhat+ '_brondata', ascending=False) 
    df.rename(columns={'Aantal_Records': 'Aantal_Records_doelsysteem'}, inplace=True)
    df[lst_countColumns] = df[lst_countColumns].fillna(0)
    df[lst_countColumns] = df[lst_countColumns].astype(int, errors='ignore')
    #df = df[df.Table.str.contains('vondst', case=False)]
    df = df.style.bar(subset=lst_countColumns, color='#5fba7d', vmax=6000)
    
    return df

getOverview('Aantal_Records')

Unnamed: 0,Table,Projecten,Aantal_Records_brondata,Aantal_Records_staging,Aantal_Records_singlestore,Aantal_Records_singlestoreclean,Aantal_Records_doelsysteem
0,Monster_botanie_determinatie,['M'],18083,18083,18083,18083,18083
1,AARDEWERK 1,"['DB034', 'DC026', 'DC097', 'DC097']",6254,6368,6368,6240,5427
2,VONDSTENLIJST,"['DC093', 'DC026', 'DC020', 'DC022', 'DC018', 'DB034', 'DC011', 'DC112', 'DC097']",4845,5547,5547,4845,4779
3,Aardewerk,['DB034'],4486,4486,4486,4486,4280
4,Aardewerk 1 backup,['DB034'],4465,4465,0,0,0
5,AARDEWERK 2,"['DC020', 'DC005', 'DC021', 'DC026', 'DC072']",1724,1722,1722,1722,1721
6,AARDEWERK 3,['DC026'],1675,1675,1675,1660,809
7,Monster_waardering,['M'],1497,1497,1497,1497,1270
8,Monster_gegevens,['M'],1497,1496,1496,1496,1726
9,Controle vondsten,['DB034'],1466,1466,0,0,0


## Detailanalyse Brondata naar Staging

In [12]:
df = df_brondata.merge(df_staging, on=['Table', 'Aantal_Records', 'Aantal_tabellen'], how='outer', suffixes=("_brondata", "_staging"))
df[df.Stage_brondata.isnull() | df.Stage_staging.isnull() ]

Unnamed: 0,Table,Projecten_brondata,Aantal_Records,Aantal_tabellen,Stage_brondata,Projecten_staging,Stage_staging
0,Monster_botanie_determinatie,[M],18083,1,Brondata,,
1,AARDEWERK 1,"[DB034, DC097, DC097, DC026]",6254,4,Brondata,,
2,VONDSTENLIJST,"[DC026, DC093, DC097, DC112, DC011, DB034, DC0...",4845,9,Brondata,,
5,AARDEWERK 2,"[DC005, DC020, DC072, DC026, DC021]",1724,5,Brondata,,
7,Monster_waardering,[M],1497,1,Brondata,,
...,...,...,...,...,...,...,...
214,Ref_sexe,,6,1,,[DC112],Staging
215,GLAS GLASSOORT,,6,1,,[DC024],Staging
216,DB034_P11_V0_C018_SPO,,4,1,,[DB034],Staging
217,Z_associatie,,3,1,,[DC112],Staging


## Detailanalyse Staging naar SingleStore

In [13]:
df = df_staging.merge(df_singlestore, on=['Table', 'Aantal_Records', 'Aantal_tabellen'], how='outer', suffixes=("_staging", "_singlestore"))
df[df.Stage_singlestore.isnull() | df.Stage_staging.isnull() ]

Unnamed: 0,Table,Projecten_staging,Aantal_Records,Aantal_tabellen,Stage_staging,Projecten_singlestore,Stage_singlestore
2,DC024,[opgravingDC024],5280,1,Staging,,
4,Aardewerk 1 backup,[DB034],4465,1,Staging,,
6,ONG-lijst,[DC026],2708,1,Staging,,
8,Kopie van AARDEWERK 1,[DC097],1860,1,Staging,,
11,Controle vondsten,[DB034],1466,1,Staging,,
...,...,...,...,...,...,...,...
136,Monster_gegevens,,1496,1,,[M],SingleStore
137,Monster_schelp_determinatie,,1082,1,,[M],SingleStore
138,OPGRAVINGEN,,840,1,,[DELF],SingleStore
139,VINDPLAATSEN,,37,1,,[DELF],SingleStore


## Detailanalyse SingleStore naar Doelsysteem

In [14]:
df = df_singlestore.merge(df_doelsysteem, on=['Table', 'Aantal_Records', 'Aantal_tabellen'], how='outer', suffixes=("_singlestore", "_doelsysteem"))
df[df.Stage_singlestore.isnull() | df.Stage_doelsysteem.isnull() ]

Unnamed: 0,Table,Projecten_singlestore,Aantal_Records,Aantal_tabellen,Stage_singlestore,Projecten_doelsysteem,Stage_doelsysteem
0,Foto Totaal Tabel,[D],31469,1,SingleStore,,
2,magazijnlijst,[MAGAZIJN],15113,1,SingleStore,,
3,AARDEWERK 1,"[DC097, DB034, DC026]",6368,3,SingleStore,,
4,VONDSTENLIJST,"[DC112, DC093, DB034, DC018, DC020, DC011, DC0...",5547,9,SingleStore,,
5,Aardewerk,[DB034],4486,1,SingleStore,,
6,DIAOPGRAVING,"[DC024, DB034, DC004, DC022, DC093, DC023, DC0...",3265,11,SingleStore,,
7,doosnr,[MAGAZIJN],2773,1,SingleStore,,
9,AARDEWERK 2,"[DC021, DC026, DC072, DC005, DC020]",1722,5,SingleStore,,
10,AARDEWERK 3,[DC026],1675,1,SingleStore,,
11,Monster_waardering,[M],1497,1,SingleStore,,


In [15]:
# Experiments to be able to parse the mongo bson strings to either JSON or dict
# Not wirking :(((


startQ_regex = re.compile(r"({|\(|,\s*|:\s*)\'") # regex to replace first quote of words to double quaote
endQ_regex = re.compile(r"\'(:|\)|,|})") # regex to replace last quote of words to double quaote
regex = re.compile(r'ObjectId\((.*)\)') # regex to replace Object

# Does not work :(
def getFromJson(brondata):    
    if not brondata or brondata != "":
        brondata = brondata.replace('\"', '\'')
        brondata = startQ_regex.sub('\\1"', brondata)
        brondata = endQ_regex.sub('"\\1', brondata)
        brondata = regex.sub('\\1', brondata)
        brondata = brondata.replace('),', ',') # Quick fix because one ) could not be removed
        print(brondata)
        obj = json.loads(brondata)
        
        return obj
    else: 
        return {} 


#import re
#from bson.json_util import dumps, loads

#p = re.compile(r'blue (?P<animal>dog|cat)')
#p.sub(r'gray \g<animal>',s)
#re.sub(
#    pattern=r'ObjectId\((.*)\)', 
#    repl='\\1', 
#    string=str
#)

str = "{'_id': ObjectId('61e5f4a0ef919b0974d0395d'), 'LOKATIE': 'H:\\GEMEENTES\\PLAATS\\Opgravingen\\NAAM\\opgravingCODE.mdb', 'CODE': 'PN023', 'TOPONIEM': "Karitaat 'Molensloot", 'OPGRAVING': 'Vispaaiplaats Ruijven', 'CODENAAM': 'PN023', 'KAARTBLAD': '37E', 'XCOORD': 87523, 'YCOORD': 445002, 'JAAR': 2013, 'VONDSTENLIJST': 0, 'SPOREN': 0, 'DIAOPGRAVING': 0, 'DIAVOORWERP': 0, 'TEKENINGEN': 0, 'ROMEINS AARDEWERK': 0, 'AARDEWERK 1': 0, 'AARDEWERK 2': 0, 'KLEIPIJPEN': 0, 'TERRA COTTA': 0, 'GLAS': 0, 'BEEN': 0, 'BOT': 0, 'HOORN': 0, 'IVOOR': 0, 'HOUT': 0, 'METAAL': 0, 'MUNTEN EN PENNINGEN': 0, 'STEEN': 0, 'LEER': 0, 'TEXTIEL': 0, 'MODERN': 0, 'BIOLOGISCH': 0, 'SPECIAL': 0, 'IJZ': 0, 'ROM': 0, 'MIDDELEEUWS OF LATER': 0, 'VME': 0, 'LME': 0, 'LMEA': 0, 'LMEB': 0, 'NT': 0, 'WAARNEMING': 57944, 'table': 'OPGRAVINGEN', 'projectcd': 'DELF-IT', 'bron': 'opgravingDELF-IT', 'loadtime': '2022-01-17T22:58:31+00:00'}"
#str = str.replace("\\", "/")
#str = str.replace("\'", "###")
#str = str.replace("\"", "\'")
#str = str.replace("###", "\"")


#str = re.sub(
#    pattern=r'ObjectId\((.*)\)', 
#    repl='\\1', 
#    string=str)

#json.loads(str)
#str = re.sub(
#    pattern=r"\'(.*?)\'(:|}|,)", 
#    repl='"\\1"\\2', 
#    string=str
#)
from ast import literal_eval
#d = eval(str)
#d['_id']

import yaml
yaml.load(str)

SyntaxError: invalid decimal literal (840369390.py, line 36)

In [None]:
grp_aggr = [{"$match" : {'projectcd': {'$nin': ['MAGAZIJN', 'DELF-IT', 'Digifotos']}, 'table': { '$not': {'$regex':"^SYS.*"}}}}]

df_text = pd.DataFrame(list(metaCollection.aggregate(grp_aggr)))
text = ' '.join(list(df_text['name']))
text

In [None]:
# Import package
import matplotlib.pyplot as plt
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    # Set figure size
    plt.figure(figsize=(40, 30))
    # Display image
    plt.imshow(wordcloud) 
    # No axis details
    plt.axis("off")

# Import package
from wordcloud import WordCloud, STOPWORDS
# Generate word cloud
wordcloud = WordCloud(width= 3000, height = 2000, collocations=False, stopwords = STOPWORDS).generate(text)
# Plot
plot_cloud(wordcloud)