In [1]:
import shared.config as config
import pymongo
import numpy as np
import pandas as pd
import json
import re

import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import reflection

In [2]:
# MongoDB handles. The connection URI and the database/collection names are
# all read from shared.config.
myclient = pymongo.MongoClient(str(config.MONGO_URI))
# Two databases on the same client: staging and analyse.
stagingDb = myclient[str(config.DB_STAGING)]
analyseDb = myclient[str(config.DB_ANALYSE)]
# Collections in the staging database.
stagingCol = stagingDb[config.COLL_PLAATJES]
stagingOud = stagingDb[config.COLL_STAGING_OUD]
stagingNieuw = stagingDb[config.COLL_STAGING_NIEUW]
# Collections in the analyse database; these two are the ones passed to
# getData() further down.
analyseCol = analyseDb[config.COLL_ANALYSE]
analyseColClean = analyseDb[config.COLL_ANALYSE_CLEAN]

# Addressed by its literal collection name rather than a config constant.
metaCollection = stagingDb['Kolominformatie']

# Fixed list of 'soort' names. It is turned into a one-column frame that
# getOverview() outer-merges on 'Soort', so every name listed here shows up in
# the overview even when no stage has records for it.
lst_soorten_meta = [
    'Metaal', 'Artefact', 'Glas', 'Stelling', 'Hout', 'Spijker',
    'Plaatsing', 'Keramiek', 'Put', 'Vondst', 'Standplaats', 'Aardewerk',
    'Munt', 'Muur', 'Skelet', 'Doos', 'Project', 'Spoor',
    'Vindplaats', 'Leer', 'Steen', 'Bot', 'Vlak', 'Kleipijp',
]
df_soorten_meta = pd.DataFrame({'Soort': lst_soorten_meta})

In [3]:
def _count_pipeline(soort_expr, only_soort=None):
    """Build a Mongo aggregation pipeline that counts documents per (project, soort).

    soort_expr is the field expression used as the 'soort' part of the group
    key; only_soort, when given, adds a leading $match on the 'soort' field.
    """
    stages = []
    if only_soort is not None:
        stages.append({"$match": {"soort": only_soort}})
    # Group on project + soort and count the documents in each group.
    stages.append({
        '$group': {
            '_id': {'project': "$brondata.project", 'soort': soort_expr},
            'teller': {"$sum": 1},
        }
    })
    # Flatten the compound _id into top-level project / soort / teller fields.
    stages.append({
        '$replaceRoot': {
            'newRoot': {
                'project': "$_id.project",
                'soort': "$_id.soort",
                'teller': {'$ifNull': ["$teller", 0]},
            }
        }
    })
    return stages


# Counts per (project, soort) over all documents.
grp_aggr = _count_pipeline("$soort")
# Counts per (project, artefactsoort), restricted to documents with soort 'Artefact'.
grp_aggr_artef = _count_pipeline("$artefactsoort", only_soort='Artefact')

def AggregateData(df, fase):
    """Summarise per-(project, soort) counts into one row per soort.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'soort', 'project' and 'teller'; 'teller'
        has to be convertible with ``pd.to_numeric``. The frame passed in is
        left untouched (the numeric conversion is done on a copy).
    fase : str
        Label written to the 'Stage' column of every output row.

    Returns
    -------
    pandas.DataFrame
        One row per soort with the columns 'Soort', 'Projecten' (list of the
        'project' values of the grouped rows), 'Aantal_Records' (sum of
        'teller'), 'Aantal_projecten' (length of that list) and 'Stage',
        ordered by 'Aantal_Records' descending with a fresh 0..n-1 index.
    """
    # assign() returns a new frame, so the caller's 'teller' column keeps its
    # original dtype.
    counts = df.assign(teller=pd.to_numeric(df['teller']))
    grouped = counts.groupby('soort').agg({'project': list, 'teller': 'sum'})
    # Number of grouped rows per soort, i.e. the length of the project list
    # (duplicates are counted, exactly as the list holds them).
    grouped['Count'] = grouped['project'].map(len)
    grouped['Stage'] = fase
    grouped = grouped.sort_values('teller', ascending=False).reset_index()
    return grouped.rename(columns={
        'soort': 'Soort',
        'project': 'Projecten',
        'teller': 'Aantal_Records',
        'Count': 'Aantal_projecten',
    })

def getData(collection, soort):
    """Count the documents of a Mongo collection per soort.

    Runs both aggregation pipelines on ``collection``: ``grp_aggr_artef``
    (artefacts, keyed by artefactsoort) and ``grp_aggr`` (all documents, keyed
    by soort). The generic 'Artefact' rows of the second result are dropped
    before the two results are combined.

    Parameters
    ----------
    collection : pymongo collection
        Collection to aggregate.
    soort : str
        Passed on to AggregateData as the stage label; it ends up in the
        'Stage' column of the result.

    Returns
    -------
    pandas.DataFrame
        The frame produced by AggregateData for the combined counts.
    """
    df_artefacts = pd.DataFrame(list(collection.aggregate(grp_aggr_artef)))
    df_all = pd.DataFrame(list(collection.aggregate(grp_aggr)))
    df_other = df_all[df_all['soort'] != 'Artefact']

    # pd.concat replaces DataFrame.append, which is deprecated (it raised the
    # FutureWarning shown under the cells) and removed in pandas 2.0.
    return AggregateData(pd.concat([df_artefacts, df_other]), soort)

In [4]:
# Per-soort counts for the analyse collection, labelled 'SingleStore'.
df_singlestore = getData(analyseCol, 'SingleStore')
df_singlestore.head(5)

  return AggregateData(df1.append(df2[df2.soort != 'Artefact']), soort)


Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Aardewerk,"[DC032, DC087, DB003, DB013, DB011, VORM, SPLI...",156121,81,SingleStore
1,Dierlijk_Bot,"[OPMERKING, INDIVIDU, DIGI_FO_NO, DP4, DC179, ...",71270,52,SingleStore
2,Foto,[nan],40149,1,SingleStore
3,Vondst,"[DC034, DB191, DC163, DC033, DB240, DC249, DB2...",39084,109,SingleStore
4,Vulling,"[DC232, HK, DB120, DB122, DC097, DB133, DB033,...",31816,117,SingleStore


In [5]:
# Per-soort counts for the cleaned analyse collection, labelled 'SingleStoreClean'.
df_singlestoreclean = getData(analyseColClean, 'SingleStoreClean')
df_singlestoreclean.head(5)

  return AggregateData(df1.append(df2[df2.soort != 'Artefact']), soort)


Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Aardewerk,"[[DC097, DC097], [DC088], [DC115, DC115, DC115...",72533,114,SingleStoreClean
1,Dierlijk_Bot,"[[VERGROEI], [DB113, DB113], [ORIENTATIE], [DB...",51351,67,SingleStoreClean
2,Foto,"[[D, DC003], [D, DB032], [D, DC115], [D, DB217...",40149,65,SingleStoreClean
3,Vulling,"[[DC150], [VKL], [MORTEL], [DC114], [DB227], [...",31816,117,SingleStoreClean
4,Fotokoppel,[[D]],31469,1,SingleStoreClean


In [6]:
# Tables that are queried for their 'brondata' column; the part of each name
# after the 4-character 'Def_' prefix is used as the 'soort' label. The
# trailing comment holds a shorter list of table names.
lst_tables = ['Def_Project', 'Def_Put', 'Def_Vondst', 'Def_Spoor', 'Def_Stelling', 'Def_Doos', 'Def_Standplaats', 'Def_Plaatsing', 'Def_Vlak', 'Def_Vindplaats', 'Def_Artefact'] #['Def_Project', 'Def_Vondst', 'Def_Stelling', 'Def_Plaatsing', 'Def_Vindplaats', 'Def_Artefact']
# Captures the value of the 'project' key from the textual brondata,
# i.e. the X in "'project': 'X'".
regexProject = re.compile(r'\'project\': \'(.*?)\'')

def getProject(brondata):
    """Return the project code embedded in a brondata string.

    Parameters
    ----------
    brondata : str or None
        Text containing a "'project': '<code>'" fragment.

    Returns
    -------
    str
        The captured project code, or "" when brondata is None, empty, or
        contains no project fragment.
    """
    # The original guard (`is None or != ""`) sent None into regex.search,
    # which raises TypeError; missing values belong in the "" branch.
    if brondata is None or brondata == "":
        return ""
    match = regexProject.search(brondata)
    # A string without a project fragment yields no match; report it as ""
    # instead of failing on None.group().
    return match.group(1) if match else ""

def _count_projects(df_brondata, soort):
    """Count rows per (project, soort) for one query result.

    df_brondata needs a 'brondata' column; soort is either a single label or a
    Series aligned with df_brondata. Returns a frame with the columns
    'project', 'soort' and 'teller'.
    """
    # Series.map avoids the row-wise DataFrame.apply of the earlier version.
    df_out = pd.DataFrame({'project': df_brondata['brondata'].map(getProject)})
    df_out['soort'] = soort
    return df_out.groupby(['project', 'soort']).size().reset_index(name='teller')


engine = create_engine(config.SQLALCHEMY_DATABASE_URI)

# Collect one small count frame per query and concatenate once at the end,
# instead of growing a DataFrame with pd.concat inside the loop.
lst_counts = []
with engine.connect() as connection:
    for table in lst_tables:
        df = pd.read_sql_query('SELECT brondata from "' + table + '"', connection)
        # table[4:] strips the 'Def_' prefix, e.g. 'Def_Vondst' -> 'Vondst'.
        lst_counts.append(_count_projects(df, table[4:]))

    # Artefacts are counted a second time, now keyed by their artefactsoort.
    # NOTE(review): 'Def_Artefact' is also in lst_tables, so these rows are
    # counted both as 'Artefact' and under their artefactsoort, whereas
    # getData() drops the generic 'Artefact' rows - confirm this is intended.
    df = pd.read_sql_query('SELECT brondata, artefactsoort from "Def_Artefact"', connection)
    lst_counts.append(_count_projects(df, df['artefactsoort']))

df_tables_projects = pd.concat(lst_counts)

df_doelsysteem = AggregateData(df_tables_projects, 'Doelsysteem')
df_doelsysteem.head(5)

Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Artefact,"[0, AANTAL, AANW, AFMETING, ARTEFACT, ASSOCIA,...",280571,266,Doelsysteem
1,Aardewerk,"[AANTAL, ARTEFACT, BAKSEL, CAT, DAT_BEG, DAT_E...",72533,81,Doelsysteem
2,Dierlijk_Bot,"[AANTAL, ARTEFACT, ASSOCIA, BEWERK, BRAND, DB0...",51351,52,Doelsysteem
3,Onbekend,"[AANTAL, ARTEFACT, DC160, GEWICHT, OPMERKING, ...",30054,6,Doelsysteem
4,Vondst,"[DB001, DB002, DB003, DB004, DB005, DB006, DB0...",29042,112,Doelsysteem


## Analyse van alle gegevens

Hieronder volgt een overzicht van alle gegevens die in de vier fases beschikbaar zijn. Zo kunnen fouten in de conversie worden opgespoord. 

In [7]:


def getOverview(OnWhat, vmax=20000):
    """Put one count column of the three stages side by side, per soort.

    Parameters
    ----------
    OnWhat : str
        Name of the count column to compare; it must exist in df_singlestore,
        df_singlestoreclean and df_doelsysteem (e.g. 'Aantal_Records' or
        'Aantal_projecten').
    vmax : int, optional
        Value at which the in-cell bars are full width. Defaults to 20000,
        the value that used to be hard-coded.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styled table with the columns 'Soort', '<OnWhat>_singlestore',
        '<OnWhat>_singlestoreclean', '<OnWhat>_doelsysteem' and 'Projecten'
        (the SingleStore project list), ordered by the SingleStore count
        descending. Soorten from df_soorten_meta that occur in no stage are
        included with zero counts.
    """
    lst_countColumns = [OnWhat + '_singlestore', OnWhat + '_singlestoreclean', OnWhat + '_doelsysteem']

    df = df_singlestore[['Soort', 'Projecten', OnWhat]].merge(
        df_singlestoreclean[['Soort', OnWhat]],
        on=['Soort'], how='outer', suffixes=("_singlestore", "_singlestoreclean"))
    # After the first merge no column is called OnWhat any more, so merge
    # suffixes would not be applied to the Doelsysteem column. Rename it
    # explicitly; the earlier hard-coded rename of 'Aantal_Records' broke
    # every other value of OnWhat with a KeyError.
    df = df.merge(
        df_doelsysteem[['Soort', OnWhat]].rename(columns={OnWhat: OnWhat + '_doelsysteem'}),
        on=['Soort'], how='outer')
    df = df.merge(df_soorten_meta, on=['Soort'], how='outer')

    # Sort after the last merge: outer merges may reorder rows by key
    # (pandas >= 2.2 sorts them lexicographically). Missing SingleStore counts
    # are still NaN here and therefore end up at the bottom.
    df = df.sort_values(OnWhat + '_singlestore', ascending=False, ignore_index=True)
    df[lst_countColumns] = df[lst_countColumns].fillna(0)
    df[lst_countColumns] = df[lst_countColumns].astype(int, errors='ignore')

    # Move the long project list behind the three count columns.
    column_to_move = df.pop("Projecten")
    df.insert(4, "Projecten", column_to_move)

    return df.style.bar(subset=lst_countColumns, color='#5fba7d', vmax=vmax)

# Overview of the record counts per soort across the stages; the returned
# Styler is the cell output.
getOverview('Aantal_Records')

Unnamed: 0,Soort,Aantal_Records_singlestore,Aantal_Records_singlestoreclean,Aantal_Records_doelsysteem,Projecten
0,Aardewerk,156121,72533,72533,"['DC032', 'DC087', 'DB003', 'DB013', 'DB011', 'VORM', 'SPLIT_ID', 'DC100', 'BAKSEL', 'MAI', 'DB014', 'DC115', 'DB002', 'DB034', 'DC101', 'DOOSNR', 'FOTO', 'DC057', 'DC160', 'DC114', 'DB120', 'DC097', 'DETER', 'DC033', 'DC116', 'DB114', 'DC065', 'TEK', 'DB217', 'DC026', 'DB004', 'FRAGM', 'DC072', 'DC112', 'DC088', 'DAT_EIND', 'DC036', 'DIAMETER', 'DB113', 'CAT', 'RIM', 'DC042', 'DC061', 'OPMERKING', 'DC098', 'PAST_AAN', 'DB015', 'DC027', 'DC064', 'DC005', 'DC054', 'DC022', 'ARTEFACT', 'DC062', 'MAT', 'DB006', 'DC021', 'DB133', 'DC029', 'DC024', 'AANTAL', 'DC069', 'DB012', 'DC060', 'DC020', 'DB241', 'GEWICHT', 'DB005', 'VERSIERING', 'DC067', 'DC083', 'DC037', 'DB001', 'DB112', 'DB010', 'DB032', 'DC077', 'DAT_BEG', 'DB033', 'DB036', 'SUBNO']"
1,Dierlijk_Bot,71270,51351,51351,"['OPMERKING', 'INDIVIDU', 'DIGI_FO_NO', 'DP4', 'DC179', 'MAAT3', 'AANTAL', 'SUBNO', 'DB119', 'DB148', 'SPECIES', 'ARTEFACT', 'DC069', 'DETER', 'MAAT1', 'DC054', 'TEKNO', 'SYM', 'DC067', 'DC057', 'DB122', 'P4', 'ORIENTATIE', 'DC160', 'DB113', 'DC005', 'KNAAG', 'DB006', 'MAAT4', 'VERGROEI', 'DC163', 'FRAGM', 'M1', 'ELEMENT', 'BEWERK', 'LENGTE', 'GRAF', 'M2', 'M3', 'DB114', 'DOOSNO', 'SPLIT_ID', 'MAAT2', 'LEEFTIJD', 'PERC', 'GEWICHT', 'BRAND', 'DC060', 'DC116', 'DC024', 'ASSOCIA', 'PATHOLOGIE']"
2,Foto,40149,40149,0,[nan]
3,Vondst,39084,29042,29042,"['DC034', 'DB191', 'DC163', 'DC033', 'DB240', 'DC249', 'DB215', 'DC032', 'DC101', 'DC160', 'DC029', 'DC091', 'DB005', '1', 'DB209', 'DC064', 'VAKNR', 'DC011', 'DB138', 'DC230', 'DC082', 'MATERIAAL', 'DC063', 'DC158', 'DC114', 'DB031', 'DC067', 'SEGMENT', 'DB202', 'VAKLETTER', 'DC069', 'DC097', 'DC170', 'DC171', 'DB108', 'DC018', 'DB112', 'DB006', 'DB032', 'DC042', 'DC117', 'DC053', 'DC022', 'DB033', 'VONDST', 'DB004', 'DB120', 'DB200', 'DC100', 'DC271', 'DB035', 'DB193', 'DB109', 'DC179', 'DC081', 'DC030', 'DC112', 'DB210', 'DC154', 'DC020', 'DB144', 'DC254', 'DB122', 'DB135', 'DC119', 'DC268', 'DC228', 'DC039', 'DC157', 'DC093', 'DC166', 'DC087', 'DC059', 'DB152', 'DC150', 'DB119', 'DB115', 'DC054', 'DC116', 'DC062', 'DC024', 'DC085', 'DB157', 'DC232', 'DC060', 'DC065', 'DC172', 'OPMERKING', 'DC041', 'VERZMWIJZE', 'DC066', 'DB034', 'SPOOR', 'DC026', 'DB213', 'DC044', 'DC115', 'DB133', 'DC084', 'VULLING', 'DB113', 'DC057', 'DB036', 'DC103', 'DC164', 'DB148', 'DC052', 'DB197', 'DB114']"
4,Vulling,31816,31816,0,"['DC232', 'HK', 'DB120', 'DB122', 'DC097', 'DB133', 'DB033', 'DC032', 'DB227', 'MUUR', 'FEMN', 'VULLING', 'DB034', 'MEDIAAN', 'DC230', 'LGINTERP', 'DC164', 'DC029', 'BIOTURB', 'DC170', 'DB200', 'DC065', 'LB2', 'DC030', 'DC267', 'DB240', 'HB2', 'FO', 'DC150', 'HOEK', 'DB035', 'DB138', 'HB3', 'DC087', 'DB222', 'PO', 'DB119', 'DB113', 'DB210', 'DC254', 'KLEUR', 'BB3', 'DC008', 'LB3', 'DC171', 'DB003', 'DB115', 'SPOOR', 'LSR', 'DC112', 'DC160', 'DB135', 'DB111', 'DC163', 'HLM', 'DB193', 'VKL', 'DB152', 'METSELVB', 'DC258', 'DB187', 'GEVLEKT', 'HERGEBR', 'DC166', 'DB191', 'DC274', 'BB2', 'BB1', 'PA', 'DC249', 'DB154', 'DB116', 'DC157', 'DC179', 'DC114', '10LGMT', 'DC271', 'DB144', 'LPR', 'DC116', 'DC091', 'DB197', 'DB209', 'DB112', 'DB202', 'DB032', 'BOUWMAT', 'DB109', 'DB157', 'DB215', 'DB195', 'DC084', 'LB1', 'MORTEL', 'REDUCTIE', 'DB192', 'DB121', 'DC268', 'TBIJMENG', 'DB236', 'DB148', 'OPMERKING', 'DC172', 'DB118', 'LAMINATIE', 'DC093', 'TEXTUUR', 'DC229', 'DC158', 'DC154', 'VOEGMAT', 'SUBLAAG', 'DC085', 'HB1', 'DC228', 'DB114', 'DB213']"
5,Fotokoppel,31469,31469,0,['D']
6,Onbekend,30054,30054,30054,"['GEWICHT', 'AANTAL', 'DC160', 'SPLIT_ID', 'ARTEFACT', 'OPMERKING']"
7,Spoor,21430,23103,23103,"['PROFIEL', 'DC172', 'DB192', 'DC118', 'DC232', 'DB117', 'DB114', 'DC024', 'DB210', 'DB144', 'DB240', 'DC166', 'TEKCOUPE', 'VORMCOUPE', 'DB006', 'DC154', 'DC030', 'DB109', 'DC228', 'DC164', 'DB120', 'DB209', 'DC160', 'DC093', 'DB122', '5', 'DC268', 'DB116', 'INTERPRET', 'DATERING', 'DC271', 'DC254', 'DC171', 'VLAK', 'DB202', 'DB215', 'DC163', 'DB148', 'DC084', 'DC065', 'SPOOR', 'DC114', 'DC230', 'DB003', 'DB222', 'DB111', 'DB152', 'DC179', 'VORM', 'DC150', 'DB187', 'DC170', 'DB035', 'DC101', 'DC092', 'MB', 'COUPELIJST', 'GECOUPEERD', 'DB135', 'DC249', 'DB138', 'DB034', 'DB119', 'DC029', 'DB195', 'DB154', 'DB191', 'DB118', 'DC091', 'DB197', 'DB200', 'DB115', 'PUT', 'DB121', 'DC116', 'DB133', 'DB236', 'AFGEWERKT', 'DB213', 'DC157', 'COUPENR', 'DC032', 'DIEPTE', 'DB032', 'DB033', 'DC085', 'DB157', 'DB112', 'DB113', 'DC087', 'DC097', 'DB193', 'DC158', 'DC112', 'DB036']"
8,Hout,15972,14669,14669,"['DB034', 'DC082', 'DIVERS', 'PUBL_CODE', 'DB148', 'DC052', 'DB215', 'DC063', 'TYPE_CODE', 'HOUT_CODE', 'DENDRO', 'DC076', 'DC029', 'DC026', 'DC032', 'DC170', 'DC164', 'DC045', 'DC101', 'DC057', 'BIAX_G', 'DAT_COMPL', 'DC036', 'DC022', 'DC053', 'DB240', 'STC', 'DC154', 'AANTAL', 'DC067', 'DC020', 'DC034', 'DB009', 'DAT_VRWP', 'DC033', 'GEBRUIK', 'DIG_FO_NO', 'PUNTVL', 'DC018', 'DC011', 'DC089', 'BIAX_C', 'DC088', 'GEWICHT', 'ARTEFACT', 'DC112', 'PRODCNM', '0', 'BESCHR', 'DB197', 'DC024', 'DET_W', 'VRWP', 'FRAGM', 'MAI', 'CONSERV', 'BEWERK', 'DETERM', 'DB033', 'DC060', 'DC098', 'LITERATUUR', 'FUNCTIE', 'DC230', 'DC091', 'DC013', 'DC041', 'DB202', 'DC115', 'DC061', 'MATEN', 'DC042', 'DC179', 'EXPO', 'DC069', 'DC005', 'DC023', 'RESTAU', 'DC116', 'D_BOOM', 'DB032', 'DC009', 'DB004', 'PUNTL', 'AANW', 'DC093', 'DC027', 'SUBNO', 'DOOSNO', 'DB005', 'DC097', 'SPLIT_ID', 'JSB', 'DC030', 'DB001', 'DC172', 'COMPLEET', 'DC103', 'DECORATIE', 'DC021', 'DC065', 'DC054', 'DC087', 'DC160', 'C14', 'DC085']"
9,Metaal,15774,13837,13837,"['DC024', 'TYPE_CODE', 'GROEP', 'BEWERKING', 'DB113', 'DC026', 'DC179', 'BESCHR', 'DC072', 'DC163', 'DB114', 'DC172', 'FRAGM', 'DC087', 'TEK_NO', 'AANTAL', 'COMPLEET', 'DB032', 'DB003', 'DC112', 'CONSERV', 'DB202', 'DC154', 'DAT_VRWP', 'DB004', 'GEWICHT', 'LITERATUUR', 'SUBNO', 'RESTAU', 'OPPERVLAK', 'DC232', 'DC097', 'DIG_FO_NO', 'DC116', 'DB133', 'DC067', 'TYPE_VRWP', 'DOOS_NO', 'DB034', 'DECORATIE', 'ARTEFACT', 'DETER', 'DB120', 'DC041', 'DC101', 'DC160', 'DC171', 'FUNCTIE', 'DIVERSE', 'DC150', 'MATEN', 'DB033', 'DAT_COMPL', 'DC014', 'DC157', 'MET_SOORT', 'PUBL_CODE', 'EXPO', 'PROD_CNM', 'SPLIT_ID']"


## Detailanalyse SingleStore naar Doelsysteem

In [8]:
# Outer-join SingleStore and Doelsysteem on soort AND both counts: a soort
# only pairs up when its record count and project count agree in both stages.
df = pd.merge(
    df_singlestore,
    df_doelsysteem,
    how='outer',
    on=['Soort', 'Aantal_Records', 'Aantal_projecten'],
    suffixes=("_singlestore", "_doelsysteem"),
)
# Rows lacking a stage label on either side found no partner, i.e. the soort
# is missing from one stage or its counts differ between the two.
df[df[['Stage_singlestore', 'Stage_doelsysteem']].isnull().any(axis=1)]

Unnamed: 0,Soort,Projecten_singlestore,Aantal_Records,Aantal_projecten,Stage_singlestore,Projecten_doelsysteem,Stage_doelsysteem
0,Aardewerk,"[DC032, DC087, DB003, DB013, DB011, VORM, SPLI...",156121,81,SingleStore,,
1,Dierlijk_Bot,"[OPMERKING, INDIVIDU, DIGI_FO_NO, DP4, DC179, ...",71270,52,SingleStore,,
2,Foto,[nan],40149,1,SingleStore,,
3,Vondst,"[DC034, DB191, DC163, DC033, DB240, DC249, DB2...",39084,109,SingleStore,,
4,Vulling,"[DC232, HK, DB120, DB122, DC097, DB133, DB033,...",31816,117,SingleStore,,
5,Fotokoppel,[D],31469,1,SingleStore,,
7,Spoor,"[PROFIEL, DC172, DB192, DC118, DC232, DB117, D...",21430,95,SingleStore,,
8,Hout,"[DB034, DC082, DIVERS, PUBL_CODE, DB148, DC052...",15972,106,SingleStore,,
9,Metaal,"[DC024, TYPE_CODE, GROEP, BEWERKING, DB113, DC...",15774,60,SingleStore,,
10,Kleipijp,"[DC067, DC023, DC057, DC036, DC033, DC093, BAS...",15160,60,SingleStore,,
