In [1]:
import config
import pymongo
import numpy as np
import pandas as pd
import json
import re

import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import reflection

In [2]:
# MongoDB client; the URI and all database/collection names come from the local `config` module.
myclient = pymongo.MongoClient(str(config.MONGO_URI))

# Two databases are used: one named by DB_STAGING and one named by DB_ANALYSE.
stagingDb = myclient[str(config.DB_STAGING)]
analyseDb = myclient[str(config.DB_ANALYSE)]

# Collection handles in the staging database.
# NOTE(review): stagingCol, stagingOud and stagingNieuw are not referenced in the cells visible here.
stagingCol = stagingDb[config.COLL_PLAATJES]
stagingOud = stagingDb[config.COLL_STAGING_OUD]
stagingNieuw = stagingDb[config.COLL_STAGING_NIEUW]

# Collection handles in the analysis database; these are the inputs of getData() further down.
analyseCol = analyseDb[config.COLL_ANALYSE]
analyseColClean = analyseDb[config.COLL_ANALYSE_CLEAN]

# Collection with the fixed name 'Kolominformatie' in the staging database.
# NOTE(review): not referenced in the cells visible here.
metaCollection = stagingDb['Kolominformatie']

# Record kinds ("soorten") that must always show up in the overview, even when
# no stage contains a single record of that kind.
lst_soorten_meta = [
    'Metaal', 'Artefact', 'Glas', 'Stelling',
    'Hout', 'Spijker', 'Plaatsing', 'Keramiek',
    'Put', 'Vondst', 'Standplaats', 'Aardewerk',
    'Munt', 'Muur', 'Skelet', 'Doos',
    'Project', 'Spoor', 'Vindplaats', 'Leer',
    'Steen', 'Bot', 'Vlak', 'Kleipijp',
]

# Single-column frame ('Soort') so the list can be outer-merged into the overview.
df_soorten_meta = pd.DataFrame(lst_soorten_meta, columns=['Soort'])

In [3]:
def _countPerProjectStages(soortField):
    """Return the two pipeline stages that count documents per (project, kind).

    `soortField` is the field path (e.g. "$soort") whose value is used as the kind.
    """
    return [
        # Count the documents for every combination of project and kind.
        {'$group': {'_id': {'project': "$brondata.project", 'soort': soortField},
                    'teller': {"$sum": 1}}},
        # Flatten the compound _id into top-level 'project' / 'soort' fields.
        {'$replaceRoot': {'newRoot': {'project': "$_id.project",
                                      'soort': "$_id.soort",
                                      'teller': {'$ifNull': ["$teller", 0]}}}},
    ]


# All documents, grouped by their 'soort' field.
grp_aggr = _countPerProjectStages("$soort")

# Only documents with soort 'Artefact', grouped by their 'artefactsoort' field.
grp_aggr_artef = [{"$match": {"soort": 'Artefact'}}] + _countPerProjectStages("$artefactsoort")

def AggregateData(df, fase):
    """Summarise per-project record counts into one row per kind ('soort').

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'project', 'soort' and 'teller' (the record count
        for that project/kind combination). The frame is NOT modified.
    fase : str
        Label written to the 'Stage' column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Soort', 'Projecten' (list of projects), 'Aantal_Records'
        (sum of 'teller'), 'Aantal_projecten' (length of 'Projecten') and
        'Stage', sorted by 'Aantal_Records' in descending order.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when 'teller' holds a value that
        cannot be parsed as a number.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original 'teller' column (the previous version overwrote it in place).
    counts = df.assign(teller=pd.to_numeric(df['teller']))

    per_soort = counts.groupby('soort').agg({'project': list, 'teller': 'sum'})
    # Length of the project list; a missing project (NaN) counts as one entry.
    per_soort['Count'] = per_soort['project'].map(len)
    per_soort['Stage'] = fase

    per_soort = per_soort.sort_values('teller', ascending=False).reset_index()
    return per_soort.rename(columns={
        'soort': 'Soort',
        'project': 'Projecten',
        'teller': 'Aantal_Records',
        'Count': 'Aantal_projecten',
    })

def getData(collection, soort):
    """Count the records per kind in a MongoDB collection.

    Runs both module-level aggregation pipelines on `collection`:
    `grp_aggr_artef` (documents with soort 'Artefact', grouped by
    'artefactsoort') and `grp_aggr` (all documents, grouped by 'soort').
    The generic 'Artefact' rows of the second result are dropped before the
    two results are combined and summarised with AggregateData().

    Parameters
    ----------
    collection
        Object with an ``aggregate(pipeline)`` method (a pymongo collection).
    soort : str
        Despite its name this is the stage label; it is passed on as the
        `fase` argument of AggregateData() and ends up in the 'Stage' column.

    Returns
    -------
    pandas.DataFrame
        The result of AggregateData() for the combined counts.
    """
    df_artefact = pd.DataFrame(list(collection.aggregate(grp_aggr_artef)))
    df_all = pd.DataFrame(list(collection.aggregate(grp_aggr)))
    df_other = df_all[df_all['soort'] != 'Artefact']

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat gives the same result on every pandas version.
    return AggregateData(pd.concat([df_artefact, df_other]), soort)

In [4]:
# Record counts per kind for the collection behind analyseCol, labelled 'SingleStore';
# preview the first five rows (getData returns them sorted by record count, descending).
df_singlestore = getData(analyseCol, 'SingleStore')
df_singlestore.head(5)

Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Aardewerk,"[DC97, HHD2, DC21, DC22, DC36, DC37, DB34, DC0...",9631,15,SingleStore
1,Standplaats,[MAGAZIJN],8407,1,SingleStore
2,Plaatsing,[MAGAZIJN],5634,1,SingleStore
3,Foto,[nan],4299,1,SingleStore
4,Vondst,"[DC11, DC22, DC24_STADSKANTOOR, DC30, DC29, DC...",4197,18,SingleStore


In [5]:
# Same counts for the collection behind analyseColClean, labelled 'SingleStoreClean';
# preview the first five rows (sorted by record count, descending).
df_singlestoreclean = getData(analyseColClean, 'SingleStoreClean')
df_singlestoreclean.head(5)

Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Aardewerk,"[HHD2, DC97, DC21, DC36, DC22, DC05, DB34, DC3...",9631,15,SingleStoreClean
1,Standplaats,[MAGAZIJN],8407,1,SingleStoreClean
2,Vondst,"[DC032, DC34, DC32, DC030, DC179, DC023, DC021...",6309,45,SingleStoreClean
3,Plaatsing,[MAGAZIJN],5634,1,SingleStoreClean
4,Foto,[nan],4299,1,SingleStoreClean


In [6]:
# Tables that are counted per project; the kind name is the table name
# without its first four characters (the 'Def_' prefix).
lst_tables = [
    'Def_Project',
    'Def_Put',
    'Def_Vondst',
    'Def_Spoor',
    'Def_Stelling',
    'Def_Doos',
    'Def_Standplaats',
    'Def_Plaatsing',
    'Def_Vlak',
    'Def_Vindplaats',
    'Def_Artefact',
]
# Captures the value of the 'project' key from text of the form "... 'project': 'DC97' ...".
regexProject = re.compile(r"'project': '(.*?)'")

def getProject(brondata):
    """Extract the project code from a `brondata` value.

    Parameters
    ----------
    brondata
        Text that contains a fragment of the form ``'project': '<code>'``.
        Non-string values are converted with ``str()`` before searching.

    Returns
    -------
    str
        The captured project code, or "" when `brondata` is None, empty, or
        contains no such fragment.
    """
    # The previous guard (`is None or != ""`) sent None into the regex search,
    # which raised TypeError for NULL database values.
    if brondata is None or brondata == "":
        return ""

    match = regexProject.search(str(brondata))
    # No 'project' fragment: fall back to "" instead of failing on None.group().
    return match.group(1) if match else ""

def _countRecordsPerProject(connection, table, soort_column=None):
    """Count the rows of one table per (project, kind).

    Parameters
    ----------
    connection
        Open SQLAlchemy connection used for the query.
    table : str
        Table name; must come from trusted code because it is concatenated
        into the SQL text.
    soort_column : str, optional
        Column whose value is used as the kind. When omitted, every row gets
        the table name without its first four characters as kind.

    Returns
    -------
    pandas.DataFrame
        Columns 'project', 'soort' and 'teller' (number of rows).
    """
    columns = 'brondata' if soort_column is None else 'brondata, ' + soort_column
    records = pd.read_sql_query('SELECT ' + columns + ' from "' + table + '"', connection)

    labelled = pd.DataFrame({
        # Series.map also works for an empty table, unlike a row-wise apply.
        'project': records['brondata'].map(getProject),
        'soort': table[4:] if soort_column is None else records[soort_column],
    })
    return labelled.groupby(['project', 'soort']).size().reset_index(name='teller')


engine = create_engine(config.SQLALCHEMY_DATABASE_URI)
with engine.connect() as connection:
    # One frame per table, collected in a list and concatenated once below
    # (instead of growing a DataFrame inside the loop).
    lst_counts = [_countRecordsPerProject(connection, table) for table in lst_tables]

    # Artefacts are counted a second time, now split by their 'artefactsoort'.
    # NOTE(review): 'Def_Artefact' is also part of lst_tables, so the target system
    # gets a generic 'Artefact' row next to the per-artefactsoort rows, whereas
    # getData() drops the generic 'Artefact' rows - confirm this is intended.
    lst_counts.append(_countRecordsPerProject(connection, 'Def_Artefact', soort_column='artefactsoort'))

df_tables_projects = pd.concat(lst_counts)

df_doelsysteem = AggregateData(df_tables_projects, 'Doelsysteem')
df_doelsysteem.head(5)

Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Artefact,"[DB08, DB34, DC05, DC09, DC11, DC112, DC13, DC...",14932,29,Doelsysteem
1,Aardewerk,"[DB34, DC05, DC112, DC20, DC21, DC22, DC24_STA...",9631,15,Doelsysteem
2,Standplaats,[MAGAZIJN],8407,1,Doelsysteem
3,Vondst,"[DB008, DB034, DB34, DC004, DC005, DC008, DC00...",6309,45,Doelsysteem
4,Plaatsing,[MAGAZIJN],5634,1,Doelsysteem


## Analyse van alle gegevens

Hieronder volgt een overzicht van alle gegevens die in de vier fases beschikbaar zijn. Zo kunnen fouten in de conversie worden opgespoord. 

In [7]:


def getOverview(OnWhat, vmax=20000):
    """Build a styled side-by-side overview of one count column for all stages.

    Reads the module-level frames `df_singlestore`, `df_singlestoreclean`,
    `df_doelsysteem` and `df_soorten_meta`.

    Parameters
    ----------
    OnWhat : str
        Name of the count column to compare; it must exist in all three stage
        frames (e.g. 'Aantal_Records' or 'Aantal_projecten').
    vmax : int, optional
        Value that corresponds to a full-width bar in the styled output.
        Defaults to 20000, the value that used to be hard-coded.

    Returns
    -------
    pandas.io.formats.style.Styler
        One row per kind with the columns 'Soort', '<OnWhat>_singlestore',
        '<OnWhat>_singlestoreclean', '<OnWhat>_doelsysteem' and 'Projecten'
        (the project list of the SingleStore stage). Kinds missing in a stage
        get the count 0.
    """
    lst_countColumns = [OnWhat + '_singlestore', OnWhat + '_singlestoreclean', OnWhat + '_doelsysteem']

    df = df_singlestore[['Soort', 'Projecten', OnWhat]].merge(
        df_singlestoreclean[['Soort', OnWhat]],
        on=['Soort'], how='outer', suffixes=("_singlestore", "_singlestoreclean"))

    # After the first merge no column is called `OnWhat` any more, so merge
    # suffixes would not be applied to the third frame. Rename it explicitly,
    # using OnWhat instead of the previously hard-coded 'Aantal_Records' (which
    # made every other OnWhat value fail with a KeyError).
    df = df.merge(
        df_doelsysteem[['Soort', OnWhat]].rename(columns={OnWhat: OnWhat + '_doelsysteem'}),
        on=['Soort'], how='outer')

    df = df.sort_values(OnWhat + '_singlestore', ascending=False)
    # Add the kinds that occur in none of the stages.
    df = df.merge(df_soorten_meta, on=['Soort'], how='outer')
    df[lst_countColumns] = df[lst_countColumns].fillna(0).astype(int, errors='ignore')

    # Show the (long) project list as the last column.
    df = df[['Soort'] + lst_countColumns + ['Projecten']]

    return df.style.bar(subset=lst_countColumns, color='#5fba7d', vmax=vmax)

# Compare the number of records per kind across the three stages; the styled
# table returned by getOverview is displayed as the cell output.
getOverview('Aantal_Records')

Unnamed: 0,Soort,Aantal_Records_singlestore,Aantal_Records_singlestoreclean,Aantal_Records_doelsysteem,Projecten
0,Aardewerk,9631,9631,9631,"['DC97', 'HHD2', 'DC21', 'DC22', 'DC36', 'DC37', 'DB34', 'DC05', 'DC20', 'DC29', 'DC24_STADSKANTOOR', 'DC33', 'DC72', 'DC112', 'DC27']"
1,Standplaats,8407,8407,8407,['MAGAZIJN']
2,Plaatsing,5634,5634,5634,['MAGAZIJN']
3,Foto,4299,4299,0,[nan]
4,Vondst,4197,6309,6309,"['DC11', 'DC22', 'DC24_STADSKANTOOR', 'DC30', 'DC29', 'DC154', 'DB34', 'DC20', 'DC18', 'DC97', 'DC112', 'DC33', 'DC39', 'DC93', 'HHD2', 'DC179', 'DC34', 'DC32']"
5,Doos,2773,2830,2830,['MAGAZIJN']
6,Spoor,1655,2127,2127,"['DC30', 'DC179', 'DC97', 'DC29', 'DC112', 'DC24_STADSKANTOOR', 'HHD2', 'DB34', 'DC93', 'DC154']"
7,Hout,1090,1090,1090,"['DC112', 'DC20', 'HHD2', 'DC32', 'DB34', 'DC97', 'DC05', 'DC24_STADSKANTOOR', 'DC154', 'DC36', 'DC13', 'DC23', 'DC33', 'DC34', 'DC18', 'DC22', 'DC29', 'DC30', 'DC179', 'DC09', 'DC27', 'DC21', 'DC11', 'DC93']"
8,Kleipijp,882,882,882,"['DC24_STADSKANTOOR', 'DC23', 'DC27', 'DC14', 'DC29', 'DC21', 'DC20', 'DC112', 'DB08', 'DC30', 'DC36', 'DC93', 'DC154', 'DC33', 'DC39', 'DC18']"
9,Project,840,840,840,['DELF-IT']


## Detailanalyse SingleStore naar Doelsysteem

In [8]:
# Outer-join both stages on kind AND both counts: a kind whose counts differ
# between the stages cannot be matched and therefore yields two half-empty rows.
df = pd.merge(
    df_singlestore,
    df_doelsysteem,
    how='outer',
    on=['Soort', 'Aantal_Records', 'Aantal_projecten'],
    suffixes=("_singlestore", "_doelsysteem"),
)

# Keep only the unmatched rows, i.e. the kinds that differ between the stages.
df[df['Stage_singlestore'].isnull() | df['Stage_doelsysteem'].isnull()]

Unnamed: 0,Soort,Projecten_singlestore,Aantal_Records,Aantal_projecten,Stage_singlestore,Projecten_doelsysteem,Stage_doelsysteem
3,Foto,[nan],4299,1,SingleStore,,
4,Vondst,"[DC22, DC33, DC32, DC112, DC34, DC30, DC24_STA...",4197,18,SingleStore,,
5,Doos,[MAGAZIJN],2773,1,SingleStore,,
6,Spoor,"[DC93, DB34, DC29, DC30, DC97, HHD2, DC179, DC...",1655,10,SingleStore,,
19,Vlak,"[DC154, DC24_STADSKANTOOR, DC179]",34,3,SingleStore,,
22,Put,"[DC179, DC154, DC24_STADSKANTOOR]",13,3,SingleStore,,
23,Artefact,,14932,29,,"[DB08, DB34, DC05, DC09, DC11, DC112, DC13, DC...",Doelsysteem
24,Vondst,,6309,45,,"[DB008, DB034, DB34, DC004, DC005, DC008, DC00...",Doelsysteem
25,Doos,,2830,15,,"[DB034, DC005, DC011, DC018, DC021, DC024, DC0...",Doelsysteem
26,Spoor,,2127,16,,"[DB034, DB34, DC033, DC039, DC093, DC097, DC11...",Doelsysteem
