In [1]:
import config
import pymongo
import numpy as np
import pandas as pd
import json
import re

import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import reflection

In [10]:
# --- MongoDB handles --------------------------------------------------------
# One client for the whole notebook; database and collection names all come
# from the local `config` module.
myclient = pymongo.MongoClient(str(config.MONGO_URI))

# Databases named by config.DB_STAGING / config.DB_ANALYSE.
stagingDb = myclient[str(config.DB_STAGING)]
analyseDb = myclient[str(config.DB_ANALYSE)]

# Collections inside the staging database.
stagingCol = stagingDb[config.COLL_PLAATJES]
stagingOud = stagingDb[config.COLL_STAGING_OUD]
stagingNieuw = stagingDb[config.COLL_STAGING_NIEUW]

# Collections inside the analysis database (raw and cleaned).
analyseCol = analyseDb[config.COLL_ANALYSE]
analyseColClean = analyseDb[config.COLL_ANALYSE_CLEAN]

# Column metadata lives in a fixed-name staging collection.
metaCollection = stagingDb['Kolominformatie']

# --- Reference list of record kinds ------------------------------------------
# Every kind listed here gets a row in the overview, even when no stage
# contains records of that kind.
lst_soorten_meta = [
    'Put', 'Vlak', 'Spoor', 'Stelling', 'Aardewerk', 'Artefact',
    'Standplaats', 'Project', 'Vindplaats', 'Vondst', 'Plaatsing', 'Doos',
]
df_soorten_meta = pd.DataFrame({'Soort': lst_soorten_meta})

Unnamed: 0,Soort
0,Put
1,Vlak
2,Spoor
3,Stelling
4,Aardewerk
5,Artefact
6,Standplaats
7,Project
8,Vindplaats
9,Vondst


In [3]:
def AggregateData(df, fase):
    """Summarise per-project record counts into one row per kind (`soort`).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'project', 'soort' and 'teller'. 'teller' may hold
        numbers or numeric strings. The frame is NOT modified.
    fase : str
        Label written to the 'Stage' column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Soort', 'Projecten' (list of the projects seen for that
        kind), 'Aantal_Records' (sum of 'teller'), 'Aantal_projecten'
        (length of 'Projecten') and 'Stage', sorted by 'Aantal_Records'
        descending with a fresh 0..n-1 index. An empty input yields an empty
        frame with these columns.

    Raises
    ------
    ValueError
        If 'teller' contains values that cannot be parsed as numbers.
    """
    out_columns = ['Soort', 'Projecten', 'Aantal_Records', 'Aantal_projecten', 'Stage']

    # Nothing to aggregate (e.g. an aggregation cursor that returned no
    # documents): hand back an empty, correctly shaped frame instead of
    # failing on the missing 'teller' column.
    if df.empty:
        return pd.DataFrame(columns=out_columns)

    # `assign` returns a new frame, so the caller's 'teller' column keeps its
    # original dtype.
    counts = df.assign(teller=pd.to_numeric(df['teller']))

    per_kind = counts.groupby('soort').agg({'project': list, 'teller': 'sum'})
    per_kind['Count'] = per_kind['project'].map(len)
    per_kind['Stage'] = fase

    per_kind = per_kind.sort_values('teller', ascending=False).reset_index()
    return per_kind.rename(columns={
        'soort': 'Soort',
        'project': 'Projecten',
        'teller': 'Aantal_Records',
        'Count': 'Aantal_projecten',
    })

In [4]:
# Aggregation pipeline shared by the MongoDB stages:
#   1. count the documents per (brondata.project, soort) pair;
#   2. flatten the compound _id into top-level project / soort / teller fields.
grp_aggr = [
    {'$group': {
        '_id': {'project': "$brondata.project", 'soort': "$soort"},
        'teller': {"$sum": 1},
    }},
    {'$replaceRoot': {
        'newRoot': {'project': "$_id.project", 'soort': "$_id.soort", 'teller': "$teller"},
    }},
]

df_singlestore = AggregateData(
    pd.DataFrame(list(analyseCol.aggregate(grp_aggr))),
    'SingleStore',
)
df_singlestore.head(5)

Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Standplaats,[MAGAZIJN],16814,1,SingleStore
1,Vondst,"[DB34, DC179, DC97, DC11, DC154, DC112, DC93]",8531,7,SingleStore
2,Aardewerk,"[DB34, DC112, DC36, DC72]",4670,4,SingleStore
3,Foto,[nan],2810,1,SingleStore
4,Spoor,"[DC154, DB34, DC179, DC97, DC112, DC93]",2139,6,SingleStore


In [5]:
# Reuses the `grp_aggr` pipeline from the SingleStore cell so that both stages
# are counted identically. (A disabled variant that grouped on
# "$brondata.table" instead of "$soort" used to be kept here as a comment.)
df_singlestoreclean = AggregateData(
    pd.DataFrame(list(analyseColClean.aggregate(grp_aggr))),
    'SingleStoreClean',
)
df_singlestoreclean.head(5)

Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Standplaats,[MAGAZIJN],16814,1,SingleStoreClean
1,Vondst,"[DC97, DB34, DC112, DC93, DC179, DC11, DC154]",8531,7,SingleStoreClean
2,Foto,[nan],2810,1,SingleStoreClean
3,Spoor,"[DC179, DC154, DC97, DB34, DC112, DC93]",2139,6,SingleStoreClean
4,Project,[DELF-IT],840,1,SingleStoreClean


In [6]:
# SQL tables whose `brondata` column is read to count records per project.
lst_tables = ['Def_Project', 'Def_Vondst', 'Def_Stelling', 'Def_Plaatsing', 'Def_Vindplaats', 'Def_Artefact']
# Captures the quoted value that follows 'project': in a textual brondata payload.
regexProject = re.compile(r'\'project\': \'(.*?)\'')


def getProject(brondata):
    """Extract the project code from a textual `brondata` payload.

    Parameters
    ----------
    brondata : str or None
        Text expected to contain a fragment of the form
        ``'project': '<code>'``.

    Returns
    -------
    str
        The captured project code, or "" when `brondata` is None, empty, or
        contains no such fragment.
    """
    # A missing payload carries no project; it must not reach the regex,
    # which only accepts strings.
    if brondata is None or brondata == "":
        return ""

    match = regexProject.search(brondata)
    # `search` returns None when the payload has no project key.
    return match.group(1) if match else ""

# Per-table record counts, one small frame per table. They are concatenated
# once after the loop instead of growing a DataFrame on every iteration.
lst_table_counts = []

engine = create_engine(config.SQLALCHEMY_DATABASE_URI)
with engine.connect() as connection:
    for table in lst_tables:
        # Table names come from the fixed `lst_tables` list above, never from
        # user input, so building the statement by concatenation is safe here.
        df = pd.read_sql_query('SELECT brondata from "' + table + '"', connection)

        df_out = pd.DataFrame({
            'project': df['brondata'].map(getProject),
            # Drop the first four characters ('Def_' for every listed table)
            # so the kind matches the names used by the MongoDB stages.
            'soort': table[4:],
        })
        lst_table_counts.append(
            df_out.groupby(['project', 'soort']).size().reset_index(name='teller')
        )

df_tables_projects = pd.concat(lst_table_counts)

df_doelsysteem = AggregateData(df_tables_projects, 'Doelsysteem')
df_doelsysteem.head(5)


Unnamed: 0,Soort,Projecten,Aantal_Records,Aantal_projecten,Stage
0,Vondst,"[DB34, DC11, DC112, DC154, DC179, DC93, DC97]",8531,7,Doelsysteem
1,Project,[DELF-IT],840,1,Doelsysteem
2,Artefact,"[DC154, DC179]",209,2,Doelsysteem
3,Vindplaats,[DELF-IT],74,1,Doelsysteem
4,Stelling,[MAGAZIJN],44,1,Doelsysteem


## Analyse van alle gegevens

Hieronder volgt een overzicht van alle gegevens die in de drie fases (SingleStore, SingleStoreClean en Doelsysteem) beschikbaar zijn. Zo kunnen fouten in de conversie worden opgespoord.

In [13]:


def getOverview(OnWhat, vmax=6000):
    """Compare one metric per kind across the three conversion stages.

    Reads the module-level frames `df_singlestore`, `df_singlestoreclean`,
    `df_doelsysteem` and `df_soorten_meta`.

    Parameters
    ----------
    OnWhat : str
        Name of the numeric column to compare, e.g. 'Aantal_Records' or
        'Aantal_projecten'. It must exist in all three stage frames.
    vmax : int or float, default 6000
        Value at which the in-cell bars reach full width; passed on to
        `Styler.bar`.

    Returns
    -------
    pandas.io.formats.style.Styler
        One row per kind with the columns 'Soort', 'Projecten' (taken from
        the SingleStore stage) and '<OnWhat>_singlestore',
        '<OnWhat>_singlestoreclean', '<OnWhat>_doelsysteem'. Kinds missing
        from a stage show 0. Rows are ordered by the SingleStore value,
        descending.
    """
    stage_frames = {
        'singlestore': df_singlestore[['Soort', 'Projecten', OnWhat]],
        'singlestoreclean': df_singlestoreclean[['Soort', OnWhat]],
        'doelsysteem': df_doelsysteem[['Soort', OnWhat]],
    }
    lst_countColumns = [OnWhat + '_' + stage for stage in stage_frames]

    # Give each stage's metric its final column name *before* merging. Merge
    # suffixes are only applied to clashing names, so relying on them left the
    # last stage unsuffixed and needed a rename that worked for one metric only.
    overview = None
    for stage, frame in stage_frames.items():
        renamed = frame.rename(columns={OnWhat: OnWhat + '_' + stage})
        overview = renamed if overview is None else overview.merge(renamed, on='Soort', how='outer')

    # Add the kinds that are known but absent from every stage.
    overview = overview.merge(df_soorten_meta, on='Soort', how='outer')

    overview = (
        overview
        .fillna({col: 0 for col in lst_countColumns})
        .astype({col: int for col in lst_countColumns})
        # Sort last: the row order produced by an outer merge differs between
        # pandas versions, so sorting before the merges is not reliable.
        # mergesort is stable, which keeps tied rows in a deterministic order.
        .sort_values(OnWhat + '_singlestore', ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )

    return overview.style.bar(subset=lst_countColumns, color='#5fba7d', vmax=vmax)

getOverview('Aantal_Records')

Unnamed: 0,Soort,Projecten,Aantal_Records_singlestore,Aantal_Records_singlestoreclean,Aantal_Records_doelsysteem
0,Standplaats,['MAGAZIJN'],16814,16814,0
1,Vondst,"['DB34', 'DC179', 'DC97', 'DC11', 'DC154', 'DC112', 'DC93']",8531,8531,8531
2,Aardewerk,"['DB34', 'DC112', 'DC36', 'DC72']",4670,0,0
3,Foto,[nan],2810,2810,0
4,Spoor,"['DC154', 'DB34', 'DC179', 'DC97', 'DC112', 'DC93']",2139,2139,0
5,Project,['DELF-IT'],840,840,840
6,Put,"['DC154', 'DC179']",682,682,0
7,Artefact,"['DC179', 'DC154']",209,209,209
8,Vindplaats,['DELF-IT'],74,74,74
9,Vlak,"['DC179', 'DC154']",57,57,0


## Detailanalyse SingleStore naar Doelsysteem

In [8]:
# Outer-join both stages on the kind *and* on both counts: a kind whose record
# or project count differs between the stages (or that exists in one stage
# only) fails to pair up and is left with an empty Stage column on one side.
df = df_singlestore.merge(
    df_doelsysteem,
    how='outer',
    on=['Soort', 'Aantal_Records', 'Aantal_projecten'],
    suffixes=("_singlestore", "_doelsysteem"),
)
df.loc[df['Stage_singlestore'].isna() | df['Stage_doelsysteem'].isna()]

Unnamed: 0,Soort,Projecten_singlestore,Aantal_Records,Aantal_projecten,Stage_singlestore,Projecten_doelsysteem,Stage_doelsysteem
0,Standplaats,[MAGAZIJN],16814,1,SingleStore,,
2,Aardewerk,"[DB34, DC112, DC36, DC72]",4670,4,SingleStore,,
3,Foto,[nan],2810,1,SingleStore,,
4,Spoor,"[DC154, DB34, DC179, DC97, DC112, DC93]",2139,6,SingleStore,,
6,Put,"[DC154, DC179]",682,2,SingleStore,,
9,Vlak,"[DC179, DC154]",57,2,SingleStore,,
