In [1]:
import config
import pymongo
import numpy as np
import pandas as pd
import json
import re
import copy
import ast

from ipywidgets import interact, Dropdown

# Show up to 200 rows when a DataFrame is rendered, so the longer overviews
# further down are not truncated.
pd.options.display.max_rows = 200

In [2]:
# One client for the whole notebook; the URI and all database/collection
# names are taken from the local `config` module.
myclient = pymongo.MongoClient(str(config.MONGO_URI))

# Databases.
stagingDb = myclient.get_database(str(config.DB_STAGING))
analyseDb = myclient.get_database(str(config.DB_ANALYSE))

# Collections in the staging database.
stagingCol = stagingDb.get_collection(config.COLL_PLAATJES)
stagingOud = stagingDb.get_collection(config.COLL_STAGING_OUD)
stagingNieuw = stagingDb.get_collection(config.COLL_STAGING_NIEUW)
metaCollection = stagingDb.get_collection('Kolominformatie')

# Collections in the analysis database.
analyseCol = analyseDb.get_collection(config.COLL_ANALYSE)
analyseColClean = analyseDb.get_collection(config.COLL_ANALYSE_CLEAN)

# Excel workbook read in the next cell (path is relative to the notebook).
AIRFLOW_WASSTRAAT_CONFIG = "./wasstraat_config/Wasstraat_Config_Harmonize.xlsx"

In [3]:
# sheet_name=None loads every sheet of the workbook and returns them as a
# dict of DataFrames keyed by sheet name.
xl = pd.read_excel(AIRFLOW_WASSTRAAT_CONFIG, sheet_name=None)

# The 'Objecten' sheet holds one row per object; preview it.
df_table = xl['Objecten']
df_table.head(30)

Unnamed: 0,Object,Tabellen,Overerven,Samenvoegen,ABR-code
0,Ignore,"["".*backup.*"", "".*kopie.*""]",,,
1,Tekening,"[""^TEKENING.*""]",,,
2,Vondst,"[""^VONDSTENLIJST"", ""^VONDST$""]",,,
3,Spoor,"[""^SPOREN$"", ""^SPOOR$""]",,,
4,Vulling,"[""^VULLING.*"", ""MUUR""]",,,
5,Dia,"[""DIA.*""]",,,
6,Foto,"[""FOTO.*""]",,,
7,Put,"[""^PUT$"", ""PUTTEN""]",,,
8,Artefact,"[""ARTEFACT.*""]",,,
9,Hout,"[""HOUT"", ""ARTF_OPH""]",Artefact,,


In [4]:
# Every sheet except the first one is treated as an object sheet whose name
# is the object name (the first sheet is skipped -- presumably 'Objecten';
# this relies on the sheet order in the workbook).
objecten = list(xl.keys())
objecten.pop(0)

# Collect the (Attribute, Kolommen) pairs of each object sheet, tagged with
# the object name. .assign() returns a new frame, so nothing is written into
# a slice of the sheet (no SettingWithCopyWarning), and the frames are
# concatenated once instead of growing a DataFrame inside the loop.
attr_frames = [
    xl[obj][['Attribute', 'Kolommen']].assign(Object=obj)
    for obj in objecten
]
if attr_frames:
    df_attr = pd.concat(attr_frames)
else:
    # Workbook without object sheets: keep the expected columns.
    df_attr = pd.DataFrame(columns=['Attribute', 'Kolommen', 'Object'])

# 'Kolommen' holds a Python list literal as text; parse it and give every
# listed source column its own row.
df_attr['Kolommen'] = df_attr['Kolommen'].map(ast.literal_eval)
df_attr = df_attr.explode('Kolommen').reset_index(drop=True)

# Add Inherited attributes: an object with a value in 'Overerven' also gets
# all attribute rows of the object named there, relabelled with its own name.
inherited_frames = [
    df_attr[df_attr.Object == row['Overerven']].assign(Object=row['Object'])
    for _, row in df_table[df_table.Overerven.notnull()].iterrows()
]
df_attr = pd.concat([df_attr, *inherited_frames])

df_attr.head(5)

Unnamed: 0,Attribute,Kolommen,Object
0,putnr,PUT,Put
1,putnr,PUTNO,Put
2,beschrijving,BESCHRYF,Put
3,aangelegd,AANGELEGD,Put
4,datum_ingevoerd,INGEVOERD,Put


## Count how many times a column is found in the data

In [5]:
# Expression that rebuilds `brondata` with only the key/value pairs whose
# value is not NaN, not null and not an empty string.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
mapper = {"$arrayToObject": {"$filter": {
    "input": {"$objectToArray": "$brondata"},
    "as": "item",
    "cond": {"$and": [{"$ne": ["$$item.v", np.nan]},
                      {"$ne": ["$$item.v", None]},
                      {"$ne": ["$$item.v", ""]}]}}}}

count_pipeline = [
    # Only documents that have both `brondata` and `soort`.
    {"$match": {"brondata": {"$exists": True}, "soort": {"$exists": True}}},
    # One flat document per record: `soort` (taken from `artefactsoort` when
    # that is set) merged with the filled brondata fields.
    {"$replaceRoot": {"newRoot": {"$mergeObjects": [
        {"soort": {"$ifNull": ["$artefactsoort", "$soort"]}},
        mapper,
    ]}}},
]
df_count = pd.DataFrame(list(analyseCol.aggregate(count_pipeline)))

# Number of non-null values per column for every `soort`. count() keeps the
# flat column labels in their real order, so no labels have to be copied back
# from a MultiIndex.
df_count = df_count.groupby('soort').count().reset_index()

# Long format: one row per (soort, column). `_id` stays as an id column
# because its count is used as the number of records per soort below
# (assumes every record still carries an `_id` at this point -- TODO confirm).
df_count = df_count.melt(id_vars=['soort', '_id'], var_name='Kolommen', value_name='teller')

# Drop columns that are never filled for a soort, then label and sort.
df_count = df_count[df_count['teller'] != 0]
df_count = df_count.rename(columns={'soort': 'Object'}).sort_values(by=['Object', 'Kolommen'])

# Share of records of the object in which the column is filled, in percent.
df_count['percentage_gevuld'] = pd.to_numeric(
    100 * df_count['teller'] / df_count['_id'], downcast='integer'
).round(0)
df_count = df_count.drop(columns=['_id'])
df_count.head(5)

Unnamed: 0,Object,Kolommen,teller,percentage_gevuld
1242,Aardewerk,10a,10515,75.0
2116,Aardewerk,10b,878,6.0
3036,Aardewerk,10c,457,3.0
3059,Aardewerk,10d,516,4.0
1265,Aardewerk,11,11969,85.0


## Find all Attributes and connect them to the Objects

In [6]:
def getObject(table):
    """Return the object name whose table patterns match ``table``.

    The rows of the 'Objecten' sheet are tried in sheet order. The 'Tabellen'
    cell of a row is parsed as a Python list literal of regular expressions,
    each matched against the start of ``str(table)`` (``re.match``). The
    'Object' value of the first matching row is returned, or 'Geen' when no
    row matches.
    """
    table_name = str(table)
    for _, object_row in xl['Objecten'].iterrows():
        patterns = ast.literal_eval(object_row['Tabellen'])
        if any(re.match(pattern, table_name) for pattern in patterns):
            return object_row['Object']

    # Nothing found
    return 'Geen'

def flatten(lst):
    """Flatten a list of lists into one list of unique items.

    Items are kept in the order in which they are first seen, so the result
    is the same on every run. (Going through ``set`` gave an arbitrary order
    that differed between kernel sessions for strings.)

    Parameters: ``lst`` -- an iterable of iterables with hashable items.
    Returns: a list containing each distinct item once.
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(item for sublist in lst for item in sublist))


# Aggregation over the column metadata: skip the projects MAGAZIJN, DELF-IT
# and Digifotos and every table whose name starts with SYS, then produce one
# record per (table, column name) with the number of occurrences and the
# descriptions and projects in which it was seen.
match_stage = {"$match": {"project": {"$nin": ["MAGAZIJN", "DELF-IT", "Digifotos"]},
                          "table": {"$not": {"$regex": "^SYS.*"}}}}
group_stage = {"$group": {"_id": {"table": "$table", "name": "$name"},
                          "count": {"$sum": 1},
                          "omschrijvingen": {"$push": "$Description"},
                          "projecten": {"$push": "$project"}}}
reshape_stage = {"$replaceRoot": {"newRoot": {"table": "$_id.table",
                                              "name": "$_id.name",
                                              "count": "$count",
                                              "omschrijvingen": "$omschrijvingen",
                                              "projecten": "$projecten"}}}
grp_aggr = [match_stage, group_stage, reshape_stage]

# Get all attributes and derive the object from the table name
df = pd.DataFrame(list(metaCollection.aggregate(grp_aggr)))
df['Object'] = df['table'].map(getObject)

# Collapse to one row per (Object, column name): sum the counts and collect
# the per-table lists of descriptions and projects ...
df = (
    df.groupby(['Object', 'name'])
      .agg({'omschrijvingen': list, 'count': 'sum', 'projecten': list})
      .reset_index()
)
# ... then merge those nested lists into flat lists of unique values
df['omschrijvingen'] = df['omschrijvingen'].map(flatten)
df['projecten'] = df['projecten'].map(flatten)
df = df.rename(columns={'name': 'Kolommen'})

# Merge with the attributes configured in the Excel workbook; columns that
# are not mapped to an attribute get an empty string
df = df.merge(df_attr, on=['Object', 'Kolommen'], how='left')
df['Attribute'] = df['Attribute'].fillna("")

# Merge with the fill counts of the columns
df = df.merge(df_count, on=['Object', 'Kolommen'], how='left')
df['teller'] = pd.to_numeric(df['teller'], downcast='integer')

df.loc[df['Object'] == 'Vulling'].head(5)

Unnamed: 0,Object,Kolommen,omschrijvingen,count,projecten,Attribute,teller,percentage_gevuld
1096,Vulling,(none),[],15,"[DB34, DC154, DC179, DC97, DC008, DC93, DC112]",,,
1097,Vulling,10LGMT,[10 lagen maat],5,"[DC008, DC179]",,,
1098,Vulling,BB1,[formaat steen],5,"[DC008, DC179]",breedte_baksteen1,,
1099,Vulling,BB2,[formaat steen],5,"[DC008, DC179]",breedte_baksteen2,,
1100,Vulling,BB3,[formaat steen],5,"[DC008, DC179]",breedte_baksteen3,,


In [7]:
# Dropdown with every object name found in the column overview
lst = df['Object'].unique()
Objecten_lst = Dropdown(options=sorted(lst))

@interact(obj=Objecten_lst)
def print_city(obj):
    """Show the column overview of the selected object.

    Rows whose column occurs in project DC179 or DC154 are printed in green.
    """
    highlighted_projects = {'DC179', 'DC154'}

    def row_style(row):
        # One CSS string per cell; the same style applies to the whole row.
        css = "color: green" if highlighted_projects & set(row.projecten) else ""
        return [css] * len(row)

    display(df[df.Object == obj].style.apply(row_style, axis=1))


interactive(children=(Dropdown(description='obj', options=('Aardewerk', 'Artefact', 'Bot', 'Dia', 'Geen', 'Gla…