In [1]:
import config
import pymongo
import numpy as np
import pandas as pd
import json
import re
import simplejson
from math import isnan
import ast
import sys
import roman
#import attributes

import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import reflection
from sqlalchemy.types import VARCHAR

from pandas_profiling import ProfileReport
from ipywidgets import interact, Dropdown

import timeperiod2daterange
#import attributes

In [2]:
myclient = pymongo.MongoClient(config.MONGO_URI)
stagingDb = myclient[config.DB_STAGING]
analyseDb = myclient[config.DB_ANALYSE]
stagingCol = stagingDb[config.COLL_PLAATJES]
stagingOud = stagingDb[config.COLL_STAGING_OUD]
stagingNieuw = stagingDb[config.COLL_STAGING_NIEUW]
analyseCol = analyseDb[config.COLL_ANALYSE]
analyseColClean = analyseDb[config.COLL_ANALYSE_CLEAN]
metaCollection = stagingDb['Kolominformatie']


In [3]:
def fixDatering(value):
    
    try:     
        value = str(value).replace("?", "")
        value = value.replace('-', ',').replace("/", ",").replace("+", ",").replace("=", ",").replace(",,", ",-").replace(")", "").replace("(", "")
        if value[0] == ',':
            value = value.replace(',', '-', 1)
        if value[-1] == ',':
            value = value[:-1]

        eersteDate = None
        datset = set()
        datlist = value.split(",")
        for dat in datlist:
            dat = str(dat)
            if "LMEb" in dat:
                datset.add(1200)
                datset.add(1500)
                continue
            if "RT" in dat or 'romeins' in dat:
                datset.add(-1200)
                datset.add(450)
                continue
            if "XIV C" in dat:
                datset.add(1450)
                datset.add(1475)
                continue
                

            matchObj = re.match( r'([0-9]{3,4})', dat.replace(" ", ""), re.M)
            if matchObj:
                datset.add(int(matchObj.group(1)))
                eersteDate = int(matchObj.group(1)) if not eersteDate else eersteDate
                continue
            else:
                matchObj = re.match( r'^([0-9]{1,2})([a-d]+)$', dat.replace(" ", ""), re.M|re.I)
                if matchObj:
                    intdate = int(matchObj.group(1)) * 100
                    eersteDate = intdate if not eersteDate else eersteDate
                    if matchObj.group(2) is not None: 
                        kwart = str(matchObj.group(2))
                        kwart_int_first = ord(kwart.lower()[0]) - 96
                        kwart_int_last = ord(kwart.lower()[-1]) - 96
                        datset.add(intdate + 25*(kwart_int_first-1))
                        datset.add(intdate + 25*kwart_int_last)
                        continue
                    else:
                        datset.add(intdate)
                        continue


            matchObj = re.match( r'^([IVXLCMD]+)([a-dA-D]+)?$', dat.replace(" ", ""), re.M) 
            if matchObj:
                try:
                    romandate = int(roman.fromRoman(str(matchObj.group(1)))) * 100
                    eersteDate = romandate if not eersteDate else eersteDate
                    if matchObj.group(2) is not None: 
                        kwart = str(matchObj.group(2))
                        kwart_int_first = ord(kwart.lower()[0]) - 96
                        kwart_int_last = ord(kwart.lower()[-1]) - 96
                        datset.add(romandate + 25*(kwart_int_first-1))
                        datset.add(romandate + 25*kwart_int_last)
                        continue
                    else:
                        datset.add(romandate)
                        continue

                except Exception as err:
                    msg = "Fout bij omzetten romeinse waarde naar getal: <" + value + ">"  +" met melding: " + str(err)
                    print(msg)

            matchObj = re.match( r'^([a-dA-D])?$', dat.replace(" ", ""), re.M) 
            if matchObj and eersteDate:
                try:
                        kwart = str(matchObj.group(1))
                        kwart_int_first = ord(kwart.lower()[0]) - 96
                        kwart_int_last = ord(kwart.lower()[-1]) - 96
                        datset.add(eersteDate + 25*(kwart_int_first-1))
                        datset.add(eersteDate + 25*kwart_int_last)
                        continue

                except Exception as err:
                    msg = "Fout bij omzetten romeinse waarde naar getal: <" + value + ">"  +" met melding: " + str(err)
                    print(msg)

            # If all fails try PHD-date fixer
            phdfix = timeperiod2daterange.detection2daterange(dat)
            if phdfix:
                datset.add(phdfix[0] if phdfix[0] < -25 or phdfix[0] > 25 else phdfix[0] * 100)
                datset.add(phdfix[1] if phdfix[1] < -25 or phdfix[1] > 25 else phdfix[1] * 100)

                
    except Exception as err:
        msg = "Fout bij omzetten romeinse waarde naar getal: <" + value + ">"  +" met melding: " + str(err)
        print(msg)
        return None
   
    return (min(datset), max(datset)) if len(datset) > 0 else None
        

In [4]:
grp_aggr = [{"$match" : {'soort': 'Vondst', 'vondstdatering': {'$exists': True}}}]
df_vondst = pd.DataFrame(list(analyseCol.aggregate(grp_aggr)))
df_vondst['datering'] = df_vondst['vondstdatering']
df_vondst['nieuw'] = ""
df_vondst.head()

Unnamed: 0,_id,brondata,projectcd,putnr,vlaknr,omstandigheden,opmerkingen,vondstdatering,datum,vondstnr,...,vondstdatering_tot,vondstkey_met_putnr,key,key_project,key_put,key_vlak,spoornr,error,datering,nieuw
0,6287827055febdcefd2f677f,"{'_id': 6287827055febdcefd2f677f, 'ID': 1, 'CO...",DB034,10.0,1,detectorvondst,fe,1250-1500,2003-04-17,1,...,1500.0,True,PDB034P10V1,PDB034,PDB034P10,PDB034P10V1,,,1250-1500,
1,627525421de6201dca62c964,"{'_id': 627525421de6201dca62c964, 'ID': 1, 'CO...",DB034,10.0,1,detectorvondst,fe,1250-1500,2003-04-17,1,...,1500.0,True,PDB034P10V1,PDB034,PDB034P10,PDB034P10V1,,,1250-1500,
2,6287827055febdcefd2f6788,"{'_id': 6287827055febdcefd2f6788, 'ID': 10, 'C...",DB034,10.0,1,detectorvondst,fe,1250-1500,2003-04-17,10,...,1500.0,True,PDB034P10V10,PDB034,PDB034P10,PDB034P10V1,,,1250-1500,
3,627525421de6201dca62c96d,"{'_id': 627525421de6201dca62c96d, 'ID': 10, 'C...",DB034,10.0,1,detectorvondst,fe,1250-1500,2003-04-17,10,...,1500.0,True,PDB034P10V10,PDB034,PDB034P10,PDB034P10V1,,,1250-1500,
4,6287827055febdcefd2f67e2,"{'_id': 6287827055febdcefd2f67e2, 'ID': 100, '...",DB034,10.0,1,detectorvondst,fe,1250-1500,2003-04-17,100,...,1500.0,True,PDB034P10V100,PDB034,PDB034P10,PDB034P10V1,,,1250-1500,


In [5]:
grp_aggr = [{"$match" : {'soort': 'Spoor', 'spoordatering': {'$exists': True}}}]
df_spoor = pd.DataFrame(list(analyseCol.aggregate(grp_aggr)))
df_spoor['datering'] = df_spoor['spoordatering']
df_spoor['nieuw'] = ""
df_spoor.head()

Unnamed: 0,_id,brondata,projectcd,putnr,vlaknrs,spoornr,aard,spoordatering,soort,datering_origineel,...,vorm,diepte,afgewerkt,coupnrs,gecoupeerd,profiel,error,onderkant_NAP,datering,nieuw
0,62878277d80b6206c4311e77,"{'_id': 62878277d80b6206c4311e77, 'ID': 177, '...",DB034,10,2,324,"sloot, romeins",romeins,Spoor,romeins,...,,,,,,,,,romeins,
1,62752543f8f1d31eae2c2e0d,"{'_id': 62752543f8f1d31eae2c2e0d, 'ID': 177, '...",DB034,10,2,324,"sloot, romeins",romeins,Spoor,romeins,...,,,,,,,,,romeins,
2,628782c5d4a7e795ee35820b,"{'_id': 628782c5d4a7e795ee35820b, 'SPOOR': 1, ...",DC024,3,0,1,MR,-,Spoor,-,...,rechth,0.0,0.0,-,0.0,0.0,,,-,
3,627525652f894566894fd99a,"{'_id': 627525652f894566894fd99a, 'SPOOR': 1, ...",DC024,3,0,1,MR,-,Spoor,-,...,rechth,0.0,0.0,-,0.0,0.0,,,-,
4,628782c5d4a7e795ee35820c,"{'_id': 628782c5d4a7e795ee35820c, 'SPOOR': 2, ...",DC024,3,0,2,MR,-,Spoor,-,...,rechth,0.0,0.0,-,0.0,0.0,,,-,


In [6]:
grp_aggr = [{"$match" : {'soort': 'Artefact', 'artefactdatering': {'$exists': True}}}]

df_artf = pd.DataFrame(list(analyseCol.aggregate(grp_aggr)))
df_artf['datering'] = df_artf['artefactdatering']
df_artf['nieuw'] = ""
df_artf.head()

Unnamed: 0,_id,brondata,projectcd,artefactdatering,artefactdatering_tot,artefactdatering_vanaf,doosnr,exposabel,functievoorwerp,plek,...,randindex,randpercentage,subbaksel,type_rand,vorm,wanddikte,catalogus,percentage_rand,datering,nieuw
0,628782454ad995a1153e5c71,"{'_id': 628782454ad995a1153e5c71, 'ID': 1, 'CO...",DB034,1225-1300,1280.0,1225.0,1.0,0.0,Drinken,x Vlak muur (S91),...,,,,,,,,,1225-1300,
1,628782454ad995a1153e5c72,"{'_id': 628782454ad995a1153e5c72, 'ID': 2, 'CO...",DB034,1225-1300,1280.0,1225.0,1.0,0.0,Drinken,x Vlak kwadrant,...,,,,,,,,,1225-1300,
2,628782454ad995a1153e5c73,"{'_id': 628782454ad995a1153e5c73, 'ID': 3, 'CO...",DB034,1225-1300,1280.0,1225.0,1.0,0.0,Drinken,x Vlak muur (S67),...,,,,,,,,,1225-1300,
3,628782454ad995a1153e5c74,"{'_id': 628782454ad995a1153e5c74, 'ID': 4, 'CO...",DB034,1225-1300,1280.0,1225.0,1.0,0.0,Drinken,x Profiel,...,,,,,,,,,1225-1300,
4,628782454ad995a1153e5c75,"{'_id': 628782454ad995a1153e5c75, 'ID': 5, 'CO...",DB034,1225-1300,1280.0,1225.0,1.0,0.0,Drinken,x Profiel,...,,,,,,,,,1225-1300,


In [7]:
df = pd.concat([df_vondst[['datering_origineel', 'nieuw']].copy(), df_spoor[['datering_origineel', 'nieuw']].copy(), df_artf[['datering_origineel', 'nieuw']].copy()])
df['nieuw'] = df.apply(lambda x: str(fixDatering(x['datering_origineel'])), axis=1)

grp_df = df.groupby(['datering_origineel', 'nieuw']).agg({'datering_origineel': ['count']}).reset_index()
grp_df


Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waarde naar getal: <> met melding: string index out of range
Fout bij omzetten romeinse waard

Unnamed: 0_level_0,datering_origineel,nieuw,datering_origineel
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count
0,1,"(100, 100)",12
1,2,"(200, 200)",12
2,3,"(300, 300)",12
3,4,"(400, 400)",12
4,5,"(500, 500)",12
...,...,...,...
497,voor 1525,"(1525, 1525)",3
498,voor 1600,"(1600, 1600)",3
499,waarschijnlijk XIII-XIV,"(1400, 1400)",2
500,xxxx-xxxx,False,10


In [8]:
grp_df.to_excel('datering.xlsx')

In [9]:
fixDatering('15th century')

(1500, 1500)

In [10]:
timeperiod2daterange.detection2daterange('17e eeuw')

[1600, 1699]