Analysis of the Baseline Training Data
===============

# Import and Set Up

In [None]:
# Imports 
import pandas as pd
from src.settings import Settings
import matplotlib.pyplot as plt
import uuid
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from src.utils.data import *

In [None]:
# Settings
# Read the project settings from the dev env file and load the baseline
# training data from the path they provide.
settings = Settings(_env_file="paths/.env.dev")
df = pd.read_csv(settings.BASELINE_MDK_TRAINING_DATA)

# Tag every row with a random UUID; the distribution cells below pass this
# column as their `counter`.
df["uuid"] = [uuid.uuid4() for _ in range(len(df))]

# Plot style used by all figures in this notebook.
plt.style.use('fivethirtyeight')

# Descriptive Analysis

In [None]:
# Length
# Report the number of rows in the training data.
row_count = len(df)
print(f"Training Data Length: {row_count}")

In [None]:
# Duplicates
# The random per-row "uuid" column makes every row distinct, so it has to be
# left out of the comparison; otherwise this check can never report a duplicate.
# errors="ignore" keeps the cell working if the column has not been added yet.
df_without_uuid = df.drop(columns=["uuid"], errors="ignore")
print(f"Duplicates: {df_without_uuid.duplicated().any()}")

# check for identifying column
# Prints, for each column, whether all of its values are unique.
for column in df.columns:
    print(f"{column} column unique: {df[column].is_unique}")

In [None]:
# Number of categories
# Count the distinct values per label column. dropna=False keeps a missing
# value as its own category, which matches counting the entries of unique().
themas = df["THEMA"].nunique(dropna=False)
bezeichnungen = df["BEZEICHNUNG"].nunique(dropna=False)
musterdatensaetzen = df["MUSTERDATENSATZ"].nunique(dropna=False)

print(f"The training data consists of {themas} 'Themas', {bezeichnungen} 'Bezeichnungen' and {musterdatensaetzen} 'Musterdatensätze'")

In [None]:
# Period
# Keep only the rows that carry a date and report their share of the data.
df_date_cleaned = df.dropna(subset=["Datum"])
print(f"{len(df_date_cleaned)/len(df)*100} percent of the data has a date")

# Parse the date column once and report the covered time span.
parsed_dates = pd.to_datetime(df_date_cleaned["Datum"])
print(f'Min Date: {parsed_dates.min()}')
print(f'Max Date: {parsed_dates.max()}')

In [None]:
# ORG
# Share of rows that have an ORG value.
df_ORG_cleaned = df[df['ORG'].notna()]
print(f"{len(df_ORG_cleaned)/len(df)*100} percent of the data has a ORG")

# Number of distinct ORG values. This has to be printed explicitly: a bare
# expression in the middle of a cell is evaluated but never displayed.
# nunique() does not count missing values.
print(f"Unique ORG values: {df['ORG'].nunique()}")

# Distribution
df_count_ORG = count_categories(df=df, column_name="ORG")

# Kept as the last expression so the notebook displays whatever it returns.
make_distribution_fig(df_count=df_count_ORG, variable="ORG", mode="distribution", counter="uuid",df=df)

In [None]:
# Landing page
# Share of rows that have a dcat:landingPage value. The percentage must be
# computed from the landing-page subset, not from the ORG subset.
df_dcat_landing_page = df[df['dcat:landingPage'].notna()]
print(f"{len(df_dcat_landing_page)/len(df)*100} percent of the data has a LandingPage")

landingPages = df['dcat:landingPage'].tolist()

# Rows without a landing page hold a non-string missing value, and a substring
# test on it raises TypeError, so only string entries are inspected.
nrw = [page for page in landingPages if isinstance(page, str) and "nrw" in page]
schleswig_holstein = [
    page for page in landingPages
    if isinstance(page, str) and "schleswig-holstein" in page
]

# Shares are relative to all rows, including those without a landing page.
print(f"Anteil NRW Daten: {len(nrw)/len(landingPages)}")
print(f"Anteil schleswig_holstein Daten: {len(schleswig_holstein)/len(landingPages)}")

In [None]:
# dcat:theme
# Share of rows that have a dcat:theme value.
df_dcat_themes = df.dropna(subset=["dcat:theme"])
print(f"{len(df_dcat_themes)/len(df)*100} percent of the data has a Theme")

# NOTE(review): this call passes `top=5` while the BEZEICHNUNG and
# MUSTERDATENSATZ cells pass `limit=5` — confirm against count_categories
# that `top` is the intended keyword here.
df_count_themes = count_categories(df=df, column_name="dcat:theme", top=5)
make_distribution_fig(
    df_count=df_count_themes,
    variable="dcat:theme",
    mode="distribution",
    counter="uuid",
    df=df,
)

# Distribution Analysis

In [None]:
# Distribution of the THEMA categories
# Count the rows per THEMA value, then draw the distribution figure using the
# uuid column as the counter.
df_count_thema = count_categories(df=df, column_name="THEMA")
make_distribution_fig(
    df_count=df_count_thema,
    variable="THEMA",
    mode="distribution",
    counter="uuid",
    df=df,
)

In [None]:
# Distribution of the BEZEICHNUNG categories
# Count the rows per BEZEICHNUNG value with limit=5 (presumably a minimum
# count per category — confirm against count_categories), then plot.
df_count_bezeichnung = count_categories(df=df, column_name="BEZEICHNUNG", limit=5)
make_distribution_fig(
    df_count=df_count_bezeichnung,
    variable="BEZEICHNUNG",
    mode="distribution",
    counter="uuid",
    df=df,
)

In [None]:
# Distribution of the MUSTERDATENSATZ categories
# Rows per MUSTERDATENSATZ, largest group first (counted via the uuid column).
mkd_counts = df.groupby("MUSTERDATENSATZ")["uuid"].count()
df_mkd_per_category = mkd_counts.reset_index().sort_values("uuid", ascending=False)

# limit=5 is presumably what excludes Musterdatensaetze with fewer than 5 rows
# — confirm against count_categories.
df_count_musterdatensatz = count_categories(df=df, column_name="MUSTERDATENSATZ", limit=5)

make_distribution_fig(
    df_count=df_count_musterdatensatz,
    variable="MUSTERDATENSATZ",
    mode="distribution",
    counter="uuid",
    df=df,
)