In [None]:
import os
import pandas as pd
import seaborn as sns


# When the kernel is started from inside the `notebooks` folder, step up one
# level so that relative paths used below (e.g. "data/hidden/...") are
# resolved from the parent directory instead.
_cwd = os.getcwd()
if os.path.basename(_cwd) == 'notebooks':
    os.chdir(os.path.dirname(_cwd))

from coral_models.Geography_Helper import Geography_Helper

In [None]:
# Apply seaborn's styling and make every figure 12 x 8 inches by default.
sns.set(
    rc={'figure.figsize': (12, 8)},
)

In [None]:
# Location of the speaker metadata spreadsheet, relative to the working
# directory selected in the first cell.
dir_speaker_data = "data/hidden/speakers.xlsx"

# One row per speaker; the columns used further down are `gender`, `dialect`,
# `age` and `zip_grew_up`.
df_speakers = pd.read_excel(io=dir_speaker_data)

In [None]:
# --- Gender -----------------------------------------------------------------
# Trim and capitalise BEFORE expanding the abbreviation, so that variants such
# as "f" or " F" are first normalised to "F" and then mapped to "Female".
# (Expanding first would leave a lower-case "f" as "F".)
df_speakers['gender'] = (
    df_speakers['gender']
    .str.strip()
    .str.capitalize()
    .replace('F', "Female")
)

# --- Dialect ----------------------------------------------------------------
# Strip BEFORE capitalising: `str.capitalize()` upper-cases only the very first
# character and lower-cases the rest, so a value with leading whitespace would
# otherwise end up entirely lower-case and miss the corrections below.
#
# The mapping fixes known typos and maps alternative labels onto one canonical
# spelling. Keys are whole cell values (exact match, not substrings) written in
# the capitalised form produced by the normalisation above.
dialect_corrections = {
    'Djurslandsk (nord-, syddjurs m. nord- og sydsamsø, anholt)': "Djurslandsk",
    'Sønderjysl': "Sønderjysk",
    'Østligt sønderjysk': "Østligt sønderjysk (m. als)",
    'Øststjysk': "Østjysk",
    'Vestlig sønderjysk': "Vestlig sønderjysk (m. mandø og rømø)",
    'Sydsjællandsk': "Sydsjællandsk (sydligt sydsjællandsk)",
}
df_speakers['dialect'] = (
    df_speakers['dialect']
    .str.strip()
    .str.capitalize()
    .replace(dialect_corrections)
)



In [None]:
# Tally of speakers per gender (kept for interactive inspection).
counts = df_speakers['gender'].value_counts()

# Bar chart of the same tally; annotate the bars of the first container with
# their heights.
ax = sns.countplot(data=df_speakers, x="gender")
ax.bar_label(ax.containers[0])

In [None]:
# Tally of speakers per dialect, ordered alphabetically by label (kept for
# interactive inspection).
counts = df_speakers['dialect'].value_counts().sort_index()

# Alphabetical list of the distinct dialect labels, used as the bar order.
# Missing values are dropped first: a NaN among the labels would make the sort
# raise TypeError (float compared with str).
dialects = sorted(df_speakers['dialect'].dropna().unique())

ax = sns.countplot(data=df_speakers, x="dialect", order=dialects)
ax.bar_label(ax.containers[0])
# Dialect names are long; rotate the tick labels so they stay readable.
ax.tick_params(axis='x', rotation=90)

In [None]:
# Show the first 20 dialect labels (in sorted order) as a quick sanity check
# of the cleaning step.
dialects[:20]

In [None]:
# Distribution of speaker ages.
sns.displot(data=df_speakers, x="age")

In [None]:
# Project helper used by the cells below to look up municipality / region
# values (`getMunicipality`, `getRegion`) and to fetch map frames (`get_dfmap`).
geo_helper = Geography_Helper()


In [None]:
# Derive two extra columns for where each speaker grew up:
# `zip_grew_up` -> `kommunekod` -> `regionskod`, looked up element by element
# through the geography helper.
zip_grew_up = df_speakers['zip_grew_up']
df_speakers['kommunekod'] = zip_grew_up.apply(geo_helper.getMunicipality)

kommunekod = df_speakers['kommunekod']
df_speakers['regionskod'] = kommunekod.apply(geo_helper.getRegion)


In [None]:
# Number of speakers per ZIP code, with the key column renamed to match the
# `postnummer` column of the map frame.
df_count_zip = (
    df_speakers
    .groupby(['zip_grew_up'])
    .size()
    .reset_index(name='count')
    .rename(columns={"zip_grew_up": "postnummer"})
)

# Left-join the counts onto the ZIP-code map so that every area of the map is
# kept; areas without any speaker get a count of 0 instead of NaN.
# NOTE(review): assumes `postnummer` has the same dtype in both frames - confirm.
dfmap_zip = pd.merge(
    geo_helper.get_dfmap('zipcode'),
    df_count_zip,
    how="left",
    on='postnummer',
)
dfmap_zip['count'] = dfmap_zip['count'].fillna(value=0.)

# Map coloured by speaker count.
dfmap_zip.plot('count', cmap='viridis', legend=True)

In [None]:
# Number of speakers per municipality code.
df_count_mun = (
    df_speakers
    .groupby(['kommunekod'])
    .size()
    .reset_index(name='count')
)

# Left-join the counts onto the municipality map so that every municipality is
# kept; those without any speaker get a count of 0 instead of NaN.
dfmap_mun = pd.merge(
    geo_helper.get_dfmap('municipality'),
    df_count_mun,
    how="left",
    on='kommunekod',
)
dfmap_mun['count'] = dfmap_mun['count'].fillna(value=0.)

# Map coloured by speaker count.
dfmap_mun.plot('count', cmap='viridis', legend=True)

In [None]:
# Interactive version of the municipality map: coloured by speaker count, with
# name, code and count shown on hover and a popup on click.
dfmap_mun.explore(column='count', tooltip=['navn', 'kommunekod', 'count'], popup=True)

In [None]:
# Number of speakers per region code.
df_count_reg = (
    df_speakers
    .groupby(['regionskod'])
    .size()
    .reset_index(name='count')
)

# Left-join the counts onto the region map so that every region is kept;
# regions without any speaker get a count of 0 instead of NaN.
dfmap_reg = pd.merge(
    geo_helper.get_dfmap('region'),
    df_count_reg,
    how="left",
    on='regionskod',
)
dfmap_reg['count'] = dfmap_reg['count'].fillna(value=0.)

# Map coloured by speaker count.
dfmap_reg.plot('count', cmap='viridis', legend=True)