In [None]:
import os
import sqlite3
from collections import Counter

import pandas as pd
import seaborn as sns

# When the kernel starts inside a folder named "notebooks", move the working
# directory up one level (presumably so the `coral` imports below resolve from
# the project root — confirm).
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir(os.path.dirname(os.getcwd()))

from coral.Dialect_Helper import Dialect_Helper
from coral.Geography_Helper import Geography_Helper

# Shared helper; later cells call its convert_to_depth(label, depth) method.
dialect_helper = Dialect_Helper()

## Load Data

In [None]:
path_sqlite = '/Volumes/CoRal/_new_structure/raw/CoRal_public.db'

# Read the complete Speakers table into a dataframe.
query = "SELECT * FROM Speakers"

# try/finally guarantees the connection is released even when the query
# fails (e.g. missing table); previously a failure left it open.
conn = sqlite3.connect(path_sqlite)
try:
    df_speakers = pd.read_sql_query(query, conn)
finally:
    conn.close()


In [None]:
# Add columns dialect_lvl1 .. dialect_lvl4, each holding the result of
# convert_to_depth() applied to the raw `dialect` value at that depth.
for depth in range(1, 5):
    df_speakers[f'dialect_lvl{depth}'] = df_speakers['dialect'].apply(
        # bind `depth` per iteration so the callable never sees a later value
        lambda label, depth=depth: dialect_helper.convert_to_depth(label, depth)
    )

In [None]:
# Display the speakers dataframe, now including the dialect_lvl1-4 columns
df_speakers

## Compute distribution plots

In [None]:
# Tabulated speakers per gender (kept in `counts` for inspection).
counts = df_speakers["gender"].value_counts()

# Bar chart of the same tally, with the count written on top of each bar.
ax = sns.countplot(data=df_speakers, x="gender")
bars = ax.containers[0]
ax.bar_label(bars)

In [None]:
level_col = "dialect_lvl1"

# Speakers per dialect at this level, in label order.
counts = df_speakers[level_col].value_counts().sort_index()
# Sorted distinct labels; used to fix the bar order on the x axis.
dialects = sorted(pd.unique(df_speakers[level_col]))

ax = sns.countplot(data=df_speakers, x=level_col, order=dialects)
ax.bar_label(ax.containers[0])
ax.tick_params(axis="x", rotation=90)

In [None]:
level_col = "dialect_lvl2"

# Speakers per dialect at this level, in label order.
counts = df_speakers[level_col].value_counts().sort_index()
# Sorted distinct labels; used to fix the bar order on the x axis.
dialects = sorted(pd.unique(df_speakers[level_col]))

ax = sns.countplot(data=df_speakers, x=level_col, order=dialects)
ax.bar_label(ax.containers[0])
ax.tick_params(axis="x", rotation=90)

In [None]:
level_col = "dialect_lvl3"

# Speakers per dialect at this level, in label order.
counts = df_speakers[level_col].value_counts().sort_index()
# Sorted distinct labels; used to fix the bar order on the x axis.
dialects = sorted(pd.unique(df_speakers[level_col]))

ax = sns.countplot(data=df_speakers, x=level_col, order=dialects)
ax.bar_label(ax.containers[0])
ax.tick_params(axis="x", rotation=90)

In [None]:
level_col = "dialect_lvl4"

# Speakers per dialect at this level, in label order.
counts = df_speakers[level_col].value_counts().sort_index()
# Sorted distinct labels; used to fix the bar order on the x axis.
dialects = sorted(pd.unique(df_speakers[level_col]))

ax = sns.countplot(data=df_speakers, x=level_col, order=dialects)
ax.bar_label(ax.containers[0])
ax.tick_params(axis="x", rotation=90)

In [None]:
# Distribution plot of the speakers' `age` column
sns.displot(df_speakers, x="age")

In [None]:
# Exact number of speakers for each distinct value in `age`
Counter(df_speakers["age"])

In [None]:
# Show the speakers whose recorded age is exactly 3
# NOTE(review): presumably an outlier / data-entry check — confirm
df_speakers[df_speakers["age"] == 3]

In [None]:
# `df_recordings` is otherwise first created by the Recordings-loading cell
# further down, so on a fresh kernel (Restart & Run All) this lookup raised
# NameError. Load the table here so the notebook runs top to bottom.
conn = sqlite3.connect(path_sqlite)
try:
    df_recordings = pd.read_sql_query("SELECT * FROM Recordings", conn)
finally:
    # release the connection even if the query fails
    conn.close()

# All recordings belonging to one specific speaker id
df_recordings[df_recordings["id_speaker"] == "spe_518ed29525738cebdac49c49e60ea9d3"]

In [None]:
# Helper used by the following cells for zip -> municipality -> region
# lookups (getMunicipality / getRegion) and for the map frames (get_dfmap).
geo_helper = Geography_Helper()

In [None]:
# Municipality code looked up from each speaker's school zip code.
df_speakers["kommunekod"] = df_speakers["zip_school"].apply(geo_helper.getMunicipality)

# Region code looked up from the municipality code derived above.
df_speakers["regionskod"] = df_speakers["kommunekod"].apply(geo_helper.getRegion)

In [None]:
# Speakers per school zip code, with the key renamed to the column the map
# frame is merged on ("postnummer").
df_count_zip = (
    df_speakers.groupby(["zip_school"])
    .size()
    .reset_index(name="count")
    .rename(columns={"zip_school": "postnummer"})
)

# Left-join the counts onto the zip-code map; areas without speakers get 0.
dfmap_zip = pd.merge(
    geo_helper.get_dfmap("zipcode"),
    df_count_zip,
    how="left",
    on="postnummer",
)
dfmap_zip["count"] = dfmap_zip["count"].fillna(0.0)
dfmap_zip.plot("count", cmap="viridis", legend=True)

In [None]:
# Explore the per-zip counts; tooltips show name, zip code and count.
# NOTE(review): assumes get_dfmap returns a GeoDataFrame (explore() is a geopandas method) — confirm
dfmap_zip.explore(column="count", tooltip=["navn", "postnummer", "count"], popup=True)

In [None]:
# Speakers per municipality code.
df_count_mun = (
    df_speakers.groupby(["kommunekod"])
    .size()
    .reset_index(name="count")
)

# Left-join the counts onto the municipality map; municipalities without
# speakers get 0.
dfmap_mun = pd.merge(
    geo_helper.get_dfmap("municipality"),
    df_count_mun,
    how="left",
    on="kommunekod",
)
dfmap_mun["count"] = dfmap_mun["count"].fillna(0.0)
dfmap_mun.plot("count", cmap="viridis", legend=True)

In [None]:
# Explore the per-municipality counts; tooltips show name, municipality code and count.
# NOTE(review): assumes get_dfmap returns a GeoDataFrame (explore() is a geopandas method) — confirm
dfmap_mun.explore(column="count", tooltip=["navn", "kommunekod", "count"], popup=True)

In [None]:
# Speakers per region code.
df_count_reg = (
    df_speakers.groupby(["regionskod"])
    .size()
    .reset_index(name="count")
)

# Left-join the counts onto the region map; regions without speakers get 0.
dfmap_reg = pd.merge(
    geo_helper.get_dfmap("region"),
    df_count_reg,
    how="left",
    on="regionskod",
)
dfmap_reg["count"] = dfmap_reg["count"].fillna(0.0)
dfmap_reg.plot("count", cmap="viridis", legend=True)

In [None]:
path_sqlite = "/Volumes/CoRal/_new_structure/raw/CoRal_public.db"

# Read the complete Recordings table into a dataframe.
query = "SELECT * FROM Recordings"

# try/finally guarantees the connection is released even when the query
# fails (e.g. missing table); previously a failure left it open.
conn = sqlite3.connect(path_sqlite)
try:
    df_recordings = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [None]:
# Display the recordings dataframe
df_recordings

In [None]:
# Keep only the part of `datetime_start` before the first space (the date,
# assuming "<date> <time>" strings — confirm against the Recordings schema).
# The vectorised `.str` accessor replaces the per-row lambda and propagates
# missing values as NaN instead of raising AttributeError on them.
df_recordings["date"] = df_recordings["datetime_start"].str.split(" ", n=1).str[0]

# Number of recordings per date
Counter(df_recordings["date"])

In [None]:
# Distinct values of the `date` column, sorted in place, followed by how many
# there are (i.e. the number of different recording dates).
unique_values = df_recordings["date"].unique()
unique_values.sort()
len(unique_values)

In [None]:
path_sqlite = "/Volumes/CoRal/_new_structure/raw/CoRal_public.db"

# Read the complete conversations table into a dataframe.
query = "SELECT * FROM conversations"

# try/finally guarantees the connection is released even when the query
# fails (e.g. missing table); previously a failure left it open.
conn = sqlite3.connect(path_sqlite)
try:
    df_conversations = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [None]:
# Display the conversations dataframe
df_conversations