In [None]:
# Language distribution by region (choropleth placeholder)

import pandas as pd
import plotly.express as px

# Load the per-track language detection results and the 2020 track metadata.
lang_df = pd.read_csv('../data/language_detected.csv')
tracks_df = pd.read_csv('../data/tracks_2020.csv')

# Attach each track's detected language to its metadata. The left join keeps
# every track; tracks with no matching row in lang_df get NaN in 'language'.
df = pd.merge(tracks_df, lang_df[['track_id', 'language']], on='track_id', how='left')

# Placeholder: assign every track to a single country so the choropleth can be
# exercised end to end. Note that this overwrites any existing 'region' column.
# TODO: replace with the actual per-track country/region field once available.
df['region'] = 'United States'

# Count tracks per (region, language) pair.
# NOTE: groupby drops NaN keys by default, so tracks without a detected
# language (NaN after the left join above) are not counted here.
lang_counts = df.groupby(['region', 'language']).size().reset_index(name='count')

# A choropleth shows a single value per location. lang_counts holds one row
# per (region, language), so a region with several languages would appear
# several times in one trace and only one of its counts would be visible.
# When that happens, give each language its own animation frame so every frame
# has at most one row per region, and pin the colour range so that colours are
# comparable across frames. When every region already has a single row (or the
# table is empty) the call is identical to a plain choropleth.
_frame_kwargs = {}
if lang_counts['region'].duplicated().any():
    _frame_kwargs = {
        'animation_frame': 'language',
        'range_color': [0, int(lang_counts['count'].max())],
    }

# Plot the per-language track counts by country (mock region data for now).
fig = px.choropleth(
    lang_counts,
    locations='region',
    locationmode='country names',
    color='count',
    hover_name='language',
    title='Language Distribution by Country (Mock)',
    color_continuous_scale='Viridis',
    **_frame_kwargs,
)

fig.show()