In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import Dropdown, VBox, HBox, Button, Output
from IPython.display import display

In [None]:
# Read the training and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
# Preview the first five rows of the training data (head() defaults to n=5).
train_df.head()

In [None]:
# Work on an independent deep copy so train_df itself is never modified.
data = train_df.copy(deep=True)

In [None]:
# Keep only rows whose time period id is 70 or later, then renumber the index from 0.
data = data.loc[data['as_of_date_id'].ge(70)].reset_index(drop=True)

In [None]:
# 1. Total population over all groups, one point per time period
population_per_period = data.groupby('as_of_date_id', as_index=False)['count'].sum()

fig1 = px.line(
    population_per_period,
    x='as_of_date_id',
    y='count',
    markers=True,
    title='Sum of Population per Period of Time',
    labels={'as_of_date_id': 'Time Period', 'count': 'Total Population'},
)
fig1.show()

In [None]:
# 2. Total population per district, one line per district_id over time
population_per_district_time = data.groupby(
    ['as_of_date_id', 'district_id'], as_index=False
)['count'].sum()

fig2 = px.line(
    population_per_district_time,
    x='as_of_date_id',
    y='count',
    color='district_id',
    markers=True,
    title='Sum of Population per District over Time',
    labels={'as_of_date_id': 'Time Period', 'count': 'Total Population', 'district_id': 'District ID'},
)
fig2.show()

In [None]:
import plotly.graph_objects as go

def plot_individual_results(data, district, age_bin, gender):
    """Plot the population count over time for one demographic group.

    Args:
        data: DataFrame with ``district_id``, ``age_bin_id``, ``gender_id``,
            ``as_of_date_id`` and ``count`` columns.
        district: ``district_id`` value to select.
        age_bin: ``age_bin_id`` value to select.
        gender: ``gender_id`` value to select.

    Returns:
        A ``go.Figure`` with a single 'Training Data' line trace. The trace has
        no points when no row matches the requested group.
    """
    group_mask = (
        (data['district_id'] == district)
        & (data['age_bin_id'] == age_bin)
        & (data['gender_id'] == gender)
    )
    # A 'lines' trace connects points in row order, so order the rows by time
    # period first; otherwise unsorted input draws a zig-zag. The stable sort
    # keeps the original relative order of rows that share a period.
    train_subset = data.loc[group_mask].sort_values('as_of_date_id', kind='stable')

    fig = go.Figure()

    # Add training data
    fig.add_trace(go.Scatter(
        x=train_subset['as_of_date_id'],
        y=train_subset['count'],
        mode='lines+markers',
        name='Training Data',
        line=dict(color='blue')
    ))

    fig.update_layout(
        title=f'Population Count per Period of Time (District: {district}, Age Bin: {age_bin}, Gender: {gender})',
        xaxis_title='Time Period',
        yaxis_title='Population Count',
        legend_title='Dataset'
    )

    return fig

In [None]:
# Build one dropdown entry per (district, age bin, gender) group present in the data.
# The frame is produced in a single chain, so no column is ever assigned onto the
# result of a slice (that pattern can raise SettingWithCopyWarning).
unique_combinations = (
    data[['district_id', 'age_bin_id', 'gender_id']]
    .drop_duplicates()
    # Temporary integer key so districts are ordered numerically; dropped again below.
    .assign(district_id_int=lambda frame: frame['district_id'].astype(int))
    .sort_values(by=['district_id_int', 'age_bin_id', 'gender_id'])
    .drop(columns=['district_id_int'])
)

# Dropdown options are (label, value) pairs; the value is the tuple that
# update_plot unpacks. itertuples keeps each column's own dtype, whereas
# iterrows coerces every row to one common dtype (ints can become floats).
options = [
    (f'District: {district_id}, Age Bin: {age_bin_id}, Gender: {gender_id}',
     (district_id, age_bin_id, gender_id))
    for district_id, age_bin_id, gender_id
    in unique_combinations.itertuples(index=False, name=None)
]


# Initialize the figure widget
fig = go.FigureWidget()

def update_plot(change):
    """Redraw the shared figure widget for the group held in ``change['new']``.

    Args:
        change: Mapping whose ``'new'`` entry is a
            ``(district_id, age_bin_id, gender_id)`` tuple, as delivered by the
            dropdown's ``value`` observer.
    """
    selected_district, selected_age_bin, selected_gender = change['new']
    rebuilt = plot_individual_results(data, selected_district, selected_age_bin, selected_gender)

    # Replace the traces and the layout of the displayed widget inside one
    # batch_update() context.
    with fig.batch_update():
        fig.data = []
        for rebuilt_trace in rebuilt.data:
            fig.add_trace(rebuilt_trace)
        fig.layout = rebuilt.layout

# Dropdown listing every group; a change of its value calls update_plot
dropdown = Dropdown(description='Select Group:', options=options)
dropdown.observe(update_plot, names='value')

# Buttons for stepping backwards and forwards through the groups
back_button, next_button = Button(description='Back'), Button(description='Next')

# Function to handle 'Back' button click
def on_back_button_clicked(b):
    """Select the previous dropdown entry, if there is one.

    Args:
        b: The clicked button, passed in by ``Button.on_click``; unused.
    """
    # dropdown.index is the position of the current selection, so the list of
    # option values does not have to be rebuilt and searched on every click.
    current_index = dropdown.index
    # index is None when nothing is selected (e.g. an empty options list).
    if current_index is not None and current_index > 0:
        dropdown.index = current_index - 1

# Function to handle 'Next' button click
def on_next_button_clicked(b):
    """Select the next dropdown entry, if there is one.

    Args:
        b: The clicked button, passed in by ``Button.on_click``; unused.
    """
    # dropdown.index is the position of the current selection, so the list of
    # option values does not have to be rebuilt and searched on every click.
    current_index = dropdown.index
    # index is None when nothing is selected (e.g. an empty options list).
    if current_index is not None and current_index < len(dropdown.options) - 1:
        dropdown.index = current_index + 1

# Wire each navigation button to its click handler
back_button.on_click(on_back_button_clicked)
next_button.on_click(on_next_button_clicked)

# Lay out the controls in a row above the figure widget
controls = HBox([back_button, next_button, dropdown])
display(VBox([controls, fig]))

# Start on the first group and draw it once explicitly
initial_selection = options[0][1]
dropdown.value = initial_selection
update_plot({'new': initial_selection})

In [None]:
# 3. Sum of population per gender over time for a single district.
# The title is built from the same id that the filter uses, so the label always
# names the district that is actually plotted. Previously the filter selected
# district_id == 6 while the comment and title said "District 7"; the other
# plots in this notebook label districts by their raw district_id.
selected_district_id = 6
# Variable name kept unchanged in case later cells refer to it; it holds the
# rows for selected_district_id.
district_7_data = data[data['district_id'] == selected_district_id]
population_per_gender_time = district_7_data.groupby(['as_of_date_id', 'gender_id'])['count'].sum().reset_index()

fig2 = px.line(population_per_gender_time, x='as_of_date_id', y='count', color='gender_id', markers=True,
               title=f'Sum of Population per Gender in District {selected_district_id} over Time',
               labels={'as_of_date_id': 'Time Period', 'count': 'Total Population', 'gender_id': 'Gender ID'})

fig2.show()


In [None]:
# Total count for every (district, age bin, gender) group
aggregated_data = (
    data.groupby(['district_id', 'age_bin_id', 'gender_id'])[['count']]
    .sum()
    .reset_index()
)

# Largest groups first
sorted_aggregated_data = aggregated_data.sort_values('count', ascending=False)

# Lift the row display limit so the whole table is rendered
pd.options.display.max_rows = None
sorted_aggregated_data