# Forecasting population in Vilnius districts by age group and gender

Jonas Vitkauskas
Paulina Udes
Donatas Goštautas

### Importing libraries

In [2]:
import pandas as pd
import numpy as np
from prophet import Prophet
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import Dropdown, VBox, HBox, Button, Output
from IPython.display import display
import logging
import warnings
import contextlib

### Suppressing warning and logging messages

In [None]:
# Suppress logging messages from cmdstanpy: raise the threshold on the
# 'cmdstanpy' logger and on every handler already attached to it, so only
# records at ERROR level or above get through.
logger = logging.getLogger('cmdstanpy')
logger.setLevel(logging.ERROR)
for handler in logger.handlers:
    handler.setLevel(logging.ERROR)

# Suppress SettingWithCopyWarning.
# The class is looked up defensively: a bare `pd.errors.SettingWithCopyWarning`
# raises AttributeError (aborting this whole cell) on a pandas build that does
# not ship the class.
# NOTE(review): pandas releases where Copy-on-Write is the default are expected
# to drop this warning class -- confirm against the installed version.
_setting_with_copy_warning = getattr(pd.errors, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings('ignore', category=_setting_with_copy_warning)

### Functions

In [None]:
# Context manager to suppress logging
@contextlib.contextmanager
def suppress_logging():
    """Temporarily silence all logging calls made inside the ``with`` body.

    On entry every record up to and including ``logging.CRITICAL`` is
    disabled process-wide.  On exit -- also when the body raises -- the
    disable threshold that was active *before* entry is restored, so a
    threshold set by the caller (or by an enclosing ``suppress_logging``)
    is not reset to ``logging.NOTSET`` behind its back.
    """
    # `logging.disable(level)` stores the level on the root manager; reading
    # that attribute is how the currently active threshold is recovered.
    previous_level = logging.root.manager.disable
    logging.disable(logging.CRITICAL)
    try:
        yield
    finally:
        logging.disable(previous_level)

# Function to load and preprocess data
def load_and_preprocess(train_file_path, test_file_path, fill_period=76,
                        min_period=70, start_date='2000-01-01'):
    """Load the train/test CSV files and prepare them for per-group forecasting.

    Processing steps:

    1. Read both files; cast ``as_of_date_id`` to int and the three group
       identifier columns to str.
    2. In the training data, overwrite ``count`` at ``fill_period`` with the
       mean of the same group's counts at ``fill_period - 1`` and
       ``fill_period + 1``.  Groups lacking either neighbour are left alone.
    3. Drop training rows with ``as_of_date_id`` below ``min_period``.
    4. Add a ``ds`` datetime column to both frames, computed as
       ``start_date`` plus ``as_of_date_id`` days.

    Args:
        train_file_path: Path (or file-like object) of the training CSV.
            Must contain ``as_of_date_id``, ``age_bin_id``, ``gender_id``,
            ``district_id`` and ``count``.
        test_file_path: Path (or file-like object) of the test CSV with the
            same identifier columns.
        fill_period: Period whose counts are replaced by the neighbour mean;
            ``None`` skips the step.  Defaults to 76.
        min_period: First period kept in the training data.  Defaults to 70.
        start_date: Date that period 0 is mapped to.  Defaults to
            ``'2000-01-01'``.

    Returns:
        ``(train_data, test_data)`` as DataFrames.  When the fill step has
        anything to write, ``count`` is returned as float because the mean of
        two counts can be fractional.
    """
    train_data = _read_population_csv(train_file_path)
    test_data = _read_population_csv(test_file_path)

    # NOTE(review): presumably the snapshot at `fill_period` is unreliable and
    # is therefore interpolated -- confirm the reason with the data owners.
    if fill_period is not None:
        _fill_period_with_neighbour_mean(train_data, fill_period)

    # Filter train_data to start from `min_period`
    train_data = train_data[train_data['as_of_date_id'] >= min_period].reset_index(drop=True)

    # The period ids carry no calendar information, so an arbitrary origin is
    # assumed and each period is mapped to one day after the previous one.
    origin = pd.to_datetime(start_date)
    for frame in (train_data, test_data):
        frame['ds'] = origin + pd.to_timedelta(frame['as_of_date_id'], unit='D')

    return train_data, test_data


def _read_population_csv(path):
    """Read one CSV and cast the period / group identifier columns."""
    frame = pd.read_csv(path)
    frame['as_of_date_id'] = frame['as_of_date_id'].astype(int)
    for column in ('age_bin_id', 'gender_id', 'district_id'):
        frame[column] = frame[column].astype(str)
    return frame


def _fill_period_with_neighbour_mean(frame, period):
    """Replace 'count' at *period* by the mean of the adjacent periods, in place."""
    all_periods = frame['as_of_date_id']
    has_target = (all_periods == period).any()
    has_neighbours = (all_periods == period - 1).any() and (all_periods == period + 1).any()
    if not (has_target and has_neighbours):
        return  # nothing can be filled; leave the frame (and its dtypes) untouched

    # The mean of two integer counts may be fractional; writing a float into
    # an integer column is an incompatible-dtype assignment, so widen first.
    frame['count'] = frame['count'].astype(float)

    # groupby visits only the combinations that actually occur and touches
    # each row once, instead of building a full-length mask for every element
    # of the Cartesian product of ids.
    group_columns = ['age_bin_id', 'gender_id', 'district_id']
    for _, group in frame.groupby(group_columns, sort=False):
        periods = group['as_of_date_id']
        earlier = group.loc[periods == period - 1, 'count']
        later = group.loc[periods == period + 1, 'count']
        if earlier.empty or later.empty:
            continue
        target_rows = group.index[(periods == period).to_numpy()]
        frame.loc[target_rows, 'count'] = (earlier.iloc[0] + later.iloc[0]) / 2

# Function to normalize data
def normalize_data(df, column):
    """Standardise ``df[column]`` in place and return the scaling parameters.

    The column is replaced by ``(value - mean) / std``.

    A constant column has a standard deviation of 0 and a single-row column
    has a standard deviation of NaN; dividing by either would turn every
    value into NaN/inf.  In those cases ``std`` falls back to 1.0, so the
    column is only centred and ``value * std + mean`` still recovers the
    original values.

    Args:
        df: DataFrame that is modified in place.
        column: Name of the numeric column to standardise.

    Returns:
        ``(mean, std)`` -- the values needed to undo the scaling.
    """
    mean = df[column].mean()
    std = df[column].std()
    if pd.isna(std) or std == 0:
        std = 1.0
    df[column] = (df[column] - mean) / std
    return mean, std

# Function to denormalize data
def denormalize_data(df, column, mean, std):
    """Undo a standardisation of ``df[column]`` in place.

    Each value is multiplied by ``std`` and then shifted by ``mean``.

    Args:
        df: DataFrame that is modified in place.
        column: Name of the column to rescale.
        mean: Offset added after scaling.
        std: Factor the values are multiplied by.

    Returns:
        The same ``df`` object that was passed in.
    """
    rescaled = df[column] * std
    df[column] = rescaled + mean
    return df

# Function to train models using Prophet
def train_models(train_data):
    """Fit one Prophet model per (district, age bin, gender) group.

    For every group the ``count`` series is standardised with
    ``normalize_data`` and a Prophet model is fitted on the resulting
    ``ds`` / ``y`` columns.  ``train_data`` itself is not modified.

    Args:
        train_data: DataFrame with ``district_id``, ``age_bin_id``,
            ``gender_id``, ``ds`` and ``count`` columns.

    Returns:
        dict mapping ``(district_id, age_bin_id, gender_id)`` to a
        ``(model, mean, std)`` tuple, where ``mean`` and ``std`` are the
        scaling parameters needed to de-normalise that model's forecasts.
    """
    group_columns = ['district_id', 'age_bin_id', 'gender_id']
    models = {}

    # groupby hands out each group's rows directly instead of rebuilding a
    # full-length boolean mask per group; sort=False keeps the groups in
    # order of first appearance.
    for group_key, group_rows in train_data.groupby(group_columns, sort=False):
        # normalize_data writes into the frame it receives, so give it an
        # explicit copy rather than a slice of train_data.
        subset_data = group_rows.copy()

        # Normalize the data
        mean, std = normalize_data(subset_data, 'count')

        # Prophet expects the target column to be called 'y'.
        subset_data = subset_data.rename(columns={'count': 'y'})
        model = Prophet(
            yearly_seasonality='auto',      # let Prophet decide whether to fit yearly seasonality
            changepoint_prior_scale=0.1,    # controls trend flexibility
            seasonality_prior_scale=10,     # controls seasonality flexibility
            changepoint_range=0.85          # share of the history in which trend changepoints may be placed
        )
        # `period` is in the unit of `ds`, i.e. days.
        # NOTE(review): presumably each `ds` day stands for one data period,
        # making this a 12-period cycle -- confirm against how `ds` is built.
        model.add_seasonality(
            name='12-period',
            period=12,
            fourier_order=5
        )

        with suppress_logging():
            model.fit(subset_data[['ds', 'y']])

        models[tuple(group_key)] = (model, mean, std)
    return models

# Function to make predictions
def make_predictions(models, test_data):
    """Forecast every test row with the model trained for its group.

    Each forecast value is matched to its test row through the ``ds`` date.
    The model's ``predict`` may return its rows in a different order than
    the rows it was given (Prophet returns them sorted by ``ds``), so a
    positional assignment -- or a blanket reversal of the forecast -- is only
    correct for one particular ordering of the test rows.

    Args:
        models: dict mapping ``(district_id, age_bin_id, gender_id)`` to
            ``(model, mean, std)`` as returned by ``train_models``.
        test_data: DataFrame with ``ID``, ``district_id``, ``age_bin_id``,
            ``gender_id``, ``as_of_date_id`` and ``ds`` columns.  It is not
            modified.

    Returns:
        DataFrame with columns ``ID``, ``district_id``, ``age_bin_id``,
        ``gender_id``, ``as_of_date_id`` and ``Prediction`` (de-normalised
        forecast), one row per test row.  Empty, with those columns, when
        ``test_data`` has no rows.

    Raises:
        KeyError: if a group present in ``test_data`` has no entry in
            ``models``.
    """
    group_columns = ['district_id', 'age_bin_id', 'gender_id']
    output_columns = ['ID', 'district_id', 'age_bin_id', 'gender_id', 'as_of_date_id', 'Prediction']
    predictions = []

    for group_key, group_rows in test_data.groupby(group_columns, sort=False):
        model, mean, std = models[group_key]

        # Explicit copy: a column is added below and must not be written
        # through a slice of test_data.
        subset_test_data = group_rows.copy()

        forecast = model.predict(subset_test_data[['ds']])

        # Denormalize the predictions and index them by date.  Repeated dates
        # receive the same forecast, so dropping duplicates loses nothing and
        # keeps the lookup index unique.
        yhat_by_date = forecast.drop_duplicates('ds').set_index('ds')['yhat'] * std + mean
        subset_test_data['Prediction'] = subset_test_data['ds'].map(yhat_by_date)

        predictions.append(subset_test_data[output_columns])

    # pd.concat raises on an empty list; return an empty, correctly shaped frame.
    if not predictions:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(predictions, ignore_index=True)

# Function to plot results
def plot_results(train_data, predictions):
    """Show total population per period for training data and predictions.

    ``count`` (training) and ``Prediction`` (test) are summed over all groups
    for each ``as_of_date_id`` and drawn as two lines in one figure, which is
    displayed via ``show()``.

    Args:
        train_data: DataFrame with ``as_of_date_id`` and ``count`` columns.
        predictions: DataFrame with ``as_of_date_id`` and ``Prediction``
            columns.

    Returns:
        None.
    """
    train_plot_df = train_data.groupby('as_of_date_id')['count'].sum().reset_index()
    fig_train = px.line(train_plot_df, x='as_of_date_id', y='count', title='Sum of Population per Period of Time (Training Data)', labels={'as_of_date_id': 'Time Period', 'count': 'Total Population'}, markers=True)
    test_plot_df = predictions.groupby('as_of_date_id')['Prediction'].sum().reset_index()
    fig_test = px.line(test_plot_df, x='as_of_date_id', y='Prediction', title='Sum of Population per Period of Time (Test Predictions)', labels={'as_of_date_id': 'Time Period', 'Prediction': 'Total Predicted Population'}, markers=True)

    # A single-series Plotly Express line is created unnamed and hidden from
    # the legend, so without explicit names the 'Dataset' legend configured
    # below would have no entries.  Names match plot_individual_results.
    train_trace = fig_train.data[0]
    train_trace.name = 'Training Data'
    train_trace.showlegend = True

    prediction_trace = fig_test.data[0]
    prediction_trace.name = 'Predictions'
    prediction_trace.showlegend = True
    # Change the color of the prediction line
    prediction_trace.line.color = 'red'

    fig_train.add_trace(prediction_trace)
    fig_train.update_layout(title='Population Count per Period of Time (Training Data and Test Predictions)', xaxis_title='Time Period', yaxis_title='Population Count', legend_title='Dataset')
    fig_train.show()

# Function to plot results for a specific combination
def plot_individual_results(district, age_bin, gender):
    """Build the history-plus-forecast figure for a single group.

    Reads the module-level ``train_data`` and ``predictions`` frames, keeps
    the rows of the requested group and draws the training counts (blue) and
    the predictions (red) against ``as_of_date_id``.

    Args:
        district: ``district_id`` value of the group.
        age_bin: ``age_bin_id`` value of the group.
        gender: ``gender_id`` value of the group.

    Returns:
        A ``go.Figure`` with two scatter traces; it is not displayed here.
    """
    def rows_of_group(frame):
        # Keep only the rows of `frame` that belong to the requested group.
        belongs = (
            (frame['district_id'] == district)
            & (frame['age_bin_id'] == age_bin)
            & (frame['gender_id'] == gender)
        )
        return frame[belongs]

    observed = rows_of_group(train_data)
    forecast = rows_of_group(predictions)

    # (rows, value column, legend label, line colour) for each series, in
    # drawing order: training data first, predictions second.
    series = (
        (observed, 'count', 'Training Data', 'blue'),
        (forecast, 'Prediction', 'Predictions', 'red'),
    )

    figure = go.Figure()
    for rows, value_column, label, colour in series:
        figure.add_trace(go.Scatter(
            x=rows['as_of_date_id'],
            y=rows[value_column],
            mode='lines+markers',
            name=label,
            line=dict(color=colour)
        ))

    heading = f'Population Count per Period of Time (District: {district}, Age Bin: {age_bin}, Gender: {gender})'
    figure.update_layout(
        title=heading,
        xaxis_title='Time Period',
        yaxis_title='Population Count',
        legend_title='Dataset'
    )

    return figure

### Loading data, training models, predicting, and plotting a general overview

In [None]:
# Main execution
# The names assigned in this cell (train_data, test_data, models, predictions)
# are module-level and are read by plot_individual_results and by later cells.
# NOTE(review): both paths are absolute and tied to one particular workspace
# layout -- adjust them when running the notebook elsewhere.
train_file_path = '/workspaces/mpo-ab-test-overview/hacketon/data/train.csv'
test_file_path = '/workspaces/mpo-ab-test-overview/hacketon/data/test.csv'

# Load and preprocess data (dtype casts, period filtering, 'ds' date column)
train_data, test_data = load_and_preprocess(train_file_path, test_file_path)

# Train models for each combination of district, age_bin, and gender using Prophet
# Result: {(district_id, age_bin_id, gender_id): (model, mean, std)}
models = train_models(train_data)

# Make predictions using the trained models (one 'Prediction' per test row)
predictions = make_predictions(models, test_data)

# Plot combined results with different color for predictions
plot_results(train_data, predictions)

### Plotting results case by case

In [None]:
# Collect every (district, age bin, gender) group present in the training
# data.  district_id is stored as text, so a temporary integer copy is used
# as the primary sort key (numeric rather than lexicographic order) and is
# dropped again once the rows are sorted.
unique_combinations = (
    train_data[['district_id', 'age_bin_id', 'gender_id']]
    .drop_duplicates()
    .assign(district_id_int=lambda frame: frame['district_id'].astype(int))
    .sort_values(by=['district_id_int', 'age_bin_id', 'gender_id'])
    .drop(columns=['district_id_int'])
)

# Dropdown options: (label shown to the user, value tuple identifying the group)
options = [
    (f'District: {district_id}, Age Bin: {age_bin_id}, Gender: {gender_id}',
     (district_id, age_bin_id, gender_id))
    for district_id, age_bin_id, gender_id in unique_combinations.itertuples(index=False, name=None)
]


# Figure widget that is redrawn in place whenever the selection changes
fig = go.FigureWidget()

def update_plot(change):
    """Redraw the shared figure widget ``fig`` for a newly selected group.

    Args:
        change: Mapping whose ``'new'`` entry is a
            ``(district, age_bin, gender)`` tuple, as delivered by the
            dropdown's value observer.
    """
    selected_district, selected_age_bin, selected_gender = change['new']
    replacement = plot_individual_results(selected_district, selected_age_bin, selected_gender)

    # Swap traces and layout inside one batch_update block so the widget's
    # old content is replaced as a unit.
    with fig.batch_update():
        fig.data = []
        for new_trace in replacement.data:
            fig.add_trace(new_trace)
        fig.layout = replacement.layout

# Group selector; every change of its value triggers a redraw via update_plot
dropdown = Dropdown(options=options, description='Select Group:')
dropdown.observe(update_plot, names='value')

# 'Back' and 'Next' buttons for stepping through the groups one at a time
back_button, next_button = (Button(description=caption) for caption in ('Back', 'Next'))

# Function to handle 'Back' button click
def on_back_button_clicked(b):
    """Select the previous group in the dropdown; do nothing on the first one.

    Args:
        b: The clicked button (unused).
    """
    values = [value for _, value in options]
    position = values.index(dropdown.value)
    if position <= 0:
        return
    dropdown.value = values[position - 1]

# Function to handle 'Next' button click
def on_next_button_clicked(b):
    """Select the next group in the dropdown; do nothing on the last one.

    Args:
        b: The clicked button (unused).
    """
    values = [value for _, value in options]
    following = values.index(dropdown.value) + 1
    if following < len(values):
        dropdown.value = values[following]

# Wire the navigation buttons to their handlers
back_button.on_click(on_back_button_clicked)
next_button.on_click(on_next_button_clicked)

# Lay out the controls in one row above the figure and show everything
controls = HBox([back_button, next_button, dropdown])
display(VBox([controls, fig]))

# Start on the first group.  update_plot is also called explicitly so the
# figure is drawn even if assigning the dropdown value fires no change event.
first_value = options[0][1]
dropdown.value = first_value
update_plot({'new': first_value})

### Saving output

In [None]:
# Write the submission file 'output.csv': one row per test ID, with the
# prediction column renamed to 'count', ordered by ID, without the index.
output = predictions[['ID', 'Prediction']]
output = output.rename(columns={'Prediction': 'count'})
output = output.sort_values(by='ID')
output.to_csv('output.csv', index=False)