# Step 3. Analyze Data

In [None]:
import warnings
warnings.filterwarnings("ignore")

import re
import os
import time
from datetime import datetime
import glob

import json
from tqdm import tqdm

import pandas as pd
pd.set_option('display.max_rows', 50)

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

import seaborn as sns
sns.set_style("ticks")
plt.rc('grid', linestyle=':')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import plotly.express as px

In [None]:
# Folder that holds the input files read by the cells below.
# It is created here if it does not exist yet.
data_dir = "../data"
os.makedirs(name=data_dir, exist_ok=True)

In [None]:
# Load the state lookup table. Each JSON record is expected to carry a `name`
# and a `value` field, which become `state_name` and `state_id`.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform default.
with open(f'{data_dir}/states.json', encoding='utf-8') as f:
    states = json.load(f)

df_states = pd.DataFrame(states).rename(
    columns={'name': 'state_name', 'value': 'state_id'}
)
df_states['state_id'] = df_states['state_id'].astype(int)

# Drop the id-0 entry (presumably a placeholder / "all states" option -- TODO
# confirm), then order alphabetically. The index is reset only after the
# filter so that it stays contiguous (0..n-1).
df_states = (
    df_states[df_states['state_id'] != 0]
    .sort_values('state_name')
    .reset_index(drop=True)
)

print(df_states.shape)
df_states.head(3)

In [None]:
# Read the dashboard data and build a datetime column from the separate
# `year` and `month` columns ("<year>-<month>" parsed by pandas).
df_dashboard = pd.read_csv(f'{data_dir}/dashboard.csv')

year_text = df_dashboard['year'].astype(str)
month_text = df_dashboard['month'].astype(str)
df_dashboard['date'] = pd.to_datetime(year_text + '-' + month_text)

# Order rows by state, and by date within each state.
df_dashboard = df_dashboard.sort_values(by=['state_id', 'date'])

print(df_dashboard.shape)
df_dashboard.head()

In [None]:
# Sanity check: show the rows that have no state_id.
missing_state = df_dashboard['state_id'].isna()
subset = df_dashboard.loc[missing_state]
print(subset.shape)
subset.head(3)

## Growth in Eligible Beneficiaries by State

In [None]:
# Wide table of eligible beneficiaries: one row per date, one column per
# state_id (pivot_table averages duplicate date/state entries by default).
counts = df_dashboard.pivot_table(
    values='eligible_beneficiaries',
    index='date',
    columns='state_id',
)
counts = counts.sort_index(axis=1)

# Change per state between the first and the last date in the table.
earliest, latest = counts.iloc[0], counts.iloc[-1]
growth = (latest - earliest).rename('total_growth').reset_index()

# Attach state names (inner join on state_id) and order the bars from the
# smallest to the largest growth.
growth = growth.merge(df_states, on='state_id')
growth = growth[['state_name', 'total_growth']].sort_values('total_growth', ascending=True)

period_end = counts.index.max().strftime('%Y-%m')
period_start = counts.index.min().strftime('%Y-%m')

fig = px.bar(
    growth,
    x='state_name',
    y='total_growth',
    title=f"Growth in Eligible Beneficiaries by State ({period_end} vs {period_start})",
    labels={'state_name': 'State', 'total_growth': 'Growth in Eligible Beneficiaries'},
    width=1000,
    height=800,
)

# Layout: no x-axis title, vertical state labels, white background.
fig.update_layout(
    xaxis_title='',
    yaxis_title='Growth in Eligible Beneficiaries',
    showlegend=False,
    bargap=0.2,
    plot_bgcolor='white',
    xaxis_tickangle=-90,
    margin={'l': 40, 'r': 20, 't': 60, 'b': 100},
)

# Light grid lines; thousands separators on the value axis.
fig.update_yaxes(gridcolor='lightgrey', tickformat=',')
fig.update_xaxes(gridcolor='lightgrey', gridwidth=1, griddash='dot')

fig.show()

In [None]:
# Wide table of eligible beneficiaries: one row per date, one column per
# state_id (pivot_table averages duplicate date/state entries by default).
counts = df_dashboard \
    .pivot_table(
        index='date',
        columns='state_id',
        values='eligible_beneficiaries'
    ).sort_index(axis=1)

# Cumulative growth of each state relative to its own first reported value.
# The baseline is subtracted directly instead of using `.diff().cumsum()`:
# with a missing month, diff() yields NaN on both sides of the gap and
# cumsum() skips them, silently losing the change across the gap. This form
# also starts every series at 0 instead of NaN.
counts = counts - counts.bfill().iloc[0]

# Replace state_id column labels with state names. Ids that are not present in
# df_states are dropped (same effect as the inner merge used for the bar
# chart) instead of raising IndexError.
unique_states = df_states.drop_duplicates('state_id')
name_by_id = dict(zip(unique_states['state_id'], unique_states['state_name']))
known_ids = [state_id for state_id in counts.columns if state_id in name_by_id]
counts = counts[known_ids]
counts.columns = [name_by_id[state_id] for state_id in known_ids]
counts = counts.reindex(sorted(counts.columns), axis=1)

fig = px.line(
    counts,
    y=counts.columns,
    x=counts.index,
    title=f"Cumulative Growth in Eligible Beneficiaries by State Over Time ({counts.index.min().strftime('%Y-%m')} to {counts.index.max().strftime('%Y-%m')})",
    # Wide-form input exposes the series name as 'variable' and the plotted
    # number as 'value'; these keys control the hover labels.
    labels={'variable': 'State', 'value': 'Growth in Eligible Beneficiaries'},
    width=1500,
    height=900,
)

fig.update_yaxes(
    gridcolor='lightgrey',
    tickformat=',',
    title='Growth in Eligible Beneficiaries'
)

fig.update_xaxes(
    gridcolor='lightgrey',
    gridwidth=1,
    griddash='dot',
    title=''
)

fig.update_layout(
    legend_title_text="States"
)

fig.show()

## Growth in Anganwadi Centers by State

In [None]:
# Wide table of anganwadi centers: one row per date, one column per
# state_id (pivot_table averages duplicate date/state entries by default).
counts = df_dashboard.pivot_table(
    values='anganwadi_centers',
    index='date',
    columns='state_id',
)
counts = counts.sort_index(axis=1)

# Change per state between the first and the last date in the table.
earliest, latest = counts.iloc[0], counts.iloc[-1]
growth = (latest - earliest).rename('total_growth').reset_index()

# Attach state names (inner join on state_id) and order the bars from the
# smallest to the largest growth.
growth = growth.merge(df_states, on='state_id')
growth = growth[['state_name', 'total_growth']].sort_values('total_growth', ascending=True)

period_end = counts.index.max().strftime('%Y-%m')
period_start = counts.index.min().strftime('%Y-%m')

fig = px.bar(
    growth,
    x='state_name',
    y='total_growth',
    title=f"Growth in Anganwadi Centers by State ({period_end} vs {period_start})",
    labels={'state_name': 'State', 'total_growth': 'Growth in Anganwadi Centers'},
    width=1000,
    height=800,
)

# Layout: no x-axis title, vertical state labels, white background.
fig.update_layout(
    xaxis_title='',
    yaxis_title='Growth in Anganwadi Centers',
    showlegend=False,
    bargap=0.2,
    plot_bgcolor='white',
    xaxis_tickangle=-90,
    margin={'l': 40, 'r': 20, 't': 60, 'b': 100},
)

# Light grid lines; thousands separators on the value axis.
fig.update_yaxes(gridcolor='lightgrey', tickformat=',')
fig.update_xaxes(gridcolor='lightgrey', gridwidth=1, griddash='dot')

fig.show()

In [None]:
# Wide table of anganwadi centers: one row per date, one column per
# state_id (pivot_table averages duplicate date/state entries by default).
counts = df_dashboard \
    .pivot_table(
        index='date',
        columns='state_id',
        values='anganwadi_centers'
    ).sort_index(axis=1)

# Cumulative growth of each state relative to its own first reported value.
# The baseline is subtracted directly instead of using `.diff().cumsum()`:
# with a missing month, diff() yields NaN on both sides of the gap and
# cumsum() skips them, silently losing the change across the gap. This form
# also starts every series at 0 instead of NaN.
counts = counts - counts.bfill().iloc[0]

# Replace state_id column labels with state names. Ids that are not present in
# df_states are dropped (same effect as the inner merge used for the bar
# chart) instead of raising IndexError.
unique_states = df_states.drop_duplicates('state_id')
name_by_id = dict(zip(unique_states['state_id'], unique_states['state_name']))
known_ids = [state_id for state_id in counts.columns if state_id in name_by_id]
counts = counts[known_ids]
counts.columns = [name_by_id[state_id] for state_id in known_ids]
counts = counts.reindex(sorted(counts.columns), axis=1)

fig = px.line(
    counts,
    y=counts.columns,
    x=counts.index,
    title=f"Cumulative Growth in Anganwadi Centers by State Over Time ({counts.index.min().strftime('%Y-%m')} to {counts.index.max().strftime('%Y-%m')})",
    # Wide-form input exposes the series name as 'variable' and the plotted
    # number as 'value'; these keys control the hover labels.
    labels={'variable': 'State', 'value': 'Growth in Anganwadi Centers'},
    width=1500,
    height=900,
)

fig.update_yaxes(
    gridcolor='lightgrey',
    tickformat=',',
    title='Growth in Anganwadi Centers'
)

fig.update_xaxes(
    gridcolor='lightgrey',
    gridwidth=1,
    griddash='dot',
    title=''
)

fig.update_layout(
    legend_title_text="States"
)

fig.show()

## Growth in Anganwadi Workers by State

In [None]:
# Wide table of anganwadi workers: one row per date, one column per
# state_id (pivot_table averages duplicate date/state entries by default).
counts = df_dashboard.pivot_table(
    values='anganwadi_workers',
    index='date',
    columns='state_id',
)
counts = counts.sort_index(axis=1)

# Change per state between the first and the last date in the table.
earliest, latest = counts.iloc[0], counts.iloc[-1]
growth = (latest - earliest).rename('total_growth').reset_index()

# Attach state names (inner join on state_id) and order the bars from the
# smallest to the largest growth.
growth = growth.merge(df_states, on='state_id')
growth = growth[['state_name', 'total_growth']].sort_values('total_growth', ascending=True)

period_end = counts.index.max().strftime('%Y-%m')
period_start = counts.index.min().strftime('%Y-%m')

fig = px.bar(
    growth,
    x='state_name',
    y='total_growth',
    title=f"Growth in Anganwadi Workers by State ({period_end} vs {period_start})",
    labels={'state_name': 'State', 'total_growth': 'Growth in Anganwadi Workers'},
    width=1000,
    height=800,
)

# Layout: no x-axis title, vertical state labels, white background.
fig.update_layout(
    xaxis_title='',
    yaxis_title='Growth in Anganwadi Workers',
    showlegend=False,
    bargap=0.2,
    plot_bgcolor='white',
    xaxis_tickangle=-90,
    margin={'l': 40, 'r': 20, 't': 60, 'b': 100},
)

# Light grid lines; thousands separators on the value axis.
fig.update_yaxes(gridcolor='lightgrey', tickformat=',')
fig.update_xaxes(gridcolor='lightgrey', gridwidth=1, griddash='dot')

fig.show()

In [None]:
# Wide table of anganwadi workers: one row per date, one column per
# state_id (pivot_table averages duplicate date/state entries by default).
counts = df_dashboard \
    .pivot_table(
        index='date',
        columns='state_id',
        values='anganwadi_workers'
    ).sort_index(axis=1)

# Cumulative growth of each state relative to its own first reported value.
# The baseline is subtracted directly instead of using `.diff().cumsum()`:
# with a missing month, diff() yields NaN on both sides of the gap and
# cumsum() skips them, silently losing the change across the gap. This form
# also starts every series at 0 instead of NaN.
counts = counts - counts.bfill().iloc[0]

# Replace state_id column labels with state names. Ids that are not present in
# df_states are dropped (same effect as the inner merge used for the bar
# chart) instead of raising IndexError.
unique_states = df_states.drop_duplicates('state_id')
name_by_id = dict(zip(unique_states['state_id'], unique_states['state_name']))
known_ids = [state_id for state_id in counts.columns if state_id in name_by_id]
counts = counts[known_ids]
counts.columns = [name_by_id[state_id] for state_id in known_ids]
counts = counts.reindex(sorted(counts.columns), axis=1)

fig = px.line(
    counts,
    y=counts.columns,
    x=counts.index,
    title=f"Cumulative Growth in Anganwadi Workers by State Over Time ({counts.index.min().strftime('%Y-%m')} to {counts.index.max().strftime('%Y-%m')})",
    # Wide-form input exposes the series name as 'variable' and the plotted
    # number as 'value'; these keys control the hover labels.
    labels={'variable': 'State', 'value': 'Growth in Anganwadi Workers'},
    width=1500,
    height=900,
)

fig.update_yaxes(
    gridcolor='lightgrey',
    tickformat=',',
    title='Growth in Anganwadi Workers'
)

fig.update_xaxes(
    gridcolor='lightgrey',
    gridwidth=1,
    griddash='dot',
    title=''
)

fig.update_layout(
    legend_title_text="States"
)

fig.show()