In [6]:
import sys, os
import re

import pandas as pd
import numpy as np

#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def get_standard_colors(n_colors, palette=None):
    """Return a list of `n_colors` colors from a seaborn palette.

    Parameters
    ----------
    n_colors : int
        Number of colors requested.
    palette : optional
        Anything accepted by `sns.color_palette`.  When None, a default is
        chosen from `n_colors`:
          * 'colorblind' for n_colors <= 10 (that palette only has 10
            distinct colors),
          * 'husl' otherwise.

    Returns
    -------
    The object returned by `sns.color_palette(palette, n_colors)`.
    """
    if palette is None:
        # Pick the default palette name from the number of colors needed
        palette = 'colorblind' if n_colors <= 10 else 'husl'
    return sns.color_palette(palette, n_colors)

def get_standard_colors_dict(keys, palette=None):
    """Map each key to its own color from `get_standard_colors`.

    Parameters
    ----------
    keys : iterable
        The items to be colored (e.g. a list, numpy array, pandas Series, set
        or generator).  Keys are paired with colors in iteration order, so no
        positional indexing of `keys` is required.
    palette : optional
        Forwarded to `get_standard_colors`; None selects its default palette.

    Returns
    -------
    dict
        {key: color}, with one entry per element of `keys`.
    """
    # Materialize once so len() works and one-shot iterables are not consumed twice
    keys = list(keys)
    colors = get_standard_colors(len(keys), palette)
    return dict(zip(keys, colors))

In [None]:
#Erica's functions
def dist_cmi(df, district, percentage):
    """Select the highest-scoring rows of a district covering the top
    `percentage` percent of its potential porcelain cutouts.

    The score threshold starts at 100 and is lowered in steps of 0.1 until the
    selected rows hold MORE than the target number of cutouts.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'District', 'final_score' and
        'Total Potential Porcelain Cutouts'.
    district : value compared against df['District'].
    percentage : number on the 0-100 scale (e.g. 20 for 20%).

    Returns
    -------
    pd.DataFrame
        The rows of the district with final_score >= the final threshold.
        If the target can never be exceeded (e.g. percentage=100, a district
        with no rows, or zero total cutouts) every row of the district that
        has a score is returned instead of looping forever.
    """
    dist_df=df[df['District']==district]
    total_cutouts=dist_df['Total Potential Porcelain Cutouts'].sum()
    top_cutouts=round(percentage/100*total_cutouts,0)
    # Rows with a NaN score can never satisfy `>= threshold`, so they are not
    #   counted as selectable
    n_scored=dist_df['final_score'].notna().sum()
    threshold=100
    new_df=dist_df[dist_df['final_score']==threshold]
    while new_df['Total Potential Porcelain Cutouts'].sum()<=top_cutouts:
        threshold-=.1
        new_df=dist_df[dist_df['final_score']>=threshold]
        # Once every scored row is selected, lowering the threshold further
        #   cannot change the selection; without this exit the loop would
        #   never terminate when the target is unreachable
        if len(new_df)>=n_scored:
            break
    return new_df
def percent_cmi(df, district, percentage):
    """Percent of a district's 'Total Circuit CMI' carried by the rows that
    `dist_cmi` selects for the given cutout percentage.

    `percentage` is forwarded unchanged to `dist_cmi` (the original version of
    this function passed an undefined name, `percent`, instead).
    """
    cmi_col = 'Total Circuit CMI'
    # CMI of the rows picked by dist_cmi
    selected_cmi = dist_cmi(df, district, percentage)[cmi_col].sum()
    # CMI of every row in the district
    in_district = df['District'] == district
    district_cmi = df[in_district][cmi_col].sum()
    return selected_cmi/district_cmi*100


In [None]:
def dist_cmi_JB_0(df, district, percentage, district_col='District', ignore_warning=True, **kwargs):
    """Return the highest-scoring rows of a district whose cumulative share of
    potential porcelain cutouts does not exceed `percentage`.

    Parameters
    ----------
    df : pd.DataFrame
    district : value compared against df[district_col].
    percentage : float
        INTENDED to be between 0 and 1.  Values greater than 1 are divided by
        100 (i.e., 20 is treated as 0.20).  percentage=1 is interpreted as
        100%; to request 1%, input 0.01.
    district_col : str
        Name of the district column.
    ignore_warning : bool
        If False, print a message when a percentage > 1 is rescaled.
    **kwargs
        final_score_col (default 'final_score') and
        tot_potential_prcln_cutous_col (default
        'Total Potential Porcelain Cutouts') allow different column names.

    Returns
    -------
    pd.DataFrame
        Rows of the district, sorted by descending score, with cumulative
        cutout fraction <= percentage (percentage is a hard maximum).
    """
    #------------------
    # Unpack kwargs
    # NOTE: district_col and ignore_warning are named parameters, so they can
    #       never appear in kwargs.  Re-reading them from kwargs (as was done
    #       previously) discarded whatever the caller passed in.
    final_score_col                = kwargs.get('final_score_col', 'final_score')
    tot_potential_prcln_cutous_col = kwargs.get('tot_potential_prcln_cutous_col', 
                                                'Total Potential Porcelain Cutouts')
    #------------------
    if percentage>1:
        if not ignore_warning:
            print(f'percentage > 1!\npercentage set equal to {percentage}\nUsing value {percentage/100}')
        percentage/=100
    srtd_dist_df = df[df[district_col]==district].sort_values(by=[final_score_col], ascending=False)
    # Running share of the district's cutouts, accumulated from the best score down
    srtd_dist_df['cumulative_percentage'] = (srtd_dist_df[tot_potential_prcln_cutous_col].cumsum()/
                                             srtd_dist_df[tot_potential_prcln_cutous_col].sum())
    return_df = srtd_dist_df[srtd_dist_df['cumulative_percentage']<=percentage] #+.015 with 20% to get Erica's results
    return_df = return_df.drop(columns=['cumulative_percentage'])
    return return_df


def dist_cmi_JB(df, district, percentage, use_first_value_over_percentage=False, **kwargs):
    """Return the highest-scoring rows of a district covering (about)
    `percentage` of its potential porcelain cutouts.

    Parameters
    ----------
    df : pd.DataFrame
    district : value compared against df[district_col].
    percentage : float
        INTENDED to be between 0 and 1.  Values greater than 1 are divided by
        100 (i.e., 20 is treated as 0.20).  percentage=1 is interpreted as
        100%; to request 1%, input 0.01.
    use_first_value_over_percentage : bool
        If True and no row lands exactly on `percentage`, the first row(s)
        whose cumulative share exceeds it are included as well (intended to
        match Erica's original dist_cmi).
        If False, `percentage` is a hard maximum, as in dist_cmi_JB_0.
    **kwargs
        district_col (default 'District'),
        final_score_col (default 'final_score'),
        tot_potential_prcln_cutous_col (default
        'Total Potential Porcelain Cutouts'),
        ignore_warning (default False; suppresses the percentage > 1 message),
        cumu_pct_col (default 'cumulative_percentage'; name of the helper
        column), drop_cumu_pct_col (default True; drop the helper column from
        the result).

    Returns
    -------
    pd.DataFrame
        The selected rows, sorted by descending score.  Empty if the district
        has no rows.
    """
    #------------------
    # Unpack kwargs
    # This simply makes the function more flexible.  If, in the future, your dataframe has different
    #   column names, you can simply input the new names as arguments to the function.
    district_col                   = kwargs.get('district_col', 'District')
    final_score_col                = kwargs.get('final_score_col', 'final_score')
    tot_potential_prcln_cutous_col = kwargs.get('tot_potential_prcln_cutous_col', 
                                                'Total Potential Porcelain Cutouts')
    ignore_warning                 = kwargs.get('ignore_warning', False)
    cumu_pct_col                   = kwargs.get('cumu_pct_col', 'cumulative_percentage')
    drop_cumu_pct_col              = kwargs.get('drop_cumu_pct_col', True)
    #------------------
    if percentage>1:
        if not ignore_warning:
            print(f'percentage > 1!\npercentage set equal to {percentage}\nUsing value {percentage/100}')
        percentage/=100
    srtd_dist_df = df[df[district_col]==district].sort_values(by=[final_score_col], ascending=False)
    srtd_dist_df[cumu_pct_col] = (srtd_dist_df[tot_potential_prcln_cutous_col].cumsum()/
                                  srtd_dist_df[tot_potential_prcln_cutous_col].sum())
    return_df = srtd_dist_df[srtd_dist_df[cumu_pct_col]<=percentage]
    #------------------
    # Below, cumulative_percentage is treated as monotonically increasing, which is why
    #   grabbing the next row by position is fine
    # NOTE!!!! Order matters in the condition below
    #          - n_kept < number of rows must hold, otherwise there is no "next" row and
    #            srtd_dist_df.iloc[n_kept] would raise an IndexError (empty district, or
    #            every row already selected)
    #          - n_kept==0 must be evaluated before return_df.iloc[-1], which fails on an
    #            empty DataFrame
    n_kept = return_df.shape[0]
    if (use_first_value_over_percentage and 
        n_kept<srtd_dist_df.shape[0] and 
        (n_kept==0 or return_df.iloc[-1][cumu_pct_col]<percentage)):
        next_val = srtd_dist_df.iloc[n_kept][cumu_pct_col]
        # This handles the unlikely case that the next 2 or more entries have the exact same cumulative_percentage
        #   This could only occur if the tot_potential_prcln_cutous_col values for these rows equals 0
        #   In this case, all sharing the cumulative_percentage value will be appended
        #  In probably every case, there will only be one additional row appended
        to_append = srtd_dist_df[srtd_dist_df[cumu_pct_col]==next_val]
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        return_df = pd.concat([return_df, to_append])
    #------------------
    if drop_cumu_pct_col:
        return_df = return_df.drop(columns=[cumu_pct_col])
    return return_df

In [None]:
def percent_cmi_JB(df, district, percentage, use_first_value_over_percentage=False, **kwargs):
    """Percent (0-100) of a district's total circuit CMI carried by the rows
    that `dist_cmi_JB` selects.

    `percentage`, `use_first_value_over_percentage` and all kwargs are passed
    through to `dist_cmi_JB`.  Two kwargs are also read here:
    district_col (default 'District') and tot_cir_cmi_col (default
    'Total Circuit CMI').

    Returns 0 when the district's total CMI is 0 (after asserting that the
    selected CMI is 0 as well).
    """
    #------------------
    # Column names (overridable through kwargs)
    district_col    = kwargs.get('district_col', 'District')
    tot_cir_cmi_col = kwargs.get('tot_cir_cmi_col', 'Total Circuit CMI')
    #------------------
    district_rows = df[df[district_col]==district]
    total_cmi = district_rows[tot_cir_cmi_col].sum()
    # The already-filtered frame is handed to dist_cmi_JB rather than df;
    #   the result is the same, dist_cmi_JB simply has fewer rows to filter
    selected_rows = dist_cmi_JB(district_rows, district, percentage, use_first_value_over_percentage, **kwargs)
    selected_cmi = selected_rows[tot_cir_cmi_col].sum()
    if total_cmi!=0:
        return 100.0*selected_cmi/total_cmi
    # No CMI in the district at all, so the selected subset cannot have any either
    assert(selected_cmi==0)
    return 0

In [None]:
# Load the circuit table and keep only the rows that have a final_score
erica_df = (
    pd.read_csv(r'C:\Users\s346557\Documents\erica_df.csv')
    # NOTE: Is this what you want?  Rows without a final_score are discarded here
    .dropna(subset=['final_score'])
)

# Compare dist_cmi to dist_cmi_JB

In [None]:
# Column names plus the example district / cutoff used by the comparison cells below
district_col = 'District'
final_score_col = 'final_score'
tot_potential_prcln_cutous_col = 'Total Potential Porcelain Cutouts'

district='Athens'
# Given on the 0-100 scale: dist_cmi divides it by 100, and the *_JB functions
#   divide any value > 1 by 100
percentage= 10

In [None]:
# Run the same district/percentage through every implementation
df = erica_df.copy()
# Erica's threshold-stepping version
df_dist_cmi_EP = dist_cmi(df, district, percentage)
# First JB version (percentage is a hard maximum)
df_dist_cmi_JB_0 = dist_cmi_JB_0(df, district, percentage)
# Second JB version with its default use_first_value_over_percentage=False
df_dist_cmi_JB_0_v2 = dist_cmi_JB(df, district, percentage)
# Second JB version, also including the first row(s) over the percentage
df_dist_cmi_JB = dist_cmi_JB(df, district, percentage, use_first_value_over_percentage=True)

In [None]:
# Both results are sorted by 'Circuit' with a fresh index before comparing, so
#   row order and original index labels do not affect the equality check
print('df_dist_cmi_EP == df_dist_cmi_JB?')
print(df_dist_cmi_EP.sort_values(by=['Circuit'], ignore_index=True).equals(df_dist_cmi_JB.sort_values(by=['Circuit'], ignore_index=True)))

In [None]:
# Same order-insensitive comparison, this time between the two hard-maximum results
print('df_dist_cmi_JB_0 == df_dist_cmi_JB_0_v2?')
print(df_dist_cmi_JB_0.sort_values(by=['Circuit'], ignore_index=True).equals(df_dist_cmi_JB_0_v2.sort_values(by=['Circuit'], ignore_index=True)))

In [None]:
# Print the CMI percentage from each method, district by district.
# NOTE: the loop rebinds the notebook-level variable `district`.
# ignore_warning=True is forwarded through kwargs to dist_cmi_JB to silence
#   its "percentage > 1" message.
print('Compare methods')
for district in erica_df['District'].unique():
    print(f'{district}:')
    print(f"""
    Erica's method:               {percent_cmi(erica_df, district, percentage)}
    JB (use_first_over...=True):  {percent_cmi_JB(erica_df, district, percentage, use_first_value_over_percentage=True, ignore_warning=True)}
    JB (use_first_over...=False): {percent_cmi_JB(erica_df, district, percentage, use_first_value_over_percentage=False, ignore_warning=True)}
    """)

# Walk through dist_cmi_JB

In [None]:
# Step-by-step version of dist_cmi_JB, run at notebook level so that the
#   intermediate DataFrames (srtd_dist_df, return_df) can be inspected.
# Uses `df` defined in an earlier cell.
district_col = 'District'
final_score_col = 'final_score'
tot_potential_prcln_cutous_col = 'Total Potential Porcelain Cutouts'
ignore_warning = False
use_first_value_over_percentage = False

district='Athens'
# Fraction between 0 and 1 (a value > 1, e.g. 10, would be divided by 100 below)
percentage= 0.10


if percentage>1:
    if not ignore_warning:
        print(f'percentage > 1!\npercentage set equal to {percentage}\nUsing value {percentage/100}')
    percentage/=100
srtd_dist_df = df[df[district_col]==district].sort_values(by=[final_score_col], ascending=False)
srtd_dist_df['cumulative_percentage'] = (srtd_dist_df[tot_potential_prcln_cutous_col].cumsum()/
                                         srtd_dist_df[tot_potential_prcln_cutous_col].sum())
return_df = srtd_dist_df[srtd_dist_df['cumulative_percentage']<=percentage]
#------------------
# Below, cumulative_percentage is treated as monotonically increasing, which is why
#   grabbing the next row by position is fine
# Order matters in the condition: a "next" row must exist, and n_kept==0 must be
#   checked before return_df.iloc[-1] (which fails on an empty DataFrame)
n_kept = return_df.shape[0]
if (use_first_value_over_percentage and 
    n_kept<srtd_dist_df.shape[0] and 
    (n_kept==0 or return_df.iloc[-1]['cumulative_percentage']<percentage)):
    next_val = srtd_dist_df.iloc[n_kept]['cumulative_percentage']
    # This handles the unlikely case that the next 2 or more entries have the exact same cumulative_percentage
    #   This could only occur if the tot_potential_prcln_cutous_col values for these rows equals 0
    #   In this case, all sharing the cumulative_percentage value will be appended
    #  In probably every case, there will only be one additional row appended
    to_append = srtd_dist_df[srtd_dist_df['cumulative_percentage']==next_val]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    return_df = pd.concat([return_df, to_append])
#------------------
return_df = return_df.drop(columns=['cumulative_percentage'])

# Build Plotting DataFrame

### Original idea, new_df below, good for visual inspection by human eye
### But, not so great for utilization in barplot/lineplot methods

In [None]:
# Running figure number, passed as `num=` to plt.subplots and incremented after each figure
fig_num = 0
# 11 evenly spaced fractions from 0 to 1 inclusive
percents = np.linspace(0,1,11)
percents = percents[1:]  #Exclude pct=0, which is trivially 0 for all
districts = erica_df['District'].unique().tolist()
# Wide layout: one row per cutoff, one column per district (empty for now)
new_df = pd.DataFrame(columns=['cutoff_pct']+districts)
print(new_df)
print(percents)

In [None]:
# Build the wide table: one row per cutoff, one column per district.
# Rows are collected in a list and turned into a DataFrame once, because
#   DataFrame.append was removed in pandas 2.0 (and growing a frame row by row
#   is quadratic).  Rebuilding from scratch also means re-running this cell
#   does not duplicate rows.
records = []
for pct in percents:
    col_dict = {'cutoff_pct':pct}
    for district in districts:
        # Guards against a district name colliding with an existing key
        assert(district not in col_dict)
        col_dict[district] = percent_cmi_JB(erica_df, district, pct)
    records.append(col_dict)
new_df = pd.DataFrame(records, columns=['cutoff_pct']+districts)
new_df

### plot_df, below, is better suited for plotting

In [None]:
# Build the long-format table: one row per (cutoff, district) pair.
# Records are collected first and the DataFrame is created once, because
#   DataFrame.append was removed in pandas 2.0.
plot_records = [
    {'pct_cutoff':100*pct, 
     'district':district, 
     'value':percent_cmi_JB(erica_df, district, pct)}
    for pct in percents
    for district in districts
]
plot_df = pd.DataFrame(plot_records, columns=['pct_cutoff', 'district', 'value'])
# Round away floating-point noise (e.g. 30.000000000000004) so the cutoffs
#   compare and display cleanly
plot_df['pct_cutoff'] =  plot_df['pct_cutoff'].round(1)

In [None]:
# Fixed key -> color lookups so each district / cutoff keeps the same color in every plot
colors_dict_districts = get_standard_colors_dict(districts)
colors_dict_pct_cutoffs = get_standard_colors_dict(plot_df['pct_cutoff'].unique())

In [None]:
# pandas bar chart of the 10% cutoff rows: one bar per district
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
plot_df[plot_df['pct_cutoff']==10.0].plot.bar(ax=ax, x='district', y='value')
fig_num += 1

In [None]:
# Same 10% cutoff rows drawn with seaborn, using the per-district color lookup
sns.barplot(x='district', y='value', data=plot_df[plot_df['pct_cutoff']==10.0], palette=colors_dict_districts)

In [None]:
# All cutoffs at once: districts on the x axis, one bar per cutoff within each district
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
sns.barplot(ax=ax, x='district', y='value', hue='pct_cutoff', data=plot_df)
fig_num+=1

In [None]:
def handle_and_apply_formattings_args(ax, draw_legend, kwargs):
    """Apply optional title/label/tick/legend formatting to an Axes.

    Parameters
    ----------
    ax : matplotlib Axes (or any object offering set_title, set, set_xlabel,
        set_ylabel, tick_params, legend and a legend_ attribute).
    draw_legend : bool
        If True, (re)draw the legend, using legend_args when given.
        If False, remove the existing legend, if there is one.
    kwargs : dict
        May contain any of the following; missing or None entries are skipped.
          title_args  : str, or dict of keyword arguments for ax.set_title
          ax_args     : dict of keyword arguments for ax.set
          xlabel_args : str, or dict of keyword arguments for ax.set_xlabel
          ylabel_args : str, or dict of keyword arguments for ax.set_ylabel
          legend_args : dict of keyword arguments for ax.legend
          tick_args   : dict, or list of dicts, of keyword arguments for
                        ax.tick_params.  A list allows operations on both x and y
                        e.g. tick_args =[dict(axis='x', labelrotation=90, labelsize=7.0, direction='in'), 
                                         dict(axis='y', labelrotation=0, labelsize=10.0, direction='out')]

    Returns
    -------
    None; `ax` is modified in place.
    """
    title_args  = kwargs.get('title_args', None)
    ax_args     = kwargs.get('ax_args', None)
    xlabel_args = kwargs.get('xlabel_args', None)
    ylabel_args = kwargs.get('ylabel_args', None)
    legend_args = kwargs.get('legend_args', None)
    tick_args   = kwargs.get('tick_args', None)
    #----------------------------
    # A plain string is shorthand for "just the text", for the title and
    #   both axis labels alike
    if isinstance(title_args, str):
        title_args = dict(label=title_args)
    if isinstance(xlabel_args, str):
        xlabel_args = dict(xlabel=xlabel_args)
    if isinstance(ylabel_args, str):
        ylabel_args = dict(ylabel=ylabel_args)
    #----------------------------
    if title_args is not None:
        ax.set_title(**title_args)
    if ax_args is not None:
        ax.set(**ax_args)
    if xlabel_args is not None:
        ax.set_xlabel(**xlabel_args)
    if ylabel_args is not None:
        ax.set_ylabel(**ylabel_args)
    if tick_args is not None:
        # Normalize the single-dict form to a one-element list
        if isinstance(tick_args, dict):
            tick_args = [tick_args]
        for t_args in tick_args:
            ax.tick_params(**t_args)
    #---------------------------
    if not draw_legend:
        # The plotting call may already have attached a legend; take it off
        if ax.legend_:
            ax.legend_.remove()
    else:
        if legend_args is None:
            ax.legend()
        else:
            ax.legend(**legend_args)
    #---------------------------

In [None]:
def build_plot_for_erica(ax, df, x_col, y_col, hue, 
                         draw_legend=True, barplot_kwargs=None, 
                         **kwargs):
    """Draw a seaborn bar plot on `ax` and then apply the formatting options.

    Parameters
    ----------
    ax : Axes to draw on.
    df : DataFrame handed to sns.barplot as `data`.
    x_col, y_col, hue : column names (hue may be None) handed to sns.barplot.
    draw_legend : bool, forwarded to handle_and_apply_formattings_args.
    barplot_kwargs : dict or None
        Extra arguments for sns.barplot only.  A 'palette' entry (None) is
        added to this dict when it has none.
    **kwargs
        Formatting options (title_args, xlabel_args, ...) consumed by
        handle_and_apply_formattings_args.
    """
    #---------------------------
    barplot_kwargs = {} if barplot_kwargs is None else barplot_kwargs
    # Make sure a palette entry is always present, defaulting to None
    barplot_kwargs.setdefault('palette', None)
    #---------------------------
    sns.barplot(data=df, x=x_col, y=y_col, hue=hue, ax=ax, **barplot_kwargs)
    #---------------------------
    handle_and_apply_formattings_args(ax=ax, draw_legend=draw_legend, kwargs=kwargs)

In [None]:
# Cutoffs on the x axis, one bar per district within each cutoff (direct seaborn call)
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
sns.barplot(ax=ax, x='pct_cutoff', y='value', hue='district', data=plot_df, palette=colors_dict_districts)
fig_num+=1

In [None]:
# Same plot as the previous cell (cutoffs on x, one bar per district), now built
#   through build_plot_for_erica so title, labels, ticks and legend are formatted
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
build_plot_for_erica(ax=ax, df=plot_df, x_col='pct_cutoff', y_col='value', hue='district', 
                     draw_legend=True, legend_args=dict(fontsize=15, title='District', title_fontsize=15), 
                     barplot_kwargs= dict(palette=colors_dict_districts), 
                     title_args=dict(label=f'% CMI Avoided After Replacing Cutouts', fontsize=22.5), 
                     ylabel_args = dict(ylabel=f'% CMI', fontsize=20, x=0.0, y=0.8, ha='left', va='bottom'), 
                     xlabel_args = dict(xlabel='% Cutouts Replaced', fontsize=20, x=0.9, y=0.0, ha='right', va='top'), 
                     tick_args=[dict(axis='x', labelrotation=90, labelsize=15), 
                                dict(axis='y', labelsize=15)])
fig_num += 1

In [None]:
# Using husl palette instead of dict
# Otherwise identical to the previous figure; only the palette argument differs
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
build_plot_for_erica(ax=ax, df=plot_df, x_col='pct_cutoff', y_col='value', hue='district', 
                     draw_legend=True, legend_args=dict(fontsize=15, title='District', title_fontsize=15), 
                     barplot_kwargs= dict(palette='husl'), 
                     title_args=dict(label=f'% CMI Avoided After Replacing Cutouts', fontsize=22.5), 
                     ylabel_args = dict(ylabel=f'% CMI', fontsize=20, x=0.0, y=0.8, ha='left', va='bottom'), 
                     xlabel_args = dict(xlabel='% Cutouts Replaced', fontsize=20, x=0.9, y=0.0, ha='right', va='top'), 
                     tick_args=[dict(axis='x', labelrotation=90, labelsize=15), 
                                dict(axis='y', labelsize=15)])
fig_num += 1

In [None]:
# NOTE(review): bare literal with no assignment; it only echoes 0.125 as the
#   cell output.  Looks like leftover scratch work -- confirm and delete the cell.
0.125

In [None]:
# Districts on the x axis, one bar per cutoff within each district.
# The legend is anchored outside the axes, so room is reserved on the right.
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
plt.subplots_adjust(right=0.80, bottom=0.25, left=0.075)
# The x axis shows districts here (the cutoffs are the hue / legend), so the
#   x label is 'District' rather than '% Cutouts Replaced'
build_plot_for_erica(ax=ax, df=plot_df, x_col='district', y_col='value', hue='pct_cutoff', 
                     draw_legend=True, legend_args=dict(fontsize=15, bbox_to_anchor=(1,1), title='% Cutouts Replaced', title_fontsize=15), 
                     barplot_kwargs= dict(palette=colors_dict_pct_cutoffs), 
                     title_args=dict(label=f'% CMI Avoided After Replacing Cutouts', fontsize=22.5), 
                     ylabel_args = dict(ylabel=f'% CMI', fontsize=20, x=0.0, y=0.8, ha='left', va='bottom'), 
                     xlabel_args = dict(xlabel='District', fontsize=20, x=0.9, y=0.0, ha='right', va='top'), 
                     tick_args=[dict(axis='x', labelrotation=90, labelsize=15), 
                                dict(axis='y', labelsize=15)])
fig_num += 1
# NOTE: hard-coded, user-specific output path
fig.savefig(r'C:\Users\s346557\Downloads\test.pdf')

In [None]:
# Single-cutoff view: one bar per district for the chosen cutoff, no legend
pct_cutoff = 10.0
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
# The x axis shows districts, so the x label is 'District' rather than
#   '% Cutouts Replaced'
build_plot_for_erica(ax=ax, df=plot_df[plot_df['pct_cutoff']==pct_cutoff], x_col='district', y_col='value', hue=None, 
                     draw_legend=False, 
                     barplot_kwargs= dict(palette=colors_dict_districts), 
                     title_args=dict(label=f'% CMI Avoided After Replacing Cutouts', fontsize=22.5), 
                     ylabel_args = dict(ylabel=f'% CMI', fontsize=20, x=0.0, y=0.8, ha='left', va='bottom'), 
                     xlabel_args = dict(xlabel='District', fontsize=20, x=0.9, y=0.0, ha='right', va='top'), 
                     tick_args=[dict(axis='x', labelrotation=90, labelsize=15), 
                                dict(axis='y', labelsize=15)])
fig_num += 1

In [None]:
# One subplot per cutoff (5x2 grid), each showing one bar per district.
# A fixed district order and a fixed y range keep the panels comparable.
order = plot_df['district'].unique().tolist()
#order = sorted(plot_df['district'].unique().tolist())

fig,axs = plt.subplots(5, 2, num=fig_num, figsize=[28, 30], sharex=False, sharey=False) #14,6 for 1x1
plt.subplots_adjust(top=0.945, hspace=0.75)
axs = axs.flatten()
# The grid must have exactly one panel per distinct cutoff
assert(len(axs)==plot_df['pct_cutoff'].nunique())
for i,pct_cutoff in enumerate(plot_df['pct_cutoff'].unique()):
    # The cutoff is given in each panel title; the x axis shows districts, so
    #   the x label is 'District' rather than '% Cutouts Replaced'
    build_plot_for_erica(ax=axs[i], df=plot_df[plot_df['pct_cutoff']==pct_cutoff], x_col='district', y_col='value', hue=None, 
                         draw_legend=False, 
                         barplot_kwargs= dict(order=order, palette=colors_dict_districts), 
                         title_args=dict(label=f'{pct_cutoff}% Cutouts Replaced', fontsize=22.5), 
                         ylabel_args = dict(ylabel=f'% CMI', fontsize=20, x=0.0, y=0.8, ha='left', va='bottom'), 
                         xlabel_args = dict(xlabel='District', fontsize=20, x=0.9, y=0.0, ha='right', va='top'), 
                         tick_args=[dict(axis='x', labelrotation=45, labelsize=15), 
                                    dict(axis='y', labelsize=15)], 
                         ax_args = dict(ylim=[0,100]))
fig.suptitle('% CMI Avoided After Replacing Cutouts', fontsize=25, fontweight='bold');
fig_num += 1

In [None]:
def build_lineplot_for_erica(ax, df, x_col, y_col, hue=None, 
                         draw_legend=False, lineplot_kwargs=None, 
                         **kwargs):
    """Draw a seaborn line plot on `ax` and then apply the formatting options.

    Parameters
    ----------
    ax : Axes to draw on.
    df : DataFrame handed to sns.lineplot as `data`.
    x_col, y_col, hue : column names (hue may be None) handed to sns.lineplot.
    draw_legend : bool, forwarded to handle_and_apply_formattings_args.
    lineplot_kwargs : dict or None
        Extra arguments for sns.lineplot only.  A 'palette' entry (None) is
        added to this dict when it has none.
    **kwargs
        Formatting options (title_args, xlabel_args, ...) consumed by
        handle_and_apply_formattings_args.
    """
    #---------------------------
    lineplot_kwargs = {} if lineplot_kwargs is None else lineplot_kwargs
    # Make sure a palette entry is always present, defaulting to None
    lineplot_kwargs.setdefault('palette', None)
    #---------------------------
    sns.lineplot(data=df, x=x_col, y=y_col, hue=hue, ax=ax, **lineplot_kwargs)
    #---------------------------
    handle_and_apply_formattings_args(ax=ax, draw_legend=draw_legend, kwargs=kwargs)

In [None]:
# Line-plot view: cutoff on the x axis, one line per district.
# style='district' with markers=True gives each district its own marker/line style.
fig,ax = plt.subplots(1, 1, num=fig_num, figsize=[14, 6])
build_lineplot_for_erica(ax=ax, df=plot_df, x_col='pct_cutoff', y_col='value', hue='district', 
                     draw_legend=True, 
                     lineplot_kwargs= dict(markers=True, style='district'), 
                     title_args=dict(label=f'% CMI Avoided After Replacing Cutouts', fontsize=22.5), 
                     ylabel_args = dict(ylabel=f'% CMI', fontsize=20, x=0.0, y=0.8, ha='left', va='bottom'), 
                     xlabel_args = dict(xlabel='% Cutouts Replaced', fontsize=20, x=0.9, y=0.0, ha='right', va='top'), 
                     tick_args=[dict(axis='x', labelrotation=90, labelsize=15), 
                                dict(axis='y', labelsize=15)])
fig_num += 1