In [2]:
import pandas as pd
import numpy as np
from bokeh.io import export_png
from bokeh.plotting import figure, output_notebook, show, output_file

We're going to track two trends:
1. The above/below trend
2. Which numbers are being selected more often than you would expect by chance

## Above/Below trend:

Classifying each row as above or below:

In [3]:
# Functions:
def above_below_40(input_list):
    """Classify a collection of numbers by how many of them exceed 40.

    Parameters
    ----------
    input_list : iterable of numbers
        The values to inspect.

    Returns
    -------
    str
        'over' when at least 12 values are greater than 40, 'under' when at
        most 8 are, and 'neither' for the counts in between (9, 10 or 11).
    """
    high_count = sum(1 for value in input_list if value > 40)
    if high_count <= 8:
        return 'under'
    if high_count >= 12:
        return 'over'
    return 'neither'
    
def update_dict(input_dict, input_value):
    """Append one 0/1 observation to every history list in ``input_dict``.

    Each key whose value appears in ``input_value`` gets a 1 appended; every
    other key gets a 0. Exactly one entry is appended per key per call, so all
    histories stay the same length even when ``input_value`` contains
    duplicates.

    Parameters
    ----------
    input_dict : dict
        Maps each possible option to its list of past 0/1 observations.
        Mutated in place.
    input_value : iterable
        The options observed this round. Every item must be a key of
        ``input_dict``.

    Returns
    -------
    dict
        The same ``input_dict`` object, after the update.

    Raises
    ------
    KeyError
        If ``input_value`` contains an item that is not a key of
        ``input_dict``. Nothing is modified in that case.
    """
    # A set removes duplicates (which would otherwise append several 1s to the
    # same key) and makes the membership test below constant-time.
    observed = set(input_value)

    # Validate before mutating so a bad value cannot leave a partial update.
    for each_value in observed:
        if each_value not in input_dict:
            raise KeyError(each_value)

    for each_option, history in input_dict.items():
        history.append(1 if each_option in observed else 0)
    return input_dict

def calc_prob_dict(input_list):
    """Return the running mean of ``input_list``.

    Element ``i`` of the result is the sum of the first ``i + 1`` inputs
    divided by ``i + 1``; for a 0/1 sequence this is the observed frequency
    of 1s so far.

    Parameters
    ----------
    input_list : sequence of numbers
        The observations, in order.

    Returns
    -------
    list of float
        One running-mean value per input element (empty for empty input).
    """
    running_totals = np.cumsum(input_list)
    observation_counts = np.arange(1, running_totals.size + 1)
    return (running_totals / observation_counts).tolist()

In [20]:
# For each community: read its spreadsheet, plot the running over/under/neither
# frequencies, plot the running selection frequency of every number 1-80, and
# write the 20 most frequently selected numbers to a text file.
communities = ['Omaha', 'Lincoln', 'Norfolk', 'Fremont']
for each_community in communities:
    community_slug = each_community.lower()

    #### OVER/UNDER REPRESENTATION:
    test_file = pd.read_excel('./data/input/%s.xlsx' % community_slug, index_col=0)

    # Materialise the rows once; both analyses below walk the same rows.
    all_rows = [list(each_row) for _, each_row in test_file.iterrows()]

    tracking_progress = [above_below_40(each_row) for each_row in all_rows]

    # Calculating trends - converting to ones and zeroes:
    over_under_dict = {'over': [],
                       'under': [],
                       'neither': []}
    for each_val in tracking_progress:
        over_under_dict = update_dict(over_under_dict, [each_val])

    # Converting to probabilities (running frequency of each category):
    prob_dict = {category: calc_prob_dict(history)
                 for category, history in over_under_dict.items()}

    # Plotting:
    output_file('./data/output/%s_top_vs_bottom.html' % community_slug)
    overunder_drunken_walk_fig = figure(toolbar_location=None, title='%s' % each_community)
    for category, line_color, label in (('over', 'red', 'Over'),
                                        ('under', 'blue', 'Under'),
                                        ('neither', 'black', 'Neither')):
        overunder_drunken_walk_fig.line(x=list(range(len(prob_dict[category]))),
                                        y=prob_dict[category],
                                        color=line_color, legend_label=label, line_width=4)
    show(overunder_drunken_walk_fig)

    #### MOST REPRESENTED NUMBERS:
    # Making the big number dictionary (one history list per number 1-80):
    number_dict = {each_number: [] for each_number in range(1, 81)}

    # Getting the use of each number
    for each_row in all_rows:
        number_dict = update_dict(number_dict, each_row)

    # Converting to probabilities:
    number_prob_dict = {each_number: calc_prob_dict(history)
                        for each_number, history in number_dict.items()}

    # Plotting numbers:
    output_file('./data/output/%s_top_numbers.html' % community_slug)
    top_rep_numbers_fig = figure(toolbar_location=None)
    for each_probs in number_prob_dict.values():
        top_rep_numbers_fig.line(x=list(range(len(each_probs))), y=each_probs,
                                 color='black', line_width=2)
    show(top_rep_numbers_fig)

    # Getting the mean of the last five values:
    final_number_prob_dict = {each_number: np.mean(each_probs[-5:])
                              for each_number, each_probs in number_prob_dict.items()}

    # Highest mean first; sorted() is stable, so ties keep ascending number order.
    top_numbers = sorted(final_number_prob_dict.items(),
                         key=lambda item: item[1], reverse=True)[:20]

    # Writing the top numbers to a file. The context manager closes the file
    # even if a write fails.
    # NOTE(review): unlike the HTML outputs, this file name uses the community
    # name as-is (not lower-cased) - confirm whether that is intentional.
    with open('./data/output/%s_top_numbers.txt' % each_community, 'w') as top_number_file:
        for each_number, each_mean in top_numbers:
            top_number_file.write(str(each_number) + '\t' + str(each_mean) + '\n')