In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import scipy.special
from statsmodels.stats import weightstats as stests

import plotly.tools as tls
%matplotlib inline 

import plotly.graph_objs as go

import plotly as py
py.offline.init_notebook_mode()



import dash 
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Input

import plotly.figure_factory as ff

import pickle

In [None]:
# loading data: the tidy grade table plus the matching grade-probability vectors

# grade records; the 'date' column is parsed into datetimes
grades_final = pd.read_csv('msu_tidy_grades_final.csv', parse_dates = ['date'])

# numeric table read from text; presumably one row per row of grades_final and
# one column per entry of gpa_dist -- TODO confirm against the file
probability_lists = np.loadtxt('probability_lists.csv')

# store one probability vector per grade row (list() yields the rows of the
# loaded table, so its length has to equal the number of rows in grades_final)
grades_final['probability_lists'] = list(probability_lists)

# list of ((instructors, course_name), Series of probability vectors) pairs,
# one pair per instructor/course group
all_class_instruc = list(grades_final.groupby(['instructors', 'course_name']).probability_lists)




In [None]:
# grade-point values the bootstrap samples from; entry i is drawn with the
# i-th probability of a course's distribution

gpa_dist = np.array([4.0, 3.5, 3.0, 2.5, 2.0, 1.5, 1.0, 0.0])


In [None]:
# normalizing every course distribution


def normalize_gpa_dist(x):
    '''Average the grade distributions of every instructor/course group.

    x is a sequence of (key, series) pairs in which series holds one
    distribution array for each time the group appears. The arrays of a
    group are added element-wise and the total is divided by how many
    arrays there were.

    Returns a list of [key, averaged_distribution] entries in the same
    order as x.'''
    averaged = []
    for group in x:
        key, dists = group[0], group[1]

        # element-wise total of all distributions recorded for this group
        total = np.sum(dists.values)

        averaged.append([key, total / len(dists)])

    return averaged
        
# [(instructor, course_name) key, averaged distribution] for every instructor/course group
all_normalized_dists = normalize_gpa_dist(all_class_instruc)

In [None]:
# split the (instructor, course) key of every averaged distribution into two
# parallel arrays; position i of each array describes all_normalized_dists[i]

teacher_list = np.array([entry[0][0] for entry in all_normalized_dists])

course_list = np.array([entry[0][1] for entry in all_normalized_dists])

In [None]:
#  multi_teacher_index_test = np.where(teacher_list == ['A M SAEED'])


# target_teacher_prob_test = np.sum(np.array(all_normalized_dists)[multi_teacher_index_test][:,1])

# target_teacher_prob_test =  target_teacher_prob_test / len(multi_teacher_index_test[0])

In [None]:
# np.array(all_normalized_dists)[multi_teacher_index_test]

In [None]:
# this function returns the bootstrapped GPA for a given professor or for a given course

def _bootstrap_mean_gpa(prob, n_resamples, sample_size):
    '''Return (means, ci) for n_resamples random samples of sample_size grades.

    Every sample draws sample_size values from gpa_dist with replacement,
    using prob as the probability of each value. means holds the mean of
    every sample; ci is the 95% normal interval built from the mean and the
    standard deviation of those means.'''
    # a single vectorised draw replaces a Python loop of n_resamples calls
    draws = np.random.choice(gpa_dist, size = (n_resamples, sample_size), p = prob)
    boot_strapped_means = draws.mean(axis = 1)

    ci_bootstrap = stats.norm.interval(0.95,
                                       loc = np.mean(boot_strapped_means),
                                       scale = np.std(boot_strapped_means))

    return boot_strapped_means, ci_bootstrap


def mean_bootstrapper(teacher_input = None, course_input = None,
                      sample_size = 100, n_resamples = None):
    '''Bootstrap the mean GPA of a teacher, of a course, or of one teacher's
    section of a course.

    The distributions in all_normalized_dists whose key matches the inputs
    (exact, case-sensitive comparison against teacher_list / course_list)
    are averaged, random samples are drawn from that average and the mean
    of every sample is recorded.

    Parameters
    ----------
    teacher_input : instructor name to match, or None
    course_input : course name to match, or None
    sample_size : number of grades drawn per sample (default 100)
    n_resamples : number of samples; None keeps the historical defaults of
        10000 for a teacher-only or course-only query and 1000 when both
        are given

    Returns
    -------
    (boot_strapped_means, ci_bootstrap, meta) where meta is
      * the courses of the teacher          (teacher only)
      * the instructors of the course       (course only)
      * [teacher_input, course_input]       (both given)

    Raises
    ------
    ValueError if neither input is given or if nothing matches.'''

    # not entering a course nor a teacher
    if teacher_input is None and course_input is None:
        raise ValueError("Please enter at least a specific course or a specific teacher")

    if course_input is None:
        matches = teacher_list == teacher_input
        default_resamples = 10000
    elif teacher_input is None:
        matches = course_list == course_input
        default_resamples = 10000
    else:
        matches = (teacher_list == teacher_input) & (course_list == course_input)
        default_resamples = 1000

    # flatnonzero also copes with a comparison that collapsed to a scalar
    match_index = np.flatnonzero(matches)

    if len(match_index) == 0:
        raise ValueError("No grade distribution found for teacher_input=%r, course_input=%r"
                         % (teacher_input, course_input))

    # Average the matching distributions straight from the list; converting
    # all_normalized_dists to an array would mix 2-tuples with longer
    # vectors, which NumPy can only store as a ragged object array.
    # A teacher/course pair is a single group key, so for a combined query
    # this is the average of exactly one distribution.
    target_prob = np.mean([all_normalized_dists[i][1] for i in match_index], axis = 0)

    if n_resamples is None:
        n_resamples = default_resamples

    boot_strapped_means, ci_bootstrap = _bootstrap_mean_gpa(target_prob, n_resamples, sample_size)

    if course_input is None:
        # courses offered by the professor
        meta_data = course_list[match_index]
    elif teacher_input is None:
        # instructors who taught the course
        meta_data = teacher_list[match_index]
    else:
        meta_data = [teacher_input, course_input]

    return boot_strapped_means, ci_bootstrap, meta_data

        
        



In [None]:
## creating function that can generate data of lists of multiple teachers


def multi_mean_bootstrapper(teacher_input, course_input):
    '''Bootstrap the mean GPA of several teachers of one course.

    teacher_input is an iterable of instructor names; None (what an empty
    multi-select dropdown delivers) counts as no teachers and a single
    string counts as one teacher. course_input is the course they share.

    Returns a list with one [boot_strapped_means, ci_bootstrap,
    [teacher, course_input]] entry per teacher, in input order.

    Raises ValueError when course_input is None.'''
    if course_input is None:
        raise ValueError("Please enter a specific course")

    if teacher_input is None:
        return []

    # a bare string would otherwise be iterated character by character
    if isinstance(teacher_input, str):
        teacher_input = [teacher_input]

    # the combined teacher + course branch of mean_bootstrapper does exactly
    # the per-teacher work, so reuse it instead of keeping a second copy
    return [list(mean_bootstrapper(teacher_input = teacher, course_input = course_input))
            for teacher in teacher_input]

    
    

In [None]:
# smoke test: bootstrap two instructors of the same course
# (the course is passed as a one-element list here rather than as a plain string)
multi_mean_bootstrapper(['A CAREY', 'FREDERIK DERKSEN'], ['VM_547'])

# function is working


In [None]:
# bootstrapped means for MTH_132 pooled over every instructor who taught it
mth_132_means, mth_132_ci, mth_132_teachers = mean_bootstrapper(course_input="MTH_132")
# NOTE(review): a teacher-only lookup pools every course of this instructor,
# not only MTH_132 as the variable name suggests -- confirm this is intended
eric_mth_132_means, eric_ci, eric_courses = mean_bootstrapper(teacher_input='ERICK A VERLEYE')

In [None]:
# new matplotlib figure that both density curves below are drawn on
mth_132_fig = plt.figure() 

# shaded KDE (no histogram) of the bootstrapped means for the whole course
sns.distplot(mth_132_means, hist = False, kde = True, norm_hist= True,
            kde_kws = {'shade': True, 'linewidth': 3}, 
                  label = 'All MTH 132')

# same plot for the instructor's bootstrapped means
sns.distplot(eric_mth_132_means, hist = False, kde = True, norm_hist= True,
            kde_kws = {'shade': True, 'linewidth': 3}, 
                  label = 'MTH 132 Sec. 09')


# plt.axvline(yang_ci[0], color = 'r', linestyle = "dashed")
# plt.axvline(yang_ci[1], color = 'r', linestyle = "dashed")
plt.xlabel("Bootstrapped Mean GPA")
# hide the y ticks: only the position and spread of the curves matter here
plt.yticks([])
plt.title("MTH 132 Random Sampling Mean GPA")

# mean of the bootstrapped means of each group, rounded to two decimals
print("Random Sample MTH 132 Sec. 09 Mean GPA", np.round(np.mean(eric_mth_132_means),2))
print("Random Sample All MTH 132 Mean GPA", np.round(np.mean(mth_132_means), 2))


# sns.distplot(cmse_bootstrapped_means, hist = False, kde = True,
#                  kde_kws = {'shade': True, 'linewidth': 3}, 
#                   label = 'All CMSE')

In [None]:
# two sample ztest



# one-sided two-sample z-test on the two sets of bootstrapped means;
# alternative='smaller' tests whether
# mean(mth_132_means) - mean(eric_mth_132_means) is below 0
ztest ,pval1 = stests.ztest(
    mth_132_means, x2=eric_mth_132_means,
    value=0,alternative='smaller')

print(float(pval1))
if pval1<0.05:
    print("reject null hypothesis")
else:
    # a large p-value is a lack of evidence against the null hypothesis,
    # not evidence that it is true
    print("fail to reject null hypothesis")

In [None]:
# scratch check of stests.ztest on two tiny hand-made samples
# NOTE(review): this rebinds ztest and pval1 from the MTH 132 test and the
# result is never displayed -- confirm the cell is still wanted
ztest ,pval1 = stests.ztest(
    np.array([100,100,100]), x2=[100,100,99],
    value=0,alternative='two-sided')

In [None]:
# stylesheet URL handed to Dash through external_stylesheets
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']


# the Dash application that the layout and the callbacks attach to
app = dash.Dash(external_stylesheets= external_stylesheets)



# NOTE(review): presumably silences Dash's callback validation errors for
# ids it cannot find in the layout -- confirm it is still needed
app.config['suppress_callback_exceptions']=True

In [None]:
# making list of dictionaries for drop down menu
# course_list repeats a course once for every instructor who taught it, so
# de-duplicate first (np.unique also returns the names sorted); otherwise the
# dropdown would list the same course several times
course_dic_list = [{'label': str(course), 'value': str(course)}
                   for course in np.unique(course_list)]
    
# course_dic_list = sorted(course_dic_list, key=lambda k: k['label']) 

In [None]:
# loading all of msu_original Data
# (arrays stored in .npy files; only all_msu_bootstrap is referenced by the
# other code visible in this notebook)

# presumably the bootstrapped means over all of MSU -- it is plotted as the
# default distribution figure
all_msu_bootstrap = np.load('all_msu_bootstrap.npy')

all_msu_bar = np.load('all_msu_gpa_bar.npy')

all_msu_ci = np.load('all_msu_ci.npy')

In [None]:
# making default all msu histogram plot
# (fig is also the initial figure of the 'course_graph' component in app.layout)

# create_distplot takes one data series and one label per group
hist_data = [all_msu_bootstrap]
group_labels = ['All MSU']

# histogram with bins of width .02, no rug plot underneath
fig = ff.create_distplot(hist_data, group_labels, bin_size=.02, show_rug=False, show_hist=True)

fig['layout'].update(title='All MSU GPA Distribution')

# turning off the y axis decoration: no zero line, axis line, ticks or tick labels
fig['layout']['yaxis'].update(autorange=True,
        showgrid=True,
        zeroline=False,
        showline=False,
        ticks='',
        showticklabels=False)

fig['layout']['xaxis'].update(title='Bootstrapped Mean GPA')




# render the figure inline in the notebook
py.offline.iplot(fig)


In [None]:
# formatting app


# page layout: two labels, the course dropdown, the multi-select teacher
# dropdown and two graphs; the ids are the ones the callbacks refer to
app.layout = html.Div([
    html.Label('Choose a Course to Analyze'),
    html.Label("Choose Course's Teacher", style = dict(position = 'relative', left = '50%')),
    
    # one option per entry of course_dic_list
    dcc.Dropdown(id = "course_input_dropdown",
    options = course_dic_list, style = dict(width = "68%")),
    
    # starts with a single placeholder option; the fill_dropdown_menu callback
    # replaces the options
    dcc.Dropdown(id = "teacher_input_dropdown", multi = True,
    options = [{'label': 'Select a Course', 'value': 'Select A Course'}], style = dict(width = "68%")),
    
    # starts as the all-MSU figure and is redrawn by the course_fig callback
    dcc.Graph(id = 'course_graph', figure = fig),
    # starts empty and is filled by the multi_teacher_fig_bind callback
    dcc.Graph(id = 'teacher_graph', figure = {})])



In [None]:
# hist_data = [all_msu_bootstrap]
# group_labels = ['All MSU']

# fig2 = ff.create_distplot(hist_data, group_labels, bin_size=.02, show_rug=False, show_hist=True)

# fig2['layout'].update(title='All MSU GPA Distribution')

# fig2['layout']['yaxis'].update(autorange=True,
#         showgrid=True,
#         zeroline=False,
#         showline=False,
#         ticks='',
#         showticklabels=False)

# fig2['layout']['xaxis'].update(title='Bootstrapped Mean GPA')

In [None]:

# updating initial course figure

@app.callback(Output('course_graph', 'figure'),
              [Input('course_input_dropdown', 'value')])
def course_fig(update_value):
    '''Redraw 'course_graph' for the course picked in the course dropdown.

    update_value is the dropdown value: a course name, or None while no
    course is selected. Returns the distribution plot of the bootstrapped
    mean GPA of that course, titled with the course name (the teacher
    dropdown callback reads the course back from that title). With no
    selection the default all-MSU figure is returned.'''

    # Dash also runs the callback when the page loads and when the dropdown
    # is cleared; there is nothing to bootstrap then, so show the
    # module-level default figure instead of failing
    if update_value is None:
        return fig

    bs_data, bs_ci, meta_data = mean_bootstrapper(course_input = update_value)

    course_figure = ff.create_distplot([bs_data], [update_value],
                                       bin_size=.01, show_rug=False, show_hist=True)

    course_figure['layout'].update(title=update_value)

    # turning off the yaxis decoration
    course_figure['layout']['yaxis'].update(autorange=True, showgrid=True, zeroline=False,
                                            showline=False, ticks='', showticklabels=False)

    course_figure['layout']['xaxis'].update(title='Bootstrapped Mean GPA')

    return course_figure


# the (instructor, course) keys of all_normalized_dists as one array,
# row i belonging to all_normalized_dists[i]
all_normalized_dists_easy = np.array([entry[0] for entry in all_normalized_dists])


@app.callback(Output('teacher_input_dropdown', 'options'),
              [Input('course_graph', 'figure')])
def fill_dropdown_menu(update_teacher_value):
    '''Fill the teacher dropdown with the instructors of the plotted course.

    update_teacher_value is the figure currently shown in 'course_graph';
    the course name is read from its title. Returns one
    {'label': name, 'value': name} option for every row of
    all_normalized_dists_easy that contains the title, and an empty list
    when the figure has no title or nothing matches.'''

    # the title arrives either as {'text': ...} or as a plain string, and a
    # missing figure/layout/title must not crash the callback
    layout = (update_teacher_value or {}).get('layout') or {}
    title = layout.get('title')
    if isinstance(title, dict):
        title = title.get('text')
    if title is None:
        return []

    # rows of the key table in which the title appears
    matching_rows = np.where(all_normalized_dists_easy == str(title))[0]

    return [{'label': name, 'value': name} for name in teacher_list[matching_rows]]





In [None]:
@app.callback(
    Output('teacher_graph', 'figure'),
    [Input('teacher_input_dropdown', 'value'),
     Input('course_input_dropdown', 'value')])
def multi_teacher_fig_bind(teacher_input_dropdown, course_input_dropdown):
    '''Redraw 'teacher_graph': the course curve plus one curve per teacher.

    teacher_input_dropdown is the multi-select value (a list of names, or
    None while nothing is selected); course_input_dropdown is the selected
    course or None. Returns an empty figure while no course is selected,
    otherwise a density plot of the bootstrapped mean GPA of the whole
    course and of each selected teacher who has a distribution for it.'''

    # nothing to draw until a course is chosen (this is also the state in
    # which Dash runs the callback on page load)
    if course_input_dropdown is None:
        return {}

    # Keep only selections that belong to this course: the selection may
    # still hold teachers of a previously chosen course or the placeholder
    # option, for which no distribution exists.
    selected_teachers = [
        teacher for teacher in (teacher_input_dropdown or [])
        if np.any((teacher_list == teacher) & (course_list == course_input_dropdown))]

    bs_data, bs_ci, meta_data = mean_bootstrapper(course_input = course_input_dropdown)

    # generating bootstrap means and CI for all selected teachers
    per_teacher = multi_mean_bootstrapper(selected_teachers, course_input_dropdown)

    # every entry is [means, ci, [teacher, course]]
    curve_data = [bs_data] + [entry[0] for entry in per_teacher]
    curve_labels = [course_input_dropdown] + [entry[2][0] for entry in per_teacher]

    teach_fig = ff.create_distplot(curve_data, curve_labels,
                                   bin_size=.01, show_rug=False, show_hist=False)

    teach_fig['layout'].update(title= course_input_dropdown)

    # turning off the yaxis decoration
    teach_fig['layout']['yaxis'].update(autorange=True,
                                        showgrid=True, zeroline=False, showline=False, ticks='',
                                        showticklabels=False)

    teach_fig['layout']['xaxis'].update(title='Bootstrapped Mean GPA')

    return teach_fig

In [None]:
# start the Dash server only when this code runs as the main program
# NOTE(review): run_server presumably blocks until the server is stopped, so
# the cells after this one would only run once it is interrupted -- confirm
if __name__ == '__main__':
    app.run_server()

In [None]:
# draw a mean dotted dashed line down the middle for each graph 

# create a new drop down box to the left of instructor that draws individual distributions on the same graph

# have a better title so it displays the course description

# have a div tag that displays all of the past semesters the course was offered

# display a bar graph of the original distribution

# display confidence intervals

# sort inputs

# display tukey test matrix 

In [None]:
# create a function similar to course_fig that returns all the individual teacher graphs based on
# the teacher input dropdown

In [None]:
import time

# time one combined teacher + course bootstrap; the names are written in
# upper case like the earlier lookups because mean_bootstrapper compares
# them exactly, so the mixed-case spelling could not match
start = time.perf_counter()
mean_bootstrapper(course_input='MTH_132', teacher_input='ERICK A VERLEYE')
end = time.perf_counter()

print('total time', end - start)