In [89]:

import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import weightedstats as ws


# TODO: consider pygeostat for weighted statistics: http://www.ccgalberta.com/pygeostat/welcome.html



In [90]:

def load_data(data_file: str) -> pd.DataFrame:
    """
    Load a CSV file from the data directory into a DataFrame.

    The data directory is resolved as ``../data`` relative to the current
    working directory (i.e. wherever the notebook kernel is running), not
    relative to the location of this source file.

    Parameters
    ----------
    data_file : str
        Name (or relative path) of the CSV file inside the data directory.

    Returns
    -------
    pd.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    # pathlib.Path() with no arguments refers to the current working directory.
    data_dir = pathlib.Path().resolve().joinpath("../data").resolve()
    return pd.read_csv(data_dir.joinpath(data_file))



In [91]:
def map_values_std_dev(pdseries, power_max = 90.75, power_min = 59.25, scale_factor = 1):
    """
    Linearly rescale a Series so a band around its median lands on a target range.

    The band is ``median - std * scale_factor`` to ``median + std * scale_factor``.
    The lower edge of the band maps to ``power_min`` and the upper edge maps to
    ``power_max``; values outside the band are extrapolated on the same line
    (no clipping is applied).

    Parameters
    ----------
    pdseries : pandas Series of numeric values to rescale.
    power_max : target value for the upper edge of the band.
    power_min : target value for the lower edge of the band.
    scale_factor : half-width of the band, in standard deviations.

    Returns
    -------
    A Series with the same index as ``pdseries`` holding the rescaled values.
    """
    centre = pdseries.median()
    half_width = pdseries.std() * scale_factor

    band_low = centre - half_width
    band_high = centre + half_width

    # Position of each value inside the band: 0 at band_low, 1 at band_high.
    fraction = (pdseries - band_low) / (band_high - band_low)
    return fraction * (power_max - power_min) + power_min




In [92]:


# Read the master country table and use its "alpha3" column as the row index
df = load_data("country_data_master_interpolated.csv").set_index("alpha3")

# Pull the two columns explored below out into standalone Series
weights, Fitness = df["Average Weight"], df["percent_insufficient_activity"]


In [93]:
# Exploratory look at `weights` (the "Average Weight" column, indexed by alpha3).
# Histogram of the values, bucketed into 20 bins, rendered with plotly
fig = px.histogram(weights, nbins=20)
fig.show()

# Same values sorted ascending and drawn as a line, to show the shape of the distribution
fig = px.line(weights.sort_values(), title="Weight Distribution")
fig.show()

# Spot-check a single entry: the value stored under the index label "USA"
usa = weights.loc["USA"]
print(usa)

# Summary statistics (count, mean, std, min, quartiles, max)
print(weights.describe())



84.38914393643789
count    233.000000
mean      72.125386
std        9.060381
min       51.679256
25%       64.700525
50%       74.508285
75%       78.489492
max       98.809208
Name: Average Weight, dtype: float64


In [94]:

####
# Repeat the exploration above for `Fitness`.
# NOTE: `Fitness` holds the "percent_insufficient_activity" column, so a
# higher value here means a larger share of insufficiently active people.
####
# Histogram of the values, bucketed into 20 bins
fig = px.histogram(Fitness, nbins=20)
fig.show()

# Values sorted ascending and drawn as a line
fig = px.line(Fitness.sort_values(), title="Fitness Distribution")
fig.show()

# Spot-check the value stored under the index label "USA" (overwrites the earlier `usa`)
usa = Fitness.loc["USA"]
print(usa)

# Summary statistics (count, mean, std, min, quartiles, max)
print(Fitness.describe())


40.0
count    233.000000
mean      29.035988
std       10.387526
min        5.500000
25%       21.900000
50%       29.300000
75%       36.590000
max       67.000000
Name: percent_insufficient_activity, dtype: float64


In [95]:

# TODO: turn this cell into a function
# The sufficiently-active share is the complement of the insufficiently-active share
sufficient_activity = 100 - df["percent_insufficient_activity"]
df["percent_sufficient_activity"] = sufficient_activity

# Rescale so that median -/+ 3 standard deviations maps onto 59.25 .. 90.75
df["power"] = map_values_std_dev(
    sufficient_activity,
    power_max=90.75,
    power_min=59.25,
    scale_factor=3,
)


In [97]:

# Exploratory look at the derived "power" column of df.
# Histogram of the values, bucketed into 20 bins, rendered with plotly
fig = px.histogram(df["power"], nbins=20)
fig.show()

# Same values sorted ascending and drawn as a line
fig = px.line(df["power"].sort_values(), title="Power Distribution")
fig.show()

In [98]:
# Summary statistics for the "power" column; shown as the cell's output because it is the last expression
df["power"].describe()

count    233.000000
mean      75.133435
std        5.250000
min       55.945895
25%       71.315533
50%       75.000000
75%       78.740063
max       87.028851
Name: power, dtype: float64