In [39]:
import pandas as pd
import seaborn as sns
from collections import OrderedDict



In [40]:
# Load the processed United States county-level CSV from the 2020_03_29 folder.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../data/processed/2020_03_29/United_States_county.csv")

In [41]:
# Check column dtypes: summary_stats below only aggregates float64 columns.
df.dtypes

Region                  object
Retail & recreation    float64
Grocery & pharmacy     float64
Parks                  float64
Transit stations       float64
Workplaces             float64
Residential            float64
State                   object
dtype: object

### Create a 'Total' Column

In [42]:
# The six mobility categories that feed the per-row 'total'.
mobility_cols = ['Retail & recreation','Grocery & pharmacy','Parks','Transit stations','Workplaces','Residential']

# 100 * (number of non-null categories in the row): dividing the row sum by this
# yields the mean change across the reported categories, expressed as a fraction.
num_values = 100*df[mobility_cols].notnull().sum(axis=1)

# Sum ONLY the named mobility columns. Summing the whole frame would fold a
# previously computed 'total' back into itself when this cell is re-run, and
# would hit the object columns ('Region', 'State') on pandas versions that no
# longer drop non-numeric columns silently.
df['total'] = df[mobility_cols].sum(axis=1, skipna=True)/num_values
num_values

0       400
1       300
2       400
3       400
4       400
       ... 
2805    300
2806    300
2807    300
2808    300
2809    400
Length: 2810, dtype: int64

### Create a descriptive stats summary table

In [94]:
def summary_stats(df, state = True, ):
    """
    Creates summary stats for every float64 column of df.

    inputs:
        df : DataFrame containing mobility data. Must have a 'Region' column,
             and also a 'State' column when state is True.
        state : boolean, default = True. True if working with a df containing
                sub regions (provinces, states, etc), False if working with
                super regions (countries).

    outputs:
        DataFrame indexed by ['mean', 'median', 'max', 'min'] with one column
        per float64 column of df. The 'mean' and 'median' rows hold scalars.
        The 'max' and 'min' rows hold every row of df that attains the extreme:
            state True  -> a DataFrame slice with columns ['State', 'Region', col]
            state False -> a Series of "<Region>: <value>" strings
    """
    stats = ['mean', 'median', 'max', 'min']
    sum_data = OrderedDict()

    for col in df.columns:
        # Only float64 columns are summarised; label columns are skipped.
        if df.dtypes[col] != 'float64':
            continue

        values = df[col]
        col_max = values.max()
        col_min = values.min()
        # Boolean masks selecting every row tied for the extreme value.
        at_max = values == col_max
        at_min = values == col_min

        if state:
            maxim = df.loc[at_max, ['State', 'Region', col]]
            minin = df.loc[at_min, ['State', 'Region', col]]
        else:
            # Select by mask, not by using the max value itself as an index label.
            maxim = df.loc[at_max, 'Region'] + ': ' + str(col_max)
            minin = df.loc[at_min, 'Region'] + ': ' + str(col_min)

        sum_data[col] = [values.mean(), values.median(), maxim, minin]

    # Passing the index here keeps the four stat rows even when df has no float64 columns.
    return pd.DataFrame(sum_data, index=stats)



In [95]:
# Scratch check for one column: which State/Region rows hold the maximum, and what it is.
col = 'Parks'
col_peak = df[col].max()
maxim_label = tuple(df.loc[df[col] == col_peak, [label]] for label in ('State', 'Region'))
maxim_val = str(col_peak)


In [96]:
# Build the summary table with the default state=True, so the max/min cells
# hold ['State', 'Region', <column>] slices of df.
sum_df = summary_stats(df)
sum_df.head()

Unnamed: 0,Retail & recreation,Grocery & pharmacy,Parks,Transit stations,Workplaces,Residential,total
mean,-37.5904,-15.9872,-16.3728,-4.96491,-30.8457,10.7539,-0.217646
median,-38,-16,-31,-6,-31,12,-0.207083
max,State Region Retail & recreatio...,State Region Grocery & pharm...,State Region Parks 1942 Misso...,State Region Transit sta...,State Region Workplaces 2006...,State Region Residential ...,State Region total 1434 Tex...
min,State Region Retail & re...,State Region Grocery & pharma...,State Region Parks 892 Colora...,State Regi...,State Region Workplaces 2139 Ut...,State Region ...,State Region total 892 Colora...


In [140]:
# Texas rows whose 'Parks' value is exactly -40.
df[df['Parks'].eq(-40) & df['State'].eq('Texas')]

Unnamed: 0,Region,Retail & recreation,Grocery & pharmacy,Parks,Transit stations,Workplaces,Residential,State,total
1319,Archer County,,-40.0,-40.0,,-27.0,,Texas,-0.356667
1343,Carson County,-28.0,,-40.0,,-30.0,,Texas,-0.326667
1345,Castro County,-14.0,,-40.0,,-20.0,,Texas,-0.246667
1349,Clay County,-53.0,,-40.0,,-24.0,,Texas,-0.39
1350,Coke County,-21.0,,-40.0,,,-40.0,Texas,-0.336667
1359,Crockett County,-27.0,,-40.0,,,-40.0,Texas,-0.356667
1360,Crosby County,-15.0,,-40.0,,-35.0,,Texas,-0.3
1361,Culberson County,-46.0,,-40.0,,,-40.0,Texas,-0.42
1367,Delta County,,-40.0,-40.0,,-37.0,,Texas,-0.39
1370,Donley County,,-40.0,-40.0,,,-40.0,Texas,-0.4


In [142]:
# All rows, in any state, whose 'Grocery & pharmacy' value is exactly -40.
df[df['Grocery & pharmacy'].eq(-40)]

Unnamed: 0,Region,Retail & recreation,Grocery & pharmacy,Parks,Transit stations,Workplaces,Residential,State,total
1,Alexander County,,-40.0,-40.0,,-30.0,,Illinois,-0.366667
21,Edwards County,,-40.0,-40.0,,-39.0,,Illinois,-0.396667
31,Henderson County,,-40.0,-40.0,,-23.0,,Illinois,-0.343333
71,Pulaski County,,-40.0,-40.0,,-33.0,,Illinois,-0.376667
72,Putnam County,,-40.0,-40.0,,-38.0,,Illinois,-0.393333
...,...,...,...,...,...,...,...,...,...
2792,Kingsbury County,,-40.0,-40.0,,-27.0,,South Dakota,-0.356667
2796,McCook County,,-40.0,-40.0,,-24.0,,South Dakota,-0.346667
2799,Moody County,,-40.0,-40.0,,-27.0,,South Dakota,-0.356667
2803,Spink County,,-40.0,-40.0,,-28.0,,South Dakota,-0.360000
