### Analysis of Variance on broadband availability for regression feature selection


* All residential blocks in contiguous U.S. with complete data are considered, for year 2018
* Features are scored with the ANOVA F-statistic (called the "ANOVA correlation" in this notebook), which suits numerical inputs and a categorical output (see https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/)
* helpful visualization for ANOVA correlations, showing why higher is better: https://datascience.stackexchange.com/questions/74465/how-to-understand-anova-f-for-feature-selection-in-python-sklearn-selectkbest-w


#### Outline:  
   1. data import
   2. Analysis of Variance (ANOVA) on contiguous U.S.
   3. ANOVA on RUCA categories


#### 1. data import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest # for selecting variables based on criteria
from sklearn.feature_selection import f_classif # for ANOVA correlation

In [2]:
# Read the block-level CSV into a DataFrame, then show its schema summary and first rows.
csv_path='fcc477_2018_grouped_ruca_demographic_byblock_with_social_variables_with_DC.csv'
df=pd.read_csv(csv_path)
df.info()
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8254631 entries, 0 to 8254630
Columns: 101 entries, BlockCode to Violent Crimes p 100,000
dtypes: float64(69), int64(18), object(14)
memory usage: 6.2+ GB


Unnamed: 0,BlockCode,NumberOfUniqueProviderNames,NumberOfUniqueHocoNums,MeanMaxAdDown,MeanMaxAdUp,SdMaxAdDown,SdMaxAdUp,NumberOfUniqueTechCodes,GIDBG,State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/),...,% births to unmarried women,% women currently married,% children with single parent,"Non-religious non-profit organizations p 1,000","Religious congregations p 1,000",Informal Civic Engagement Subindex,"Presidential election voting rate, 2012 & 2016",Mail-back census response rate,Confidence in Institutions Subindex,"Violent Crimes p 100,000"
0,10010201001000,3,3,592.8,27.256,530.77792,25.056957,5,10010201001,1001020100,...,29.6,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7
1,10010201001001,1,1,940.0,35.0,,,1,10010201001,1001020100,...,29.6,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7
2,10010201001002,2,2,317.333333,12.008,539.245151,19.911656,3,10010201001,1001020100,...,29.6,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7
3,10010201001003,2,2,319.333333,12.008,537.521472,19.911656,3,10010201001,1001020100,...,29.6,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7
4,10010201001004,1,1,940.0,35.0,,,1,10010201001,1001020100,...,29.6,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7


Make categories based on RUCA code:

In [3]:
# Collapse the primary RUCA code into four groups; np.select assigns values[i] where conditions[i] holds
# (rows matching no condition would get np.select's default of 0).
ruca_code = df['Primary RUCA Code 2010']
conditions = [ruca_code.isin([1, 4, 7]),
              ruca_code.isin([2, 5, 8]),
              ruca_code.isin([3, 6, 9]),
              ruca_code.isin([10, 99])]
values = [1,2,3,4] # 1 = core, 2 = suburb, 3 = exurb, 4 = rural
df['ruca_cat'] = np.select(conditions, values)
df.head()

Unnamed: 0,BlockCode,NumberOfUniqueProviderNames,NumberOfUniqueHocoNums,MeanMaxAdDown,MeanMaxAdUp,SdMaxAdDown,SdMaxAdUp,NumberOfUniqueTechCodes,GIDBG,State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/),...,% women currently married,% children with single parent,"Non-religious non-profit organizations p 1,000","Religious congregations p 1,000",Informal Civic Engagement Subindex,"Presidential election voting rate, 2012 & 2016",Mail-back census response rate,Confidence in Institutions Subindex,"Violent Crimes p 100,000",ruca_cat
0,10010201001000,3,3,592.8,27.256,530.77792,25.056957,5,10010201001,1001020100,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
1,10010201001001,1,1,940.0,35.0,,,1,10010201001,1001020100,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
2,10010201001002,2,2,317.333333,12.008,539.245151,19.911656,3,10010201001,1001020100,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
3,10010201001003,2,2,319.333333,12.008,537.521472,19.911656,3,10010201001,1001020100,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
4,10010201001004,1,1,940.0,35.0,,,1,10010201001,1001020100,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1


Category distribution:

In [4]:
# distinct ruca_cat values and the number of blocks in each
np.unique(df['ruca_cat'].to_numpy(),return_counts=True)

(array([1, 2, 3, 4]), array([4632747, 2091370,  354439, 1176075]))

Filter out non-contiguous U.S. areas:

In [5]:
# Keep only rows whose state is neither Alaska nor Hawaii, then list the remaining states and their count.
non_conus_states=['AK','HI']
df_conus=df[~df['Select State'].isin(non_conus_states)]
conus_states=np.unique(df_conus['Select State'])
conus_states,len(conus_states)

(array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'IA',
        'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
        'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
        'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
        'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object),
 49)

Numerical x & y dataframe:

In [6]:
# Select the columns used downstream: the two availability targets (AnyProviderWith25/3, AnyProviderWith100/10),
# the other FCC provider/speed measures, the candidate predictor columns, and the derived ruca_cat.
df_conus_num=df_conus[['NumberOfUniqueProviderNames','NumberOfUniqueHocoNums',\
                       'NumberOfUniqueTechCodes','NumFCCEntriesWith25/3Speed','NumFCCEntriesWith100/10Speed',\
                       'AnyProviderWith25/3','AnyProviderWith100/10',
                       'MeanMaxAdDown','MeanMaxAdUp','Tot_Population_ACS_14_18', 'pct_URBANIZED_AREA_POP_CEN_2010',
       'pct_URBAN_CLUSTER_POP_CEN_2010', 'pct_RURAL_POP_CEN_2010',
       'avg_Tot_Prns_in_HHD_ACS_14_18', 'pct_Vacant_Units_ACS_14_18',
       'pct_Hispanic_ACS_14_18', 'pct_NH_White_alone_ACS_14_18',
       'pct_NH_Blk_alone_ACS_14_18', 'pct_NH_AIAN_alone_ACS_14_18',
       'pct_NH_Asian_alone_ACS_14_18', 'pct_NH_NHOPI_alone_ACS_14_18',
       'pct_NH_SOR_alone_ACS_14_18', 'pct_Othr_Lang_ACS_14_18',
       'pct_ENG_VW_ACS_14_18', 'pct_Not_HS_Grad_ACS_14_18',
       'pct_College_ACS_14_18', 'avg_Agg_HH_INC_ACS_14_18',
       'pct_Prs_Blw_Pov_Lev_ACS_14_18', 'pct_PUB_ASST_INC_ACS_14_18',
       'pct_Diff_HU_1yr_Ago_ACS_14_18', 'pct_Recent_Built_HU_ACS_14_18',
       'avg_Agg_House_Value_ACS_14_18','Primary RUCA Code 2010','Secondary RUCA Code, 2010 (see errata)', 'Tract Population, 2010',
       'Land Area (square miles), 2010',
       'Population Density (per square mile), 2010','rank','index','mobility_raw','pct_belowpov_raw',\
                      'pct_deeppov_raw','life_exp_raw','lbw_raw','urban','rural','reservation','city',\
                      'tot_pop','pct_lths','lfpr','pct_bachmore','County-Level Index', \
                       'Requiring all 4 Subindices', 'Excluding Collective Efficacy', 'Family Unity', \
                       'Community Health', 'Institutional Health', 'Collective Efficacy', \
                       '% births to unmarried women', '% women currently married', '% children with single parent', \
                       'Non-religious non-profit organizations p 1,000', 'Religious congregations p 1,000', \
                       'Informal Civic Engagement Subindex', 'Presidential election voting rate, 2012 & 2016', \
                       'Mail-back census response rate', 'Confidence in Institutions Subindex', 'Violent Crimes p 100,000','ruca_cat']]
# 'teenbirthrate' is not selected above: the cast below was tried and abandoned because the column contains '' entries.
# df_conus_num['teenbirthrate']=df_conus_num['teenbirthrate'].astype(float)  #doesn't work because has ''
# Print the column list, dtypes and memory footprint of the selection.
df_conus_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8232809 entries, 0 to 8254630
Data columns (total 70 columns):
 #   Column                                          Dtype  
---  ------                                          -----  
 0   NumberOfUniqueProviderNames                     int64  
 1   NumberOfUniqueHocoNums                          int64  
 2   NumberOfUniqueTechCodes                         int64  
 3   NumFCCEntriesWith25/3Speed                      int64  
 4   NumFCCEntriesWith100/10Speed                    int64  
 5   AnyProviderWith25/3                             int64  
 6   AnyProviderWith100/10                           int64  
 7   MeanMaxAdDown                                   float64
 8   MeanMaxAdUp                                     float64
 9   Tot_Population_ACS_14_18                        float64
 10  pct_URBANIZED_AREA_POP_CEN_2010                 float64
 11  pct_URBAN_CLUSTER_POP_CEN_2010                  float64
 12  pct_RURAL_POP_CEN_2010      

drop rows with incomplete data:

In [7]:
# Listwise deletion: drop every row that has a missing value in any selected column
# (a fair number of rows remain afterwards).
df_num_2=df_conus_num.dropna(axis=0,how='any')
df_num_2

Unnamed: 0,NumberOfUniqueProviderNames,NumberOfUniqueHocoNums,NumberOfUniqueTechCodes,NumFCCEntriesWith25/3Speed,NumFCCEntriesWith100/10Speed,AnyProviderWith25/3,AnyProviderWith100/10,MeanMaxAdDown,MeanMaxAdUp,Tot_Population_ACS_14_18,...,% women currently married,% children with single parent,"Non-religious non-profit organizations p 1,000","Religious congregations p 1,000",Informal Civic Engagement Subindex,"Presidential election voting rate, 2012 & 2016",Mail-back census response rate,Confidence in Institutions Subindex,"Violent Crimes p 100,000",ruca_cat
0,3,3,5,3,3,1,1,592.800000,27.256000,636.0,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
1,1,1,1,1,1,1,1,940.000000,35.000000,636.0,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
2,2,2,3,1,1,1,1,317.333333,12.008000,636.0,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
3,2,2,3,1,1,1,1,319.333333,12.008000,636.0,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
4,1,1,1,1,1,1,1,940.000000,35.000000,636.0,...,65.2,22.4,2.2,1.9,-1.032341,60.5,78.0,0.208587,265.7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8254626,1,1,1,1,0,1,0,50.000000,5.000000,1375.0,...,82.5,23.1,8.0,2.5,0.414574,62.4,65.0,0.423832,143.4,1
8254627,1,1,1,0,0,0,0,10.000000,1.000000,1375.0,...,82.5,23.1,8.0,2.5,0.414574,62.4,65.0,0.423832,143.4,1
8254628,3,3,2,3,0,1,0,35.000000,8.333333,1375.0,...,82.5,23.1,8.0,2.5,0.414574,62.4,65.0,0.423832,143.4,1
8254629,1,1,1,0,0,0,0,10.000000,1.000000,1375.0,...,82.5,23.1,8.0,2.5,0.414574,62.4,65.0,0.423832,143.4,1


a list of all potential x variables:

In [8]:
# Candidate predictor (x) columns. Compared with the df_conus_num selection, this list leaves out the
# FCC provider/speed columns (including both AnyProviderWith* targets) and 'city'.
all_x_li=['Tot_Population_ACS_14_18', 'pct_URBANIZED_AREA_POP_CEN_2010',
       'pct_URBAN_CLUSTER_POP_CEN_2010', 'pct_RURAL_POP_CEN_2010',
       'avg_Tot_Prns_in_HHD_ACS_14_18', 'pct_Vacant_Units_ACS_14_18',
       'pct_Hispanic_ACS_14_18', 'pct_NH_White_alone_ACS_14_18',
       'pct_NH_Blk_alone_ACS_14_18', 'pct_NH_AIAN_alone_ACS_14_18',
       'pct_NH_Asian_alone_ACS_14_18', 'pct_NH_NHOPI_alone_ACS_14_18',
       'pct_NH_SOR_alone_ACS_14_18', 'pct_Othr_Lang_ACS_14_18',
       'pct_ENG_VW_ACS_14_18', 'pct_Not_HS_Grad_ACS_14_18',
       'pct_College_ACS_14_18', 'avg_Agg_HH_INC_ACS_14_18',
       'pct_Prs_Blw_Pov_Lev_ACS_14_18', 'pct_PUB_ASST_INC_ACS_14_18',
       'pct_Diff_HU_1yr_Ago_ACS_14_18', 'pct_Recent_Built_HU_ACS_14_18',
       'avg_Agg_House_Value_ACS_14_18','Primary RUCA Code 2010','Secondary RUCA Code, 2010 (see errata)', 'Tract Population, 2010',
       'Land Area (square miles), 2010',
       'Population Density (per square mile), 2010','rank','index','mobility_raw','pct_belowpov_raw',\
                      'pct_deeppov_raw','life_exp_raw','lbw_raw','urban','rural','reservation',\
                      'tot_pop','pct_lths','lfpr','pct_bachmore','County-Level Index', \
                       'Requiring all 4 Subindices', 'Excluding Collective Efficacy', 'Family Unity', \
                       'Community Health', 'Institutional Health', 'Collective Efficacy', \
                       '% births to unmarried women', '% women currently married', '% children with single parent', \
                       'Non-religious non-profit organizations p 1,000', 'Religious congregations p 1,000', \
                       'Informal Civic Engagement Subindex', 'Presidential election voting rate, 2012 & 2016', \
                       'Mail-back census response rate', 'Confidence in Institutions Subindex', 'Violent Crimes p 100,000','ruca_cat']

Make x and y dataframe, and reset index so they match:

In [9]:
# Build the feature frame and the two binary targets from the complete-case rows.
# reset_index(drop=True) returns a new object with a fresh 0..n-1 index, so nothing sliced
# from df_num_2 is modified in place (avoids SettingWithCopyWarning and keeps the cell re-runnable).
x=df_num_2[all_x_li].reset_index(drop=True)
y1=df_num_2['AnyProviderWith25/3'].reset_index(drop=True)    # target 1
y2=df_num_2['AnyProviderWith100/10'].reset_index(drop=True)  # target 2
# all three indexes should be the same RangeIndex
x.index,y1.index,y2.index

(RangeIndex(start=0, stop=7990776, step=1),
 RangeIndex(start=0, stop=7990776, step=1),
 RangeIndex(start=0, stop=7990776, step=1))

scale data to have mean = 0 and std = 1:

In [10]:
# Standardize every x column, then wrap the resulting array back into a DataFrame with the original column names.
scaler = StandardScaler()
x_scaled = pd.DataFrame(scaler.fit_transform(x), columns=all_x_li)
x_scaled.head()

Unnamed: 0,Tot_Population_ACS_14_18,pct_URBANIZED_AREA_POP_CEN_2010,pct_URBAN_CLUSTER_POP_CEN_2010,pct_RURAL_POP_CEN_2010,avg_Tot_Prns_in_HHD_ACS_14_18,pct_Vacant_Units_ACS_14_18,pct_Hispanic_ACS_14_18,pct_NH_White_alone_ACS_14_18,pct_NH_Blk_alone_ACS_14_18,pct_NH_AIAN_alone_ACS_14_18,...,% women currently married,% children with single parent,"Non-religious non-profit organizations p 1,000","Religious congregations p 1,000",Informal Civic Engagement Subindex,"Presidential election voting rate, 2012 & 2016",Mail-back census response rate,Confidence in Institutions Subindex,"Violent Crimes p 100,000",ruca_cat
0,-0.718103,0.728437,-0.433958,-0.461839,-0.897132,-1.072048,-0.510827,0.35896,-0.061952,-0.178966,...,0.17741,-0.982957,-1.032835,0.254016,-0.89155,0.167864,0.553171,0.377297,-0.186864,-0.706168
1,-0.718103,0.728437,-0.433958,-0.461839,-0.897132,-1.072048,-0.510827,0.35896,-0.061952,-0.178966,...,0.17741,-0.982957,-1.032835,0.254016,-0.89155,0.167864,0.553171,0.377297,-0.186864,-0.706168
2,-0.718103,0.728437,-0.433958,-0.461839,-0.897132,-1.072048,-0.510827,0.35896,-0.061952,-0.178966,...,0.17741,-0.982957,-1.032835,0.254016,-0.89155,0.167864,0.553171,0.377297,-0.186864,-0.706168
3,-0.718103,0.728437,-0.433958,-0.461839,-0.897132,-1.072048,-0.510827,0.35896,-0.061952,-0.178966,...,0.17741,-0.982957,-1.032835,0.254016,-0.89155,0.167864,0.553171,0.377297,-0.186864,-0.706168
4,-0.718103,0.728437,-0.433958,-0.461839,-0.897132,-1.072048,-0.510827,0.35896,-0.061952,-0.178966,...,0.17741,-0.982957,-1.032835,0.254016,-0.89155,0.167864,0.553171,0.377297,-0.186864,-0.706168


#### 2. ANOVA on contiguous U.S.


Define two helper functions that compute the ANOVA F-score ("ANOVA correlation") of every x column against y. The first returns only the k columns with the highest scores; the second returns those k columns together with the scores of all columns and the indices that sort the scores in ascending order:

In [11]:
def anova_df(x,y,k):
    """Return the k columns of x selected by SelectKBest with the ANOVA F-test (f_classif).

    x : DataFrame of candidate features (columns are picked positionally with .iloc)
    y : class labels, one per row of x
    k : number of features to keep
    """
    selector=SelectKBest(score_func=f_classif,k=k)
    selector.fit(x,y)
    # integer positions of the selected columns
    keep_idx=selector.get_support(indices=True)
    return x.iloc[:,keep_idx]

In [12]:
def anova_df_2(x,y,k):
    """Select the k best columns of x by ANOVA F-test and also report every column's score.

    x : DataFrame of candidate features (columns are picked positionally with .iloc)
    y : class labels, one per row of x
    k : number of features to keep

    Returns a 3-tuple:
      - DataFrame with the k selected columns of x,
      - array of F-scores, one per column of x,
      - np.argsort of those scores (column positions in ascending score order).
    """
    selector=SelectKBest(score_func=f_classif,k=k)
    selector.fit(x,y)
    keep_idx=selector.get_support(indices=True)
    all_scores=selector.scores_
    ascending_order=np.argsort(all_scores)
    return x.iloc[:,keep_idx],all_scores,ascending_order

run ANOVA on the scaled x dataframe, with y1 (AnyProviderWith25/3) first:

In [13]:
# Score all columns against y1 and keep the 4 best features.
x_4_res=anova_df_2(x_scaled,y1,k=4)
x_4=x_4_res[0] # DataFrame holding only the selected columns
x_4.head()

Unnamed: 0,pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010,Primary RUCA Code 2010,ruca_cat
0,0.728437,-0.461839,-0.806122,-0.706168
1,0.728437,-0.461839,-0.806122,-0.706168
2,0.728437,-0.461839,-0.806122,-0.706168
3,0.728437,-0.461839,-0.806122,-0.706168
4,0.728437,-0.461839,-0.806122,-0.706168


In [14]:
# x_4_scores: ANOVA correlations of every column
# x_4_i: index that would sort the correlations in ascending order
x_4_scores,x_4_i=x_4_res[1:]
x_4_scores,x_4_i

(array([7.18958834e+04, 9.03520410e+05, 3.79445202e+04, 1.30407217e+06,
        1.04832009e+03, 3.41839954e+05, 3.30230974e+04, 3.93989943e+04,
        3.85124397e+04, 2.31209416e+05, 1.26804754e+05, 2.80366801e+03,
        1.19455040e+04, 2.91188559e+04, 1.21040718e+04, 3.84651363e+04,
        2.39357361e+05, 1.19750577e+05, 1.99629585e+04, 4.60826146e+03,
        6.31312356e+04, 5.67598556e+02, 4.57903328e+04, 7.27778624e+05,
        7.26824225e+05, 7.87470112e+04, 5.20386010e+05, 2.97383642e+05,
        2.99072968e+04, 4.46983052e+04, 1.23532312e+05, 1.29459695e+05,
        6.92611259e+04, 2.27316503e+05, 4.71414909e+02, 5.97964070e+05,
        5.97964070e+05, 2.42983453e+04, 1.44295219e+05, 9.07476456e+04,
        3.31988574e+05, 4.55857047e+05, 4.31878826e+03, 4.31878826e+03,
        3.87593061e+03, 6.71838984e+03, 1.86561860e+05, 1.29595956e+05,
        7.26616925e+04, 2.83635934e+04, 8.04993615e+00, 4.22839399e+03,
        2.67734165e+04, 6.22325314e+05, 8.98384782e+03, 8.249486

In [21]:
# order the scores ascending via x_4_i, then reverse to get them in descending order
scores_desc=x_4_scores[x_4_i][::-1]
scores_desc

array([1.30407217e+06, 9.03520410e+05, 7.66266937e+05, 7.27778624e+05,
       7.26824225e+05, 6.22325314e+05, 5.97964070e+05, 5.97964070e+05,
       5.20386010e+05, 4.55857047e+05, 3.41839954e+05, 3.31988574e+05,
       3.08228237e+05, 2.97383642e+05, 2.39357361e+05, 2.31209416e+05,
       2.27316503e+05, 1.86561860e+05, 1.44295219e+05, 1.29595956e+05,
       1.29459695e+05, 1.26804754e+05, 1.23532312e+05, 1.19750577e+05,
       9.07476456e+04, 8.24948697e+04, 7.87470112e+04, 7.26616925e+04,
       7.26616920e+04, 7.18958834e+04, 6.92611259e+04, 6.31312356e+04,
       4.57903328e+04, 4.46983052e+04, 3.93989943e+04, 3.85124397e+04,
       3.84651363e+04, 3.79445202e+04, 3.30230974e+04, 2.99072968e+04,
       2.91188559e+04, 2.83635934e+04, 2.67734165e+04, 2.42983453e+04,
       1.99629585e+04, 1.21040718e+04, 1.19455040e+04, 9.72904243e+03,
       8.98384782e+03, 6.71838984e+03, 4.60826146e+03, 4.31878826e+03,
       4.31878826e+03, 4.22839399e+03, 3.87593061e+03, 2.80366801e+03,
      

In [22]:
# column names matching scores_desc: highest correlation first, lowest last
cols_desc=list(x_scaled.columns[x_4_i[::-1]])
cols_desc

['pct_RURAL_POP_CEN_2010',
 'pct_URBANIZED_AREA_POP_CEN_2010',
 'ruca_cat',
 'Primary RUCA Code 2010',
 'Secondary RUCA Code, 2010 (see errata)',
 'Religious congregations p 1,000',
 'urban',
 'rural',
 'Land Area (square miles), 2010',
 'pct_bachmore',
 'pct_Vacant_Units_ACS_14_18',
 'lfpr',
 'Mail-back census response rate',
 'Population Density (per square mile), 2010',
 'pct_College_ACS_14_18',
 'pct_NH_AIAN_alone_ACS_14_18',
 'life_exp_raw',
 'Community Health',
 'tot_pop',
 'Institutional Health',
 'pct_belowpov_raw',
 'pct_NH_Asian_alone_ACS_14_18',
 'mobility_raw',
 'avg_Agg_HH_INC_ACS_14_18',
 'pct_lths',
 'Presidential election voting rate, 2012 & 2016',
 'Tract Population, 2010',
 'Collective Efficacy',
 'Violent Crimes p 100,000',
 'Tot_Population_ACS_14_18',
 'pct_deeppov_raw',
 'pct_Diff_HU_1yr_Ago_ACS_14_18',
 'avg_Agg_House_Value_ACS_14_18',
 'index',
 'pct_NH_White_alone_ACS_14_18',
 'pct_NH_Blk_alone_ACS_14_18',
 'pct_Not_HS_Grad_ACS_14_18',
 'pct_URBAN_CLUSTER_POP_CE

In [26]:
# pip install dataframe_image

Make a dataframe to show variables and their ANOVA correlations side by side, in descending order:

In [27]:
# dataframe_image is only needed for the optional image export that is commented out below.
import dataframe_image as dfi
# Pair every variable name with its score; both inputs are already in descending score order.
res_df=pd.DataFrame({'Variable':cols_desc,
        'ANOVA corr':scores_desc})
# Optional: export the table as an image (relative path instead of a machine-specific absolute one).
# dfi.export(res_df, 'conus_anova_corr.png')
res_df.head() 

Unnamed: 0,Variable,ANOVA corr
0,pct_RURAL_POP_CEN_2010,1304072.0
1,pct_URBANIZED_AREA_POP_CEN_2010,903520.4
2,ruca_cat,766266.9
3,Primary RUCA Code 2010,727778.6
4,"Secondary RUCA Code, 2010 (see errata)",726824.2


**contiguous U.S. result, y1:**

In [88]:
# Optional: save the ranking to disk.
# res_df.to_csv('conus_anova_corr.csv')
# Show the ranking of the candidate variables for y1, highest score first.
res_df

Unnamed: 0,Variable,ANOVA corr
0,pct_RURAL_POP_CEN_2010,1304072.0
1,pct_URBANIZED_AREA_POP_CEN_2010,903520.4
2,ruca_cat,766266.9
3,Primary RUCA Code 2010,727778.6
4,"Secondary RUCA Code, 2010 (see errata)",726824.2
5,"Religious congregations p 1,000",622325.3
6,urban,597964.1
7,rural,597964.1
8,"Land Area (square miles), 2010",520386.0
9,pct_bachmore,455857.0


In [20]:
# Same selection against y1, now keeping the 10 best features.
x_10_res=anova_df_2(x_scaled,y1,k=10)
x_10=x_10_res[0]
x_10.head() # the selected columns should match the top 10 rows of the ranking above

Unnamed: 0,pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Land Area (square miles), 2010",urban,rural,pct_bachmore,"Religious congregations p 1,000",ruca_cat
0,0.728437,-0.461839,-0.806122,-0.806349,-0.326735,0.774281,-0.774281,-0.080054,0.254016,-0.706168
1,0.728437,-0.461839,-0.806122,-0.806349,-0.326735,0.774281,-0.774281,-0.080054,0.254016,-0.706168
2,0.728437,-0.461839,-0.806122,-0.806349,-0.326735,0.774281,-0.774281,-0.080054,0.254016,-0.706168
3,0.728437,-0.461839,-0.806122,-0.806349,-0.326735,0.774281,-0.774281,-0.080054,0.254016,-0.706168
4,0.728437,-0.461839,-0.806122,-0.806349,-0.326735,0.774281,-0.774281,-0.080054,0.254016,-0.706168


In [21]:
# scores of every column and their ascending sort order, taken from the k=10 run
x_10_scores,x_10_i=x_10_res[1:]
x_10_scores,x_10_i

(array([7.18958834e+04, 9.03520410e+05, 3.79445202e+04, 1.30407217e+06,
        1.04832009e+03, 3.41839954e+05, 3.30230974e+04, 3.93989943e+04,
        3.85124397e+04, 2.31209416e+05, 1.26804754e+05, 2.80366801e+03,
        1.19455040e+04, 2.91188559e+04, 1.21040718e+04, 3.84651363e+04,
        2.39357361e+05, 1.19750577e+05, 1.99629585e+04, 4.60826146e+03,
        6.31312356e+04, 5.67598556e+02, 4.57903328e+04, 7.27778624e+05,
        7.26824225e+05, 7.87470112e+04, 5.20386010e+05, 2.97383642e+05,
        2.99072968e+04, 4.46983052e+04, 1.23532312e+05, 1.29459695e+05,
        6.92611259e+04, 2.27316503e+05, 4.71414909e+02, 5.97964070e+05,
        5.97964070e+05, 2.42983453e+04, 1.44295219e+05, 9.07476456e+04,
        3.31988574e+05, 4.55857047e+05, 4.31878826e+03, 4.31878826e+03,
        3.87593061e+03, 6.71838984e+03, 1.86561860e+05, 1.29595956e+05,
        7.26616925e+04, 2.83635934e+04, 8.04993615e+00, 4.22839399e+03,
        2.67734165e+04, 6.22325314e+05, 8.98384782e+03, 8.249486

The ANOVA correlations are identical no matter which k (number of best features) we pass in, so there is no need to rerun the selection for every k.  
Below is a way to order the columns so that each column to the right has a higher correlation than the one to its left:

In [30]:
# x_4_scores==x_10_scores, x_4_i==x_10_i # identical, no need to run anova for each k
# first 3 rows of the 10 highest-scoring columns, ordered from low to high score
x_scaled.head(3).iloc[:,x_10_i[-10:]]

Unnamed: 0,pct_bachmore,"Land Area (square miles), 2010",rural,urban,"Religious congregations p 1,000","Secondary RUCA Code, 2010 (see errata)",Primary RUCA Code 2010,ruca_cat,pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010
0,-0.080054,-0.326735,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
1,-0.080054,-0.326735,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
2,-0.080054,-0.326735,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839


In [33]:
# the 6 highest-scoring columns, lowest score on the left
top_6_idx=x_10_i[-6:]
x_6=x_scaled.iloc[:,top_6_idx]
x_6.head()

Unnamed: 0,"Religious congregations p 1,000","Secondary RUCA Code, 2010 (see errata)",Primary RUCA Code 2010,ruca_cat,pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010
0,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
1,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
2,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
3,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
4,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839


In [34]:
# the 8 highest-scoring columns, lowest score on the left
top_8_idx=x_10_i[-8:]
x_8=x_scaled.iloc[:,top_8_idx]
x_8.head()

Unnamed: 0,rural,urban,"Religious congregations p 1,000","Secondary RUCA Code, 2010 (see errata)",Primary RUCA Code 2010,ruca_cat,pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010
0,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
1,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
2,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
3,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839
4,-0.774281,0.774281,0.254016,-0.806349,-0.806122,-0.706168,0.728437,-0.461839


Repeat for y2 (AnyProviderWith100/10):

In [29]:
# y2: score every column against AnyProviderWith100/10
x_res_y2=anova_df_2(x_scaled,y2,k=4)
# scores of every column and the index that sorts them in ascending order
x_scores_y2,x_i_y2=x_res_y2[1:]
x_scores_y2,x_i_y2

(array([9.08131493e+04, 1.78873317e+06, 4.72198279e+04, 2.55482889e+06,
        2.25919087e+02, 3.91994199e+05, 2.14702854e+04, 1.00325187e+05,
        1.11593340e+05, 1.60444361e+05, 2.18502024e+05, 2.96347296e+03,
        2.09144938e+04, 5.24788609e+04, 1.76943230e+04, 4.43511353e+04,
        3.83173983e+05, 1.38021839e+05, 5.01975608e+03, 1.08653151e+04,
        1.05533087e+05, 6.41775208e+01, 5.16274761e+04, 1.19353533e+06,
        1.19208167e+06, 9.82729965e+04, 5.69334970e+05, 5.74818641e+05,
        4.48987699e+03, 8.70278307e+03, 3.30830350e+05, 1.08660343e+05,
        4.54896131e+04, 2.25526195e+05, 2.95090110e+04, 8.30190240e+05,
        8.30190240e+05, 1.57302043e+04, 2.81002402e+05, 1.19133522e+05,
        3.45292643e+05, 7.26739964e+05, 5.36474647e+04, 5.36474647e+04,
        7.89174880e+03, 6.30650632e+03, 3.53743683e+05, 9.61148840e+04,
        1.59808411e+05, 7.56247732e+03, 4.10007887e+04, 5.82171455e+03,
        6.76928106e+04, 8.68920411e+05, 4.86124482e+04, 1.053126

**contiguous U.S. result, y2:**

In [32]:
# Rank the variables for y2: scores and column names, both from highest to lowest score.
scores_desc_y2=x_scores_y2[x_i_y2][::-1]
cols_desc_y2=list(x_scaled.columns[x_i_y2[::-1]])
res_df_y2=pd.DataFrame({'Variable':cols_desc_y2,'ANOVA corr':scores_desc_y2})
# Optional image export (needs dataframe_image imported as dfi):
# dfi.export(res_df_y2, 'conus_anova_corr_y2.png')
res_df_y2

Unnamed: 0,Variable,ANOVA corr
0,pct_RURAL_POP_CEN_2010,2554829.0
1,pct_URBANIZED_AREA_POP_CEN_2010,1788733.0
2,ruca_cat,1370581.0
3,Primary RUCA Code 2010,1193535.0
4,"Secondary RUCA Code, 2010 (see errata)",1192082.0
5,"Religious congregations p 1,000",868920.4
6,urban,830190.2
7,rural,830190.2
8,pct_bachmore,726740.0
9,"Population Density (per square mile), 2010",574818.6


In [33]:
# res_df_y2.to_csv('conus_anova_corr_y2.csv')

#### 3. ANOVA on RUCA categories  
Now we do the same analysis for each RUCA category (core, suburb, exurb, rural) and see if the results are different:  
I used 1 to indicate core, 2 for suburb, 3 for exurb, 4 for rural

In [36]:
# first by ruca category
rc1=df_num_2[df_num_2['ruca_cat']==1] #core
rc2=df_num_2[df_num_2['ruca_cat']==2] #suburb
rc3=df_num_2[df_num_2['ruca_cat']==3] #exurb
rc4=df_num_2[df_num_2['ruca_cat']==4] #rural
len(rc1),len(rc2),len(rc3),len(rc4)

(4587762, 2037480, 341765, 1023769)

create x and y dataframes for each category and reset index:

In [37]:
# Feature frames per RUCA category (their index is replaced when they are scaled and re-wrapped later).
x_rc1=rc1[all_x_li]
x_rc2=rc2[all_x_li]
x_rc3=rc3[all_x_li]
x_rc4=rc4[all_x_li]
# Targets per category. reset_index(drop=True) returns a new Series with a 0..n-1 index instead of
# mutating a Series sliced from rc1..rc4 in place (avoids SettingWithCopyWarning, keeps the cell re-runnable).
y1_rc1=rc1['AnyProviderWith25/3'].reset_index(drop=True)
y2_rc1=rc1['AnyProviderWith100/10'].reset_index(drop=True)
y1_rc2=rc2['AnyProviderWith25/3'].reset_index(drop=True)
y2_rc2=rc2['AnyProviderWith100/10'].reset_index(drop=True)
y1_rc3=rc3['AnyProviderWith25/3'].reset_index(drop=True)
y2_rc3=rc3['AnyProviderWith100/10'].reset_index(drop=True)
y1_rc4=rc4['AnyProviderWith25/3'].reset_index(drop=True)
y2_rc4=rc4['AnyProviderWith100/10'].reset_index(drop=True)

scale all the x dataframes for each category:

In [38]:
# Standardize each category on its own (fit_transform refits the scaler on every call),
# wrapping each result back into a DataFrame with the original column names.
scaler = StandardScaler()
x_rc1_scaled = pd.DataFrame(scaler.fit_transform(x_rc1),columns=all_x_li)
x_rc2_scaled = pd.DataFrame(scaler.fit_transform(x_rc2),columns=all_x_li)
x_rc3_scaled = pd.DataFrame(scaler.fit_transform(x_rc3),columns=all_x_li)
x_rc4_scaled = pd.DataFrame(scaler.fit_transform(x_rc4),columns=all_x_li)
# element-wise check that features and targets of each category share the same index
(x_rc1_scaled.index==y1_rc1.index,
 x_rc2_scaled.index==y1_rc2.index,
 x_rc3_scaled.index==y1_rc3.index,
 x_rc4_scaled.index==y1_rc4.index)

(array([ True,  True,  True, ...,  True,  True,  True]),
 array([ True,  True,  True, ...,  True,  True,  True]),
 array([ True,  True,  True, ...,  True,  True,  True]),
 array([ True,  True,  True, ...,  True,  True,  True]))

Run ANOVA correlations on core area:

In [39]:
# Core (rc1): ANOVA against the 25/3 target, asking for the 4 best features.
x_rc1_res = anova_df_2(x_rc1_scaled, y1_rc1, 4)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_4_rc1, x_rc1_scores, x_rc1_i = x_rc1_res[0], x_rc1_res[1], x_rc1_res[2]
x_4_rc1.head()

  f = msb / msw


Unnamed: 0,pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010,"Secondary RUCA Code, 2010 (see errata)","Land Area (square miles), 2010"
0,0.139383,0.455694,-0.540137,-0.146724
1,0.139383,0.455694,-0.540137,-0.146724
2,0.139383,0.455694,-0.540137,-0.146724
3,0.139383,0.455694,-0.540137,-0.146724
4,0.139383,0.455694,-0.540137,-0.146724


In [40]:
# Show the per-feature ANOVA scores for the core subset together with the index array returned with them.
x_rc1_scores,x_rc1_i # scores, and index for sorting in ascending order
# The NaN score belongs to ruca_cat (the RUCA category): it takes a single value
# within any one category, so its score is undefined and comes out as NaN.

(array([2.93655933e+03, 2.34983597e+05, 2.39491810e+04, 3.64106641e+05,
        7.58569344e+02, 5.48335892e+04, 1.85414052e+01, 4.98944866e+03,
        4.41968644e+03, 3.25559513e+04, 2.30219347e+04, 4.06611839e+02,
        1.79216693e+03, 4.66847811e+03, 1.65143891e+03, 9.23756232e+03,
        3.60260249e+04, 1.79703904e+04, 3.52222105e+03, 1.07090330e+03,
        4.38066839e+02, 1.49099905e+03, 1.05019286e+04, 2.17325683e+05,
        2.17362850e+05, 3.53331454e+01, 2.26648598e+05, 7.36512278e+04,
        9.05859886e+03, 1.23131515e+04, 3.56757164e+04, 2.99259762e+04,
        1.96121666e+04, 4.86152257e+04, 6.71276247e+02, 1.76838353e+05,
        1.76838353e+05, 1.06202937e+04, 2.52929220e+04, 3.10432365e+04,
        6.57624086e+04, 9.66087659e+04, 1.08900245e+02, 1.08900245e+02,
        5.19013615e+03, 2.10516289e+03, 4.05752704e+04, 4.54796026e+04,
        6.78907608e+03, 6.89137258e+03, 6.65758394e+01, 3.03388203e+03,
        4.15888134e+03, 1.69439362e+05, 2.97343475e+03, 3.797892

In [41]:
# Best 6 features for the core subset, ordered from lower score (left) to higher (right).
# Features with a NaN score (e.g. ruca_cat) are filtered out by value instead of
# assuming that exactly one NaN sits at the end of the sort index.
x_6_rc1 = x_rc1_scaled.iloc[:, [i for i in x_rc1_i if not np.isnan(x_rc1_scores[i])][-6:]]
x_6_rc1.head()

Unnamed: 0,urban,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Land Area (square miles), 2010",pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010
0,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
1,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
2,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
3,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
4,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694


In [42]:
# Best 8 features for the core subset, ordered from lower score (left) to higher (right).
# NaN-scored features are excluded by value rather than by a hard-coded slice end.
x_8_rc1 = x_rc1_scaled.iloc[:, [i for i in x_rc1_i if not np.isnan(x_rc1_scores[i])][-8:]]
x_8_rc1.head()

Unnamed: 0,"Religious congregations p 1,000",rural,urban,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Land Area (square miles), 2010",pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010
0,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
1,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
2,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
3,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
4,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694


In [43]:
# Best 10 features for the core subset, ordered from lower score (left) to higher (right).
# NaN-scored features are excluded by value rather than by a hard-coded slice end.
x_10_rc1 = x_rc1_scaled.iloc[:, [i for i in x_rc1_i if not np.isnan(x_rc1_scores[i])][-10:]]
x_10_rc1.head()

Unnamed: 0,Mail-back census response rate,pct_bachmore,"Religious congregations p 1,000",rural,urban,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Land Area (square miles), 2010",pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010
0,0.529322,-0.394264,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
1,0.529322,-0.394264,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
2,0.529322,-0.394264,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
3,0.529322,-0.394264,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694
4,0.529322,-0.394264,0.935834,-0.504483,0.504483,-0.538984,-0.540137,-0.146724,0.139383,0.455694


construct a dataframe that lists each variable next to its ANOVA F-score (the 'ANOVA corr' column), sorted from highest to lowest:  
**core result:**

In [47]:
# Core subset, 25/3 target: every feature with its ANOVA score, highest first.
scores_desc_rc1 = np.flip(x_rc1_scores[x_rc1_i])
cols_desc_rc1 = list(x_rc1_scaled.columns[x_rc1_i])
cols_desc_rc1.reverse()
# Features with an undefined (NaN) score, such as ruca_cat, are dropped by value
# instead of slicing off a hard-coded number of leading rows.
res_df_rc1 = pd.DataFrame({'Variable': cols_desc_rc1,
                           'ANOVA corr': scores_desc_rc1}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc1, 'ruca_core_anova_corr.png')
# res_df_rc1.to_csv('ruca_core_anova_corr.csv')
res_df_rc1

Unnamed: 0,Variable,ANOVA corr
1,pct_RURAL_POP_CEN_2010,364106.641416
2,pct_URBANIZED_AREA_POP_CEN_2010,234983.596836
3,"Land Area (square miles), 2010",226648.598132
4,"Secondary RUCA Code, 2010 (see errata)",217362.850407
5,Primary RUCA Code 2010,217325.683398
6,urban,176838.352925
7,rural,176838.352925
8,"Religious congregations p 1,000",169439.361888
9,pct_bachmore,96608.765903
10,Mail-back census response rate,79888.649973


repeat for suburb category:

In [48]:
# Suburb (rc2): ANOVA against the 25/3 target, asking for the 4 best features.
x_rc2_res = anova_df_2(x_rc2_scaled, y1_rc2, 4)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_4_rc2, x_rc2_scores, x_rc2_i = x_rc2_res[0], x_rc2_res[1], x_rc2_res[2]
x_4_rc2.head()

  f = msb / msw


Unnamed: 0,pct_RURAL_POP_CEN_2010,"Land Area (square miles), 2010","Religious congregations p 1,000",Mail-back census response rate
0,-0.216133,-0.349854,0.042894,0.518921
1,-0.216133,-0.349854,0.042894,0.518921
2,-0.216133,-0.349854,0.042894,0.518921
3,-0.216133,-0.349854,0.042894,0.518921
4,-0.216133,-0.349854,0.042894,0.518921


In [49]:
# x_rc2_scores,x_rc2_i # need to remove last index (from ruca cat)

In [50]:
# Best 10 features for the suburb subset, ordered from lower score (left) to higher (right).
# NaN-scored features are excluded by value rather than by a hard-coded slice end.
x_10_rc2 = x_rc2_scaled.iloc[:, [i for i in x_rc2_i if not np.isnan(x_rc2_scores[i])][-10:]]
x_10_rc2.head()

Unnamed: 0,"Secondary RUCA Code, 2010 (see errata)",Primary RUCA Code 2010,pct_bachmore,lfpr,Institutional Health,pct_NH_AIAN_alone_ACS_14_18,"Religious congregations p 1,000",Mail-back census response rate,"Land Area (square miles), 2010",pct_RURAL_POP_CEN_2010
0,-0.71695,-0.71643,0.251263,0.058532,0.464704,-0.201115,0.042894,0.518921,-0.349854,-0.216133
1,-0.71695,-0.71643,0.251263,0.058532,0.464704,-0.201115,0.042894,0.518921,-0.349854,-0.216133
2,-0.71695,-0.71643,0.251263,0.058532,0.464704,-0.201115,0.042894,0.518921,-0.349854,-0.216133
3,-0.71695,-0.71643,0.251263,0.058532,0.464704,-0.201115,0.042894,0.518921,-0.349854,-0.216133
4,-0.71695,-0.71643,0.251263,0.058532,0.464704,-0.201115,0.042894,0.518921,-0.349854,-0.216133


In [None]:
# can get x_6, x_8 easily

**suburb result:**

In [52]:
# Suburb subset, 25/3 target: every feature with its ANOVA score, highest first.
scores_desc_rc2 = np.flip(x_rc2_scores[x_rc2_i])
cols_desc_rc2 = list(x_rc2_scaled.columns[x_rc2_i])
cols_desc_rc2.reverse()
# Features with an undefined (NaN) score are dropped by value instead of slicing
# off a hard-coded number of leading rows.
res_df_rc2 = pd.DataFrame({'Variable': cols_desc_rc2,
                           'ANOVA corr': scores_desc_rc2}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc2, 'ruca_suburb_anova_corr.png')
# res_df_rc2.to_csv('ruca_suburb_anova_corr.csv')
res_df_rc2

Unnamed: 0,Variable,ANOVA corr
1,pct_RURAL_POP_CEN_2010,68165.024077
2,"Land Area (square miles), 2010",66929.685478
3,Mail-back census response rate,60773.987977
4,"Religious congregations p 1,000",57228.353077
5,pct_NH_AIAN_alone_ACS_14_18,55830.133463
6,Institutional Health,51311.259277
7,lfpr,44342.764646
8,pct_bachmore,43257.119219
9,Primary RUCA Code 2010,41846.944409
10,"Secondary RUCA Code, 2010 (see errata)",41813.140155


repeat for exurb category:

In [53]:
# Exurb (rc3): ANOVA against the 25/3 target, asking for the 4 best features.
x_rc3_res = anova_df_2(x_rc3_scaled, y1_rc3, 4)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_4_rc3, x_rc3_scores, x_rc3_i = x_rc3_res[0], x_rc3_res[1], x_rc3_res[2]
x_4_rc3.head()

  f = msb / msw


Unnamed: 0,pct_NH_AIAN_alone_ACS_14_18,pct_belowpov_raw,pct_deeppov_raw,lfpr
0,-0.205765,-0.737204,-0.624973,0.064204
1,-0.205765,-0.737204,-0.624973,0.064204
2,-0.205765,-0.737204,-0.624973,0.064204
3,-0.205765,-0.737204,-0.624973,0.064204
4,-0.205765,-0.737204,-0.624973,0.064204


In [54]:
# Show the per-feature ANOVA scores for the exurb subset and the index array
# returned with them (presumably the ascending sort order — confirm in anova_df_2).
x_rc3_scores,x_rc3_i

(array([2.83921678e+03, 4.99754520e+02, 5.03430919e+03, 5.50422160e+03,
        4.62241763e+02, 3.15325174e+03, 1.36110848e+03, 5.33445435e+03,
        2.05372156e+03, 1.22187891e+04, 5.46269460e+02, 1.73630398e+01,
        3.97272166e+02, 3.17448657e+03, 2.73353829e+03, 5.22914994e+03,
        5.79651763e+03, 8.69824925e+03, 7.13725687e+03, 1.65474738e+02,
        1.11551255e+03, 1.88838162e+00, 3.41156813e+03, 4.25577182e+02,
        4.25577182e+02, 3.96817075e+03, 4.55049680e+03, 3.70070156e+03,
        7.81805483e+03, 8.99268488e+03, 3.12503876e+02, 1.13123271e+04,
        1.24390722e+04, 3.47749553e+03, 5.41432685e+03, 1.90425164e+03,
        1.90425164e+03, 2.15228153e+03, 2.30494485e+02, 3.26057502e+03,
        9.16645352e+03, 4.47468458e+03, 2.88442345e+03, 2.88442345e+03,
        3.88227630e+03, 3.93462445e+03, 1.50014212e+01, 4.33598342e+03,
        1.78291487e+02, 9.04301852e+02, 5.65775861e+03, 3.09902131e+03,
        2.69178114e+03, 8.45608951e+03, 4.03812236e+02, 1.569449

In [55]:
# Best 10 features for the exurb subset, ordered from lower score (left) to higher (right).
# NaN-scored features are excluded by value rather than by a hard-coded slice end.
x_10_rc3 = x_rc3_scaled.iloc[:, [i for i in x_rc3_i if not np.isnan(x_rc3_scores[i])][-10:]]
x_10_rc3.head()

Unnamed: 0,pct_College_ACS_14_18,pct_Prs_Blw_Pov_Lev_ACS_14_18,rank,"Religious congregations p 1,000",avg_Agg_HH_INC_ACS_14_18,index,lfpr,pct_belowpov_raw,pct_NH_AIAN_alone_ACS_14_18,pct_deeppov_raw
0,0.413788,-0.119464,0.351685,-0.668572,-1.45849,0.346571,0.064204,-0.737204,-0.205765,-0.624973
1,0.413788,-0.119464,0.351685,-0.668572,-1.45849,0.346571,0.064204,-0.737204,-0.205765,-0.624973
2,0.413788,-0.119464,0.351685,-0.668572,-1.45849,0.346571,0.064204,-0.737204,-0.205765,-0.624973
3,0.413788,-0.119464,0.351685,-0.668572,-1.45849,0.346571,0.064204,-0.737204,-0.205765,-0.624973
4,0.413788,-0.119464,0.351685,-0.668572,-1.45849,0.346571,0.064204,-0.737204,-0.205765,-0.624973


**exurb result:**

In [57]:
# Exurb subset, 25/3 target: every feature with its ANOVA score, highest first.
scores_desc_rc3 = np.flip(x_rc3_scores[x_rc3_i])
cols_desc_rc3 = list(x_rc3_scaled.columns[x_rc3_i])
cols_desc_rc3.reverse()
# Features with an undefined (NaN) score are dropped by value instead of slicing
# off a hard-coded number of leading rows.
res_df_rc3 = pd.DataFrame({'Variable': cols_desc_rc3,
                           'ANOVA corr': scores_desc_rc3}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc3, 'ruca_exurb_anova_corr.png')
# res_df_rc3.to_csv('ruca_exurb_anova_corr.csv')
res_df_rc3

Unnamed: 0,Variable,ANOVA corr
1,pct_deeppov_raw,12439.072249
2,pct_NH_AIAN_alone_ACS_14_18,12218.78907
3,pct_belowpov_raw,11312.327061
4,lfpr,9166.453516
5,index,8992.684884
6,avg_Agg_HH_INC_ACS_14_18,8698.249251
7,"Religious congregations p 1,000",8456.08951
8,rank,7818.054833
9,pct_Prs_Blw_Pov_Lev_ACS_14_18,7137.256873
10,pct_College_ACS_14_18,5796.517631


repeat for rural category:

In [58]:
# Rural (rc4): ANOVA against the 25/3 target, asking for the 4 best features.
x_rc4_res = anova_df_2(x_rc4_scaled, y1_rc4, 4)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_4_rc4, x_rc4_scores, x_rc4_i = x_rc4_res[0], x_rc4_res[1], x_rc4_res[2]
x_4_rc4.head()

  f = msb / msw


Unnamed: 0,Requiring all 4 Subindices,Excluding Collective Efficacy,Institutional Health,Confidence in Institutions Subindex
0,-1.604751,-1.509892,-0.486549,-0.007762
1,-1.604751,-1.509892,-0.486549,-0.007762
2,-1.604751,-1.509892,-0.486549,-0.007762
3,-1.604751,-1.509892,-0.486549,-0.007762
4,-1.604751,-1.509892,-0.486549,-0.007762


In [59]:
# Show the per-feature ANOVA scores for the rural subset and the index array
# returned with them (presumably the ascending sort order — confirm in anova_df_2).
x_rc4_scores,x_rc4_i

(array([3.66161238e-01, 1.81463737e+02, 1.34585672e+03, 1.50276026e+03,
        1.13512755e+04, 3.56099488e+03, 4.69239223e+03, 2.32235420e+04,
        1.51601519e+03, 1.77612832e+04, 2.13797483e+03, 1.62090493e+01,
        2.07676186e+02, 1.89137082e+04, 7.28779238e+03, 8.41504892e+03,
        7.61832272e+03, 1.64601459e+04, 1.45043480e+04, 2.93780278e+01,
        2.06054350e+03, 6.38552386e+01, 2.56655312e+03,            nan,
        1.07892068e+00, 3.54645874e+02, 1.81924053e+04, 1.79463745e+04,
        1.69861278e+04, 2.08545720e+04, 9.26738226e+03, 1.98291430e+04,
        1.40608954e+04, 1.60489792e+04, 7.95735403e+03, 2.54374299e+03,
        2.54374299e+03, 2.80514056e+03, 1.10729067e+03, 1.21434764e+04,
        1.46601310e+04, 5.78349372e+03, 2.98217562e+04, 2.98217562e+04,
        3.31825561e+04, 1.36507336e+04, 6.69088302e+03, 4.56042561e+04,
        6.12078692e+03, 4.36336193e+03, 1.49803335e+04, 1.02667780e+04,
        7.78493825e+03, 6.98443930e+02, 1.35227367e+04, 1.872538

In [62]:
# Best 10 features for the rural subset, ordered from lower score (left) to higher (right).
# This subset has more than one NaN score, so NaN-scored features are filtered
# out by value instead of hand-counting how many entries to slice off the end.
x_10_rc4 = x_rc4_scaled.iloc[:, [i for i in x_rc4_i if not np.isnan(x_rc4_scores[i])][-10:]]
x_10_rc4.head()

Unnamed: 0,"Presidential election voting rate, 2012 & 2016",pct_Othr_Lang_ACS_14_18,pct_belowpov_raw,index,pct_NH_White_alone_ACS_14_18,Requiring all 4 Subindices,County-Level Index,Excluding Collective Efficacy,Confidence in Institutions Subindex,Institutional Health
0,-0.703898,-0.134837,1.986277,-2.251683,-1.342512,-1.604751,-1.604751,-1.509892,-0.007762,-0.486549
1,-0.703898,-0.134837,1.986277,-2.251683,-1.342512,-1.604751,-1.604751,-1.509892,-0.007762,-0.486549
2,-0.703898,-0.134837,1.986277,-2.251683,-1.342512,-1.604751,-1.604751,-1.509892,-0.007762,-0.486549
3,-0.703898,-0.134837,1.986277,-2.251683,-1.342512,-1.604751,-1.604751,-1.509892,-0.007762,-0.486549
4,-0.703898,-0.134837,1.986277,-2.251683,-1.342512,-1.604751,-1.604751,-1.509892,-0.007762,-0.486549


**rural result:**

In [64]:
# Rural subset, 25/3 target: every feature with its ANOVA score, highest first.
scores_desc_rc4 = np.flip(x_rc4_scores[x_rc4_i])
cols_desc_rc4 = list(x_rc4_scaled.columns[x_rc4_i])
cols_desc_rc4.reverse()
# Features with an undefined (NaN) score are dropped by value, so the cell no
# longer depends on hand-counting how many NaN rows lead the table.
res_df_rc4 = pd.DataFrame({'Variable': cols_desc_rc4,
                           'ANOVA corr': scores_desc_rc4}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc4, 'ruca_rural_anova_corr.png')
# res_df_rc4.to_csv('ruca_rural_anova_corr.csv')
res_df_rc4

Unnamed: 0,Variable,ANOVA corr
2,Institutional Health,45604.256102
3,Confidence in Institutions Subindex,34260.283314
4,Excluding Collective Efficacy,33182.556115
5,County-Level Index,29821.756172
6,Requiring all 4 Subindices,29821.756172
7,pct_NH_White_alone_ACS_14_18,23223.541951
8,index,20854.572037
9,pct_belowpov_raw,19829.143025
10,pct_Othr_Lang_ACS_14_18,18913.708155
11,"Presidential election voting rate, 2012 & 2016",18725.382955


In [49]:
# Mean ANOVA score per RUCA category (core, suburb, exurb, rural), ignoring NaN scores.
tuple(
    np.nanmean(category_scores)
    for category_scores in (x_rc1_scores, x_rc2_scores, x_rc3_scores, x_rc4_scores)
)

(46221.38597736124, 22360.8489690902, 3556.966283982469, 10677.341447566228)

In [None]:
# on average, the ANOVA F-scores are highest for rc1 (core), followed by rc2 (suburb), rc4 (rural), and rc3 (exurb)

Repeat for all categories for y2 (AnyProviderWith100/10):

In [65]:
# Core (rc1) again, now against the 100/10 target (y2), asking for the 10 best features.
x_rc1_res_y2 = anova_df_2(x_rc1_scaled, y2_rc1, 10)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_10_rc1_y2, x_rc1_scores_y2, x_rc1_i_y2 = x_rc1_res_y2[0], x_rc1_res_y2[1], x_rc1_res_y2[2]
x_10_rc1_y2.head()

  f = msb / msw


Unnamed: 0,pct_URBANIZED_AREA_POP_CEN_2010,pct_RURAL_POP_CEN_2010,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Land Area (square miles), 2010","Population Density (per square mile), 2010",urban,rural,pct_bachmore,"Religious congregations p 1,000"
0,0.139383,0.455694,-0.538984,-0.540137,-0.146724,-0.524173,0.504483,-0.504483,-0.394264,0.935834
1,0.139383,0.455694,-0.538984,-0.540137,-0.146724,-0.524173,0.504483,-0.504483,-0.394264,0.935834
2,0.139383,0.455694,-0.538984,-0.540137,-0.146724,-0.524173,0.504483,-0.504483,-0.394264,0.935834
3,0.139383,0.455694,-0.538984,-0.540137,-0.146724,-0.524173,0.504483,-0.504483,-0.394264,0.935834
4,0.139383,0.455694,-0.538984,-0.540137,-0.146724,-0.524173,0.504483,-0.504483,-0.394264,0.935834


In [66]:
# Show the per-feature ANOVA scores for the core subset (100/10 target) and the
# index array returned with them (presumably the ascending sort order — confirm in anova_df_2).
x_rc1_scores_y2,x_rc1_i_y2

(array([7.44027467e+02, 4.22683376e+05, 4.85125780e+04, 6.26742192e+05,
        2.28306484e+02, 5.45783287e+04, 4.76936836e+03, 4.23012215e+03,
        1.35413910e+04, 4.69561319e+04, 3.72200891e+04, 3.97398366e+02,
        3.26838324e+03, 2.11148291e+03, 2.25440792e+02, 2.27935558e+04,
        6.61696095e+04, 2.63816173e+04, 4.60617175e+03, 6.01909250e+02,
        9.52577706e+00, 5.73347686e+03, 1.56753613e+04, 3.59817631e+05,
        3.60489560e+05, 4.18403957e+03, 2.31889333e+05, 1.40348242e+05,
        1.05075896e+04, 1.27773806e+04, 6.49262527e+04, 4.96447317e+04,
        3.08849198e+04, 6.15605125e+04, 1.65619060e+03, 2.23142209e+05,
        2.23142209e+05, 1.49641264e+04, 5.58365231e+04, 6.48568600e+04,
        1.05821519e+05, 1.73349898e+05, 4.45190025e+01, 4.45190025e+01,
        4.54219999e+03, 3.94139151e+00, 5.28474991e+04, 6.73677388e+04,
        1.15001376e+04, 5.72627037e+03, 7.05814433e+03, 1.37943868e+02,
        1.69630130e+03, 2.34788804e+05, 7.61687233e+03, 7.295682

**core result, y2:**

In [70]:
# Core subset, 100/10 target: every feature with its ANOVA score, highest first.
scores_desc_rc1_y2 = np.flip(x_rc1_scores_y2[x_rc1_i_y2])
cols_desc_rc1_y2 = list(x_rc1_scaled.columns[x_rc1_i_y2])
cols_desc_rc1_y2.reverse()
# Features with an undefined (NaN) score are dropped by value instead of slicing
# off a hard-coded number of leading rows.
res_df_rc1_y2 = pd.DataFrame({'Variable': cols_desc_rc1_y2,
                              'ANOVA corr': scores_desc_rc1_y2}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc1_y2, 'ruca_core_anova_corr_y2.png')
# res_df_rc1_y2.to_csv('ruca_core_anova_corr_y2.csv')
res_df_rc1_y2

Unnamed: 0,Variable,ANOVA corr
1,pct_RURAL_POP_CEN_2010,626742.191612
2,pct_URBANIZED_AREA_POP_CEN_2010,422683.375775
3,"Secondary RUCA Code, 2010 (see errata)",360489.56013
4,Primary RUCA Code 2010,359817.630824
5,"Religious congregations p 1,000",234788.803807
6,"Land Area (square miles), 2010",231889.332685
7,rural,223142.209231
8,urban,223142.209231
9,pct_bachmore,173349.897889
10,"Population Density (per square mile), 2010",140348.241511


In [71]:
# Suburb (rc2) against the 100/10 target (y2), asking for the 10 best features.
x_rc2_res_y2 = anova_df_2(x_rc2_scaled, y2_rc2, 10)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_10_rc2_y2, x_rc2_scores_y2, x_rc2_i_y2 = x_rc2_res_y2[0], x_rc2_res_y2[1], x_rc2_res_y2[2]
x_10_rc2_y2.head()

  f = msb / msw


Unnamed: 0,pct_URBANIZED_AREA_POP_CEN_2010,pct_URBAN_CLUSTER_POP_CEN_2010,pct_RURAL_POP_CEN_2010,"Land Area (square miles), 2010","Population Density (per square mile), 2010",urban,rural,pct_bachmore,"Religious congregations p 1,000",Mail-back census response rate
0,1.074533,-0.404258,-0.216133,-0.349854,-0.153644,0.919662,-0.919662,0.251263,0.042894,0.518921
1,1.074533,-0.404258,-0.216133,-0.349854,-0.153644,0.919662,-0.919662,0.251263,0.042894,0.518921
2,1.074533,-0.404258,-0.216133,-0.349854,-0.153644,0.919662,-0.919662,0.251263,0.042894,0.518921
3,1.074533,-0.404258,-0.216133,-0.349854,-0.153644,0.919662,-0.919662,0.251263,0.042894,0.518921
4,1.074533,-0.404258,-0.216133,-0.349854,-0.153644,0.919662,-0.919662,0.251263,0.042894,0.518921


In [72]:
# Show the per-feature ANOVA scores for the suburb subset (100/10 target) and the
# index array returned with them (presumably the ascending sort order — confirm in anova_df_2).
x_rc2_scores_y2,x_rc2_i_y2

(array([3.00834716e+04, 4.47554983e+04, 7.11360477e+04, 1.27333045e+05,
        2.95953959e+03, 2.93767653e+04, 1.08747549e+04, 1.92240572e+04,
        7.56291029e+02, 3.13794950e+04, 1.18317316e+04, 6.33992236e+01,
        8.13329309e+02, 1.99383400e+04, 1.42384071e+04, 9.92341309e+03,
        2.05880493e+04, 8.44845539e+03, 4.30800514e+03, 6.73894012e+02,
        3.75191640e+03, 2.69630626e-01, 2.47202930e+03, 2.89635062e+04,
        2.88867152e+04, 2.47917569e+04, 6.15442487e+04, 5.89889569e+04,
        2.40561143e+03, 2.93357730e+03, 1.71118392e+04, 1.59889123e+04,
        1.06265357e+04, 8.93533174e+03, 3.30727313e+02, 3.34252341e+04,
        3.34252341e+04, 6.35694799e+03, 8.45194672e+03, 1.87445943e+04,
        1.30533792e+04, 3.24576843e+04, 2.86339822e+03, 2.86339822e+03,
        4.68936575e+03, 1.55569427e+03, 1.89826050e+04, 3.02244378e+04,
        7.93346956e+01, 1.87370886e+03, 7.57793709e+02, 1.08069726e+03,
        3.24915368e+03, 4.76052769e+04, 1.27284625e+03, 2.843574

**suburb result, y2:**

In [74]:
# Suburb subset, 100/10 target: every feature with its ANOVA score, highest first.
scores_desc_rc2_y2 = np.flip(x_rc2_scores_y2[x_rc2_i_y2])
cols_desc_rc2_y2 = list(x_rc2_scaled.columns[x_rc2_i_y2])
cols_desc_rc2_y2.reverse()
# Features with an undefined (NaN) score are dropped by value instead of slicing
# off a hard-coded number of leading rows.
res_df_rc2_y2 = pd.DataFrame({'Variable': cols_desc_rc2_y2,
                              'ANOVA corr': scores_desc_rc2_y2}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc2_y2, 'ruca_suburb_anova_corr_y2.png')
# res_df_rc2_y2.to_csv('ruca_suburb_anova_corr_y2.csv')
res_df_rc2_y2

Unnamed: 0,Variable,ANOVA corr
1,pct_RURAL_POP_CEN_2010,127333.044679
2,pct_URBAN_CLUSTER_POP_CEN_2010,71136.047659
3,"Land Area (square miles), 2010",61544.24866
4,Mail-back census response rate,59704.761596
5,"Population Density (per square mile), 2010",58988.956924
6,"Religious congregations p 1,000",47605.276865
7,pct_URBANIZED_AREA_POP_CEN_2010,44755.498338
8,urban,33425.23409
9,rural,33425.23409
10,pct_bachmore,32457.684269


In [75]:
# Exurb (rc3) against the 100/10 target (y2), asking for the 10 best features.
x_rc3_res_y2 = anova_df_2(x_rc3_scaled, y2_rc3, 10)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_10_rc3_y2, x_rc3_scores_y2, x_rc3_i_y2 = x_rc3_res_y2[0], x_rc3_res_y2[1], x_rc3_res_y2[2]
x_10_rc3_y2.head()

  f = msb / msw


Unnamed: 0,pct_URBAN_CLUSTER_POP_CEN_2010,pct_RURAL_POP_CEN_2010,pct_NH_AIAN_alone_ACS_14_18,"Tract Population, 2010","Land Area (square miles), 2010","Population Density (per square mile), 2010",pct_belowpov_raw,pct_deeppov_raw,pct_bachmore,"Religious congregations p 1,000"
0,-0.319607,0.329467,-0.205765,-0.183551,0.726491,-0.318411,-0.737204,-0.624973,2.019242,-0.668572
1,-0.319607,0.329467,-0.205765,-0.183551,0.726491,-0.318411,-0.737204,-0.624973,2.019242,-0.668572
2,-0.319607,0.329467,-0.205765,-0.183551,0.726491,-0.318411,-0.737204,-0.624973,2.019242,-0.668572
3,-0.319607,0.329467,-0.205765,-0.183551,0.726491,-0.318411,-0.737204,-0.624973,2.019242,-0.668572
4,-0.319607,0.329467,-0.205765,-0.183551,0.726491,-0.318411,-0.737204,-0.624973,2.019242,-0.668572


In [76]:
# Show the per-feature ANOVA scores for the exurb subset (100/10 target) and the
# index array returned with them (presumably the ascending sort order — confirm in anova_df_2).
x_rc3_scores_y2,x_rc3_i_y2

(array([2.65968466e+03, 6.74004276e+02, 1.06456214e+04, 1.13903203e+04,
        1.95476506e+03, 7.65650044e+02, 6.24833558e+02, 3.65381633e+03,
        1.09125563e+02, 4.60565058e+03, 6.73916282e+02, 2.04588454e+02,
        5.33488999e+02, 3.89569161e+03, 4.14442072e+03, 3.63754497e+03,
        4.35163634e+03, 1.50910570e+03, 1.87184977e+03, 4.97230028e+01,
        1.27905177e+03, 7.32502090e+02, 6.61427893e+02, 7.05001300e+02,
        7.05001300e+02, 5.15266236e+03, 7.55334873e+03, 8.24409789e+03,
        2.19964179e+03, 2.23436926e+03, 4.99680262e+02, 5.80409472e+03,
        4.73212492e+03, 1.67128728e+03, 7.46323060e+01, 1.55700420e+03,
        1.55700420e+03, 1.04279456e+03, 3.21666700e+01, 4.40822044e+03,
        2.68699068e+03, 7.39520616e+03, 5.04794188e+02, 5.04794188e+02,
        9.20611713e+02, 1.34660486e+02, 6.43275570e+02, 3.57775879e+03,
        2.12043023e-02, 1.51539027e+00, 3.38376667e+02, 1.48251293e+02,
        1.20238102e+02, 6.66087607e+03, 1.85500619e+02, 3.282208

**exurb result, y2:**

In [78]:
# Exurb subset, 100/10 target: every feature with its ANOVA score, highest first.
scores_desc_rc3_y2 = np.flip(x_rc3_scores_y2[x_rc3_i_y2])
cols_desc_rc3_y2 = list(x_rc3_scaled.columns[x_rc3_i_y2])
cols_desc_rc3_y2.reverse()
# Features with an undefined (NaN) score are dropped by value instead of slicing
# off a hard-coded number of leading rows.
res_df_rc3_y2 = pd.DataFrame({'Variable': cols_desc_rc3_y2,
                              'ANOVA corr': scores_desc_rc3_y2}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc3_y2, 'ruca_exurb_anova_corr_y2.png')
# res_df_rc3_y2.to_csv('ruca_exurb_anova_corr_y2.csv')
res_df_rc3_y2

Unnamed: 0,Variable,ANOVA corr
1,pct_RURAL_POP_CEN_2010,11390.320282
2,pct_URBAN_CLUSTER_POP_CEN_2010,10645.621368
3,"Population Density (per square mile), 2010",8244.09789
4,"Land Area (square miles), 2010",7553.348731
5,pct_bachmore,7395.206155
6,"Religious congregations p 1,000",6660.876065
7,pct_belowpov_raw,5804.09472
8,"Tract Population, 2010",5152.662355
9,pct_deeppov_raw,4732.124916
10,pct_NH_AIAN_alone_ACS_14_18,4605.650579


In [79]:
# Rural (rc4) against the 100/10 target (y2), asking for the 10 best features.
x_rc4_res_y2 = anova_df_2(x_rc4_scaled, y2_rc4, 10)
# Result positions 0, 1, 2: selected-feature frame, per-feature scores, sort index.
x_10_rc4_y2, x_rc4_scores_y2, x_rc4_i_y2 = x_rc4_res_y2[0], x_rc4_res_y2[1], x_rc4_res_y2[2]
x_10_rc4_y2.head()

  f = msb / msw


Unnamed: 0,avg_Tot_Prns_in_HHD_ACS_14_18,pct_Hispanic_ACS_14_18,pct_NH_White_alone_ACS_14_18,pct_Othr_Lang_ACS_14_18,"Population Density (per square mile), 2010",County-Level Index,Requiring all 4 Subindices,Excluding Collective Efficacy,Institutional Health,Confidence in Institutions Subindex
0,-1.033981,-0.54038,-1.342512,-0.134837,-0.294442,-1.604751,-1.604751,-1.509892,-0.486549,-0.007762
1,-1.033981,-0.54038,-1.342512,-0.134837,-0.294442,-1.604751,-1.604751,-1.509892,-0.486549,-0.007762
2,-1.033981,-0.54038,-1.342512,-0.134837,-0.294442,-1.604751,-1.604751,-1.509892,-0.486549,-0.007762
3,-1.033981,-0.54038,-1.342512,-0.134837,-0.294442,-1.604751,-1.604751,-1.509892,-0.486549,-0.007762
4,-1.033981,-0.54038,-1.342512,-0.134837,-0.294442,-1.604751,-1.604751,-1.509892,-0.486549,-0.007762


In [80]:
# Show the per-feature ANOVA scores for the rural subset (100/10 target) and the
# index array returned with them (presumably the ascending sort order — confirm in anova_df_2).
x_rc4_scores_y2,x_rc4_i_y2

(array([6.45517553e+02, 3.98636827e+02, 7.62780350e+02, 9.78239577e+02,
        1.38092441e+04, 1.34732475e+01, 1.53991345e+04, 1.78360137e+04,
        3.57068578e+00, 6.34907342e+03, 5.47020146e+02, 5.70383140e+02,
        5.90568763e+02, 2.06082723e+04, 1.18658132e+04, 8.08299072e+03,
        1.01269435e+04, 8.80896280e+03, 3.66254028e+03, 1.97752074e+02,
        1.30711971e+03, 1.11235890e+01, 3.90166699e+02,            nan,
        1.21989618e+01, 1.84484740e+02, 7.51078382e+03, 1.78975181e+04,
        6.14528938e+03, 6.72394179e+03, 2.49454202e+03, 8.47170140e+03,
        4.71868612e+03, 5.36057236e+03, 1.52691016e+03, 2.37692928e+01,
        2.37692928e+01, 6.00260443e+01, 1.79486762e+02, 1.18911408e+04,
        4.45401696e+03, 6.96529986e+03, 1.79313616e+04, 1.79313616e+04,
        1.75926895e+04, 2.77177015e+03, 7.37249943e+03, 2.52958388e+04,
        8.64560933e+03, 5.26592812e+02, 3.45307177e+03, 2.33296310e+03,
        5.66486322e+03, 2.21814955e+02, 1.01426330e+04, 1.260626

**rural result, y2:**

In [83]:
# Rural subset, 100/10 target: every feature with its ANOVA score, highest first.
scores_desc_rc4_y2 = np.flip(x_rc4_scores_y2[x_rc4_i_y2])
cols_desc_rc4_y2 = list(x_rc4_scaled.columns[x_rc4_i_y2])
cols_desc_rc4_y2.reverse()
# Features with an undefined (NaN) score are dropped by value, so the cell no
# longer depends on hand-counting how many NaN rows lead the table.
res_df_rc4_y2 = pd.DataFrame({'Variable': cols_desc_rc4_y2,
                              'ANOVA corr': scores_desc_rc4_y2}).dropna(subset=['ANOVA corr'])
# Optional exports (disabled); paths are relative to the notebook directory.
# dfi.export(res_df_rc4_y2, 'ruca_rural_anova_corr_y2.png')
# res_df_rc4_y2.to_csv('ruca_rural_anova_corr_y2.csv')
res_df_rc4_y2

Unnamed: 0,Variable,ANOVA corr
2,Institutional Health,25295.83879
3,Confidence in Institutions Subindex,22715.863185
4,pct_Othr_Lang_ACS_14_18,20608.272319
5,Requiring all 4 Subindices,17931.361587
6,County-Level Index,17931.361587
7,"Population Density (per square mile), 2010",17897.518124
8,pct_NH_White_alone_ACS_14_18,17836.013708
9,Excluding Collective Efficacy,17592.689475
10,pct_Hispanic_ACS_14_18,15399.134517
11,avg_Tot_Prns_in_HHD_ACS_14_18,13809.244064


In [None]:
# by ruca code (1-3,4-6,7-9,10)?

In [None]:
# by pct population (US anova result is highest for pct_RURAL_POP_CEN_2010)?

In [84]:
# plt.hist(df_num_2['pct_RURAL_POP_CEN_2010'],bins=20)

In [85]:
# df_num_2.describe()['pct_RURAL_POP_CEN_2010']

In [86]:
# for i in np.arange(0,105,5):
#     print(i,df_num_2['pct_RURAL_POP_CEN_2010'].quantile(i/100))

In [87]:
# # make a category for 100% rural, 25-99% rural, 0.01-24.99% rural, and 0% rural
# len(df_num_2[df_num_2['pct_RURAL_POP_CEN_2010']==100]), \
# len(df_num_2[(df_num_2['pct_RURAL_POP_CEN_2010'] >= 25) & (df_num_2['pct_RURAL_POP_CEN_2010'] <100)]) ,\
# len(df_num_2[(df_num_2['pct_RURAL_POP_CEN_2010'] >0) & (df_num_2['pct_RURAL_POP_CEN_2010'] <25)]), \
# len(df_num_2[df_num_2['pct_RURAL_POP_CEN_2010']==0])
# # rur_u5=df_num_2[df_num_2['ruca_cat']==1] #under 5% rural
# # rc2=df_num_2[df_num_2['ruca_cat']==2] # 5 to 
# # rc3=df_num_2[df_num_2['ruca_cat']==3] #exurb
# # rc4=df_num_2[df_num_2['ruca_cat']==4] #rural
# # len(rc1),len(rc2),len(rc3),len(rc4)