In [None]:
# Data handling, plotting and statistics libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import linregress
import gmaps
import os

# Import the Google Maps API key from the local api_keys module and register it with gmaps
from api_keys import g_key
gmaps.configure(api_key=g_key)

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the libraries above — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# GeoJSON geometry loader and colormap helpers used by the county maps
import gmaps.geojson_geometries
from matplotlib.cm import viridis
from matplotlib.colors import to_hex

# Load the GeoJSON geometries for US counties (iterated by the map cells to colour each county)
counties_geojson = gmaps.geojson_geometries.load_geometry('us-counties')

### The space below is used for analysis of the data from the Clean_Data directory

#### Vaccination by County Overview

In [None]:
# Jenny's codes start here

In [None]:
# Load the cleaned county-level vaccination table analysed in this section
vaccine_county_coord = pd.read_csv(
    "Clean_data/vaccine_county_coord.csv",
    low_memory=False,
)

In [None]:
# Descriptive statistics of the county vaccination rate, displayed as a one-column table
vaccinate_stats = vaccine_county_coord["Vaccination_Complete_Pct"].describe()
vaccinate_stats.to_frame()

In [None]:
# Outlier fences: values more than 1.5 * IQR below the first quartile or above the
# third quartile are flagged as potential outliers.
# The quartiles are looked up by label. Integer keys on this label-indexed Series
# (vaccinate_stats[4], vaccinate_stats[6]) only work through positional fallback,
# which pandas has deprecated.
first_quartile = vaccinate_stats["25%"]
third_quartile = vaccinate_stats["75%"]
iqr = third_quartile - first_quartile
lower_bound = round(first_quartile - (1.5 * iqr), 2)
upper_bound = round(third_quartile + (1.5 * iqr), 2)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Box plot of the county vaccination rate, saved to the Images folder before display
vaccine_county_coord.boxplot(column='Vaccination_Complete_Pct')
plt.savefig('Images/Vaccination Rate Box Plot.jpg', dpi=300)
plt.show()

In [None]:
# Distribution of the county vaccination rate (100 bins), then a normality test on the same values
vaccination_pct = vaccine_county_coord['Vaccination_Complete_Pct']

plt.hist(vaccination_pct, bins=100)
plt.title('Vaccination Rate Hist Graph')
plt.xlabel('Vaccination (%)')
plt.ylabel('Counts')
plt.savefig('Images/Vaccination Rate Hist Graph.jpg', dpi=300)
plt.show()

normality_result = st.normaltest(vaccination_pct)
print(normality_result)

#### Analysis between SVI vs. Vaccination by County

In [None]:
# Load the cleaned table combining county vaccination rates with the Social Vulnerability Index
vaccine_svi_df = pd.read_csv(
    "Clean_data/vaccine_svi_df.csv",
    low_memory=False,
)

In [None]:
# Display summary statistics of the SVI/vaccination table
vaccine_svi_df.describe()

In [None]:
# Linear regression: vaccination rate as a function of the Social Vulnerability Index

x_values = vaccine_svi_df['Social Vulnerability Index (SVI)']
y_values = vaccine_svi_df['Vaccination_Complete_Pct']

# Least-squares fit, then the fitted line evaluated at every observed x
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"
print(f"The r value is: {rvalue}")
print(f"The r-squared is: {rvalue**2}")

# Scatter of the raw data with the fitted line and its equation overlaid
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (0, 85), fontsize=15, color="red")
plt.title('Social Vulnerability Index vs. Vaccination Rate')
plt.xlabel('Social Vulnerability Index (SVI)')
plt.ylabel('Vaccination_Complete_Pct')
plt.savefig('Images/Social Vulnerability Index vs. Vaccination Rate.jpg', dpi=300)
plt.show()

In [None]:
# ANOVA setup. H0: counties in different SVI concern categories have the same mean vaccination rate.
# Start with a visual comparison: one box per SVI category.
vaccine_svi_df.boxplot(
    column='Vaccination_Complete_Pct',
    by='SVI Category',
    figsize=(20, 10),
)
plt.savefig('Images/Vaccination Rate by SVI Category.jpg', dpi=300)
plt.show()

In [None]:
# Vaccination rates split by SVI concern category (one Series per category, used as ANOVA samples)
svi_category = vaccine_svi_df['SVI Category']

svi_very_high_concern = vaccine_svi_df.loc[svi_category == 'Very High Concern', 'Vaccination_Complete_Pct']
svi_high_concern = vaccine_svi_df.loc[svi_category == 'High Concern', 'Vaccination_Complete_Pct']
svi_moderate_concern = vaccine_svi_df.loc[svi_category == 'Moderate Concern', 'Vaccination_Complete_Pct']
svi_low_concern = vaccine_svi_df.loc[svi_category == 'Low Concern', 'Vaccination_Complete_Pct']
svi_very_low_concern = vaccine_svi_df.loc[svi_category == 'Very Low Concern', 'Vaccination_Complete_Pct']

In [None]:
# One-way ANOVA across the five SVI concern categories
st.f_oneway(
    svi_very_high_concern,
    svi_high_concern,
    svi_moderate_concern,
    svi_low_concern,
    svi_very_low_concern,
)

#### Analysis between Political Parties DEM/REP vs. Vaccination by County

In [None]:
# Load the cleaned table combining county vaccination rates with election results
vaccine_election_df = pd.read_csv(
    "Clean_data/vaccine_election_df.csv",
    low_memory=False,
)

In [None]:
# Keep only the rows where `won` is True, then compare vaccination rates by party
won_mask = vaccine_election_df['won'].eq(True)
vaccine_election_party = vaccine_election_df.loc[won_mask]

vaccine_election_party.boxplot(
    column='Vaccination_Complete_Pct',
    by='party',
    figsize=(20, 10),
)
plt.savefig('Images/Vaccination Rate by Political Party Box Plot.jpg', dpi=300)
plt.show()

In [None]:
# Mean vaccination rate of the winning rows for each party, shown as a bar chart.
# The rate column is selected before aggregating: calling .mean() on the whole grouped
# frame tries to average every column and raises TypeError on text columns in pandas >= 2.0.
vaccine_election_party_vote = (
    vaccine_election_party.groupby('party')['Vaccination_Complete_Pct'].mean()
)
vaccine_election_party_vote.plot(kind='bar', title='Vaccination Rate Comparison Dem vs. Rep Parties')
plt.ylabel('Average Vaccination Rate (%)')
plt.savefig('Images/Vaccination Rate by Political Party Bar Chart.jpg', dpi=300)
plt.show()

In [None]:
# ANOVA setup. H0: counties won by different political parties have the same mean vaccination rate.
# Each sample holds the vaccination rates of the winning rows for one party.
party_won = vaccine_election_df['won'] == True
is_dem = vaccine_election_df['party'] == 'DEM'
is_rep = vaccine_election_df['party'] == 'REP'

vaccine_election_dem = vaccine_election_df.loc[is_dem & party_won, 'Vaccination_Complete_Pct']
vaccine_election_rep = vaccine_election_df.loc[is_rep & party_won, 'Vaccination_Complete_Pct']

In [None]:
# One-way ANOVA between DEM-won and REP-won counties
st.f_oneway(
    vaccine_election_dem,
    vaccine_election_rep,
)

In [None]:
# Jenny's codes end here

#### Correlations Between 175 factors in Sociohealth/Economy vs. Vaccination Rate Comparison

In [None]:
# Feipeng's codes start here

In [None]:
# Load the cleaned factor table (fy_clean_data) and preview its first rows
fy_df = pd.read_csv(
    "Clean_data/fy_clean_data.csv",
    low_memory=False,
)
fy_df.head()

In [None]:
# Compute the r-squared between "covid_vaccine_rate" and every candidate factor column.
# Candidate factors are all columns from the eighth onward
# (the first seven are presumably identifiers/metadata — TODO confirm).
columns = list(fy_df.columns)
column_data = columns[7:]

# Parallel lists: factor name and its r-squared against the vaccination rate
factors = []
rsquared = []
# Columns that could not be regressed, with the reason, so skips are visible
# instead of being silently swallowed by a bare `except: pass`
skipped_factors = {}

for column in column_data:
    # Regressing the target against itself is meaningless
    if column == 'covid_vaccine_rate':
        skipped_factors[column] = 'target column'
        continue
    # linregress needs numeric input; text columns are skipped explicitly
    if not pd.api.types.is_numeric_dtype(fy_df[column]):
        skipped_factors[column] = 'non-numeric column'
        continue

    # Pairwise-complete rows only, excluding counties with a zero vaccination rate
    temp = fy_df[['covid_vaccine_rate', column]].dropna()
    temp = temp.loc[temp['covid_vaccine_rate'] != 0]
    y_values = temp['covid_vaccine_rate']
    x_values = temp[column]

    try:
        (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    except ValueError as err:
        # Raised for degenerate input, e.g. no rows left or all x values identical
        skipped_factors[column] = str(err)
        continue

    factors.append(column)
    rsquared.append(rvalue**2)

In [None]:
# Collect factor names and r-squared values into one table, strongest correlation first
factors_df = (
    pd.DataFrame({'factor': factors, 'rsquared': rsquared})
    .sort_values('rsquared', ascending=False, ignore_index=True)
)
factors_df.head()

In [None]:
# Bar charts of r-squared for the 20 strongest and the 20 weakest factors.
x_values = factors_df['factor']
y_values = factors_df['rsquared']

# Rows without a defined r-squared cannot be ranked; sort_values places them last,
# so they are dropped before taking the bottom of the ranking.
ranked_factors = factors_df.dropna(subset=['rsquared'])

# (rows to plot, chart title, output path).
# tail(20) takes the true bottom 20: the earlier slice [-21:-1] left out the
# lowest-ranked factor and included the 21st-lowest instead.
factor_panels = [
    (ranked_factors.head(20),
     'Factors with Relatively High Correlations (Top 20)',
     'Images/Factors_with_high_correlations.jpg'),
    (ranked_factors.tail(20),
     'Factors with Relatively Low Correlations (Bottom 20)',
     'Images/Factors_with_low_correlations.jpg'),
]

for panel_rows, panel_title, panel_path in factor_panels:
    plt.figure(figsize=(10,8))
    plt.bar(panel_rows['factor'], panel_rows['rsquared'], facecolor='blue', edgecolor='black',alpha=0.5)
    plt.title(panel_title)
    plt.xlabel("Factors")
    plt.ylabel('r-squared')
    # Same y-range on both charts so the bars are directly comparable
    plt.ylim(0,0.25)
    plt.xticks(panel_rows['factor'])
    plt.xticks(rotation = 90)
    plt.tight_layout()
    plt.savefig(panel_path, dpi=300)
    plt.show()

#### Analysis between Percent Uninsured Population vs. Vaccination Rate

In [None]:
# Linear regression: vaccination rate as a function of the percent of uninsured population.
# Rows missing any of the three columns are dropped; zero vaccination rates are kept here.
insurance_columns = ['covid_vaccine_rate', 'fips', 'percent_uninsured']
insurance_df = fy_df[insurance_columns].dropna()

x_values = insurance_df['percent_uninsured']
y_values = insurance_df['covid_vaccine_rate']

# Least-squares fit and the fitted line evaluated at every observed x
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"
print(f"The r-squared for 'percent_uninsured' is: {rvalue**2}")

# Scatter of the raw data with the fitted line and its equation overlaid
plt.figure(figsize=(8,5))
plt.scatter(x_values, y_values, facecolor='cyan', edgecolor='black', alpha=0.6)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (18,80), fontsize=15, color="red")
plt.xlabel('percent_uninsured')
plt.ylabel('covid_vaccine_rate')
plt.title('Vaccination Rate vs. Uninsured Rate')
plt.tight_layout()
plt.savefig('Images/Vaccination_Rate_vs_Uninsured_Rate_plot.jpg', dpi=300)
plt.show()

In [None]:
# Bucket the uninsured percentage into six 5-point-wide, left-closed bins:
# [0, 5), [5, 10), [10, 15), [15, 20), [20, 25), [25, 30)
bins = [0, 5, 10, 15, 20, 25, 30]

# One label per bin, from the smallest uninsured share to the largest
group_names = ["lowest", "low", "moderate low", "moderate high", "high", "highest"]

# Attach the bin label as a new column and renumber the rows from zero
insurance_df = insurance_df.assign(
    uninsured_rate=pd.cut(
        insurance_df["percent_uninsured"],
        bins,
        right=False,
        labels=group_names,
        include_lowest=True,
    )
).reset_index(drop=True)
insurance_df.head()

In [None]:
# Vaccination rates split by uninsured-rate bin (one Series per bin)
vaccine_rate_by_uninsured = {
    level: insurance_df.loc[insurance_df['uninsured_rate'] == level, 'covid_vaccine_rate']
    for level in ('lowest', 'low', 'moderate low', 'moderate high', 'high', 'highest')
}
lowest = vaccine_rate_by_uninsured['lowest']
low = vaccine_rate_by_uninsured['low']
moderate_low = vaccine_rate_by_uninsured['moderate low']
moderate_high = vaccine_rate_by_uninsured['moderate high']
high = vaccine_rate_by_uninsured['high']
highest = vaccine_rate_by_uninsured['highest']

# One box per bin, ordered from the lowest to the highest uninsured share
data = [lowest, low, moderate_low, moderate_high, high, highest]
tick_positions = [1,2,3,4,5,6]
tick_labels = ['lowest', 'low', 'moderate_low', 'moderate_high', 'high', 'highest']

plt.figure(figsize=(10,5))
plt.boxplot(data)
plt.xticks(tick_positions, tick_labels)
plt.grid(linestyle='-', linewidth=0.2)
plt.title('Vaccination Rate vs. Uninsured Rate')
plt.xlabel("Uninsured Rate")
plt.ylabel('Vaccination Rate')
plt.tight_layout()
plt.savefig('Images/Vaccination_Rate_vs_Uninsured_Rate_box.png', dpi=300)
plt.show()

In [None]:
# One-way ANOVA across the six uninsured-rate bins
st.f_oneway(
    lowest,
    low,
    moderate_low,
    moderate_high,
    high,
    highest,
)

#### Vaccination Rate Visualization in Gmap

In [None]:
# County map: colour every US county by its COVID vaccination rate.
factor = fy_df['covid_vaccine_rate']

# Bounds used to scale the factor to [0, 1].
# Series.min()/max() skip missing values; the built-in min()/max() give
# order-dependent results when the column contains NaN.
min_data = factor.min()
max_data = factor.max()
data_range = max_data - min_data

def calculate_color(current_data):
    """Convert a factor value into a CSS hex colour on the viridis colormap.

    The value is rescaled with the notebook-level ``min_data`` and ``data_range``
    before being passed to ``viridis``; the alpha channel is dropped.
    """
    # make factor a number between 0 and 1
    normalized_data = (current_data - min_data) / data_range
    # transform the normalized value to a matplotlib color
    mpl_color = viridis(normalized_data)
    # transform from a matplotlib color to a valid CSS color
    gmaps_color = to_hex(mpl_color, keep_alpha=False)
    return gmaps_color

# Build a lookup from 5-character FIPS code to vaccination rate.
# The GeoJSON side is always a 5-character string (GEO_ID[-5:]), so the CSV side is
# normalised to the same form. NOTE(review): this assumes `fips` holds numeric county
# codes that may have lost their leading zeros when read from CSV — confirm.
fy_df_map = fy_df[['covid_vaccine_rate', 'fips']].dropna()
fips_numeric = pd.to_numeric(fy_df_map['fips'], errors='coerce')
has_fips = fips_numeric.notna()
fips_codes = fips_numeric[has_fips].astype(int).astype(str).str.zfill(5)

rate_lookup = {}
for fips_code, rate in zip(fips_codes, fy_df_map.loc[has_fips, 'covid_vaccine_rate']):
    # keep the first row per county, matching the previous .iloc[0] behaviour
    rate_lookup.setdefault(fips_code, rate)

# Generate colors list, one entry per GeoJSON feature, in feature order
no_data_color = (0, 0, 0, 0.3)  # default colour for counties without data
colors = []
for county in counties_geojson['features']:
    county_fips = county['properties']['GEO_ID'][-5:]
    if county_fips in rate_lookup:
        color = calculate_color(rate_lookup[county_fips])
    else:
        color = no_data_color
    colors.append(color)

# Set the figure layout
figure_layout = {
    'width': '800px',
    'height': '500px',
    'border': '1px solid black',
    'padding': '1px'
}

# Create the gmap
fig = gmaps.figure(layout=figure_layout)

# Generate the layer with factors
factor_layer = gmaps.geojson_layer(counties_geojson,
                                    fill_color=colors,
                                    stroke_color=colors,
                                    fill_opacity=1)
fig.add_layer(factor_layer)
fig

#### Percent Uninsured Pop Visualization in Gmap

In [None]:
# County map: colour every US county by its percentage of uninsured population.
factor = fy_df['percent_uninsured']

# Bounds used to scale the factor to [0, 1].
# Series.min()/max() skip missing values; the built-in min()/max() give
# order-dependent results when the column contains NaN.
min_data = factor.min()
max_data = factor.max()
data_range = max_data - min_data

def calculate_color(current_data):
    """Convert a factor value into a CSS hex colour on the viridis colormap.

    The value is rescaled with the notebook-level ``min_data`` and ``data_range``
    before being passed to ``viridis``; the alpha channel is dropped.
    """
    # make factor a number between 0 and 1
    normalized_data = (current_data - min_data) / data_range
    # transform the normalized value to a matplotlib color
    mpl_color = viridis(normalized_data)
    # transform from a matplotlib color to a valid CSS color
    gmaps_color = to_hex(mpl_color, keep_alpha=False)
    return gmaps_color

# Build a lookup from 5-character FIPS code to uninsured percentage.
# The GeoJSON side is always a 5-character string (GEO_ID[-5:]), so the CSV side is
# normalised to the same form. NOTE(review): this assumes `fips` holds numeric county
# codes that may have lost their leading zeros when read from CSV — confirm.
fy_df_map = fy_df[['percent_uninsured', 'fips']].dropna()
fips_numeric = pd.to_numeric(fy_df_map['fips'], errors='coerce')
has_fips = fips_numeric.notna()
fips_codes = fips_numeric[has_fips].astype(int).astype(str).str.zfill(5)

uninsured_lookup = {}
for fips_code, uninsured_pct in zip(fips_codes, fy_df_map.loc[has_fips, 'percent_uninsured']):
    # keep the first row per county, matching the previous .iloc[0] behaviour
    uninsured_lookup.setdefault(fips_code, uninsured_pct)

# Generate colors list, one entry per GeoJSON feature, in feature order
no_data_color = (0, 0, 0, 0.3)  # default colour for counties without data
colors = []
for county in counties_geojson['features']:
    county_fips = county['properties']['GEO_ID'][-5:]
    if county_fips in uninsured_lookup:
        color = calculate_color(uninsured_lookup[county_fips])
    else:
        color = no_data_color
    colors.append(color)

# Set the figure layout
figure_layout = {
    'width': '800px',
    'height': '500px',
    'border': '1px solid black',
    'padding': '1px'
}

# Create the gmap
fig = gmaps.figure(layout=figure_layout)

# Generate the layer with factors
factor_layer = gmaps.geojson_layer(counties_geojson,
                                    fill_color=colors,
                                    stroke_color=colors,
                                    fill_opacity=1)
fig.add_layer(factor_layer)
fig

#### Analysis between Percent Fair/Poor Health vs. Vaccination Rate

In [None]:
# Linear regression: vaccination rate as a function of the percent reporting fair or poor health.
# Rows missing any of the three columns are dropped; zero vaccination rates are kept here.
poor_health_columns = ['covid_vaccine_rate', 'fips', 'percent_fair_or_poor_health']
poor_health_df = fy_df[poor_health_columns].dropna()

x_values = poor_health_df['percent_fair_or_poor_health']
y_values = poor_health_df['covid_vaccine_rate']

# Least-squares fit and the fitted line evaluated at every observed x
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"
print(f"The r-squared for 'percent_fair_or_poor_health' is: {rvalue**2}")

# Scatter of the raw data with the fitted line and its equation overlaid
plt.figure(figsize=(8,5))
plt.scatter(x_values, y_values, facecolor='b', edgecolor='black', alpha=0.4)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (18,80), fontsize=15, color="red")
plt.xlabel('percent_fair_or_poor_health')
plt.ylabel('covid_vaccine_rate')
plt.title('Vaccination Rate vs. Fair or Poor Health Rate')
plt.tight_layout()
plt.savefig('Images/Vaccination_Rate_vs_Fair_or_Poor_Health_Rate_plot.jpg', dpi=300)
plt.show()

In [None]:
# Bucket the fair/poor-health percentage into four 10-point-wide, left-closed bins:
# [0, 10), [10, 20), [20, 30), [30, 40)
bins = [0, 10, 20, 30, 40]
# One label per bin, from the smallest share to the largest
group_names = ["lowest", "low", "medium", "high"]

# Attach the bin label as a new column and renumber the rows from zero
poor_health_df = poor_health_df.assign(
    poor_health_rate=pd.cut(
        poor_health_df["percent_fair_or_poor_health"],
        bins,
        right=False,
        labels=group_names,
        include_lowest=True,
    )
).reset_index(drop=True)

In [None]:
# Vaccination rates split by fair/poor-health bin (one Series per bin)
poor_health_level = poor_health_df['poor_health_rate']

lowest = poor_health_df.loc[poor_health_level == 'lowest', 'covid_vaccine_rate']
low = poor_health_df.loc[poor_health_level == 'low', 'covid_vaccine_rate']
medium = poor_health_df.loc[poor_health_level == 'medium', 'covid_vaccine_rate']
high = poor_health_df.loc[poor_health_level == 'high', 'covid_vaccine_rate']

In [None]:
# Box plot: one box per fair/poor-health bin, ordered from lowest to high
data = [lowest, low, medium, high]
tick_positions = [1,2,3,4]
tick_labels = ['lowest', 'low', 'medium', 'high']

plt.figure(figsize=(10,5))
plt.boxplot(data)
plt.xticks(tick_positions, tick_labels)
plt.grid(linestyle='-', linewidth=0.2)
plt.title('Vaccination Rate vs. Fair or Poor Health Rate')
plt.xlabel('percent_fair_or_poor_health')
plt.ylabel('covid_vaccine_rate')
plt.tight_layout()
plt.savefig('Images/Vaccination_Rate_vs_Fair_or_Poor_Health_Rate_box.png', dpi=300)
plt.show()

In [None]:
# One-way ANOVA across the four fair/poor-health bins.
# The samples must be the poor-health groups only: moderate_low, moderate_high and
# highest belong to the uninsured-rate analysis, and `medium` was previously left out.
st.f_oneway(lowest, low, medium, high)

In [None]:
# Feipeng's codes end here

#### Analysis between Income vs. Vaccination Rate

In [None]:
# Ricardo's code starts here

# Load the cleaned table combining county vaccination rates with socio-economic indicators
vaccine_socio_df = pd.read_csv(
    "Clean_data/vaccine_socio_df.csv",
    low_memory=False,
)

In [None]:
# Linear regression: vaccination rate as a function of per capita income
x_values = vaccine_socio_df['per_capita_income']
y_values = vaccine_socio_df['Vaccination_Complete_Pct']

# Least-squares fit and the fitted line evaluated at every observed x
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"
print(f"The r value is: {rvalue}")
print(f"The r-squared is: {rvalue**2}")

# Scatter of the raw data with the fitted line and its equation overlaid
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (42000,20), fontsize=15, color="red")
plt.title('Vaccination Rate vs Per Capita Income')
plt.xlabel('Per Capita Income')
plt.ylabel('Vaccination Rate Percentage')
plt.savefig('Images/Vaccination Rate vs Per Capita Income.jpg', dpi=300)
plt.show()

In [None]:
# Linear regression: vaccination rate as a function of median household income
x_values = vaccine_socio_df['median_household_income']
y_values = vaccine_socio_df['Vaccination_Complete_Pct']

# Least-squares fit and the fitted line evaluated at every observed x
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"
print(f"The r value is: {rvalue}")
print(f"The r-squared is: {rvalue**2}")

# Scatter of the raw data with the fitted line and its equation overlaid
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (80000,15), fontsize=15, color="red")
plt.title('Vaccination Rate vs Median Household Income')
plt.xlabel('Median Household Income')
plt.ylabel('Vaccination Rate Percentage')
plt.savefig('Images/Vaccination Rate vs Median House Income.jpg', dpi=300)
plt.show()

In [None]:
# ANOVA test: H0 - vaccination rates of counties in different median household income brackets have the same mean

In [None]:
# Vaccination rates split by median household income bracket (ANOVA samples).
# NOTE(review): despite the names these are fixed 20,000-wide brackets, not sextiles,
# and counties with an income of 140000 or more fall in no group.
household_income = vaccine_socio_df['median_household_income']
vaccination_complete = vaccine_socio_df['Vaccination_Complete_Pct']

first_sextile = vaccination_complete[household_income < 40000]
second_sextile = vaccination_complete[(household_income >= 40000) & (household_income < 60000)]
third_sextile = vaccination_complete[(household_income >= 60000) & (household_income < 80000)]
fourth_sextile = vaccination_complete[(household_income >= 80000) & (household_income < 100000)]
fifth_sextile = vaccination_complete[(household_income >= 100000) & (household_income < 120000)]
sixth_sextile = vaccination_complete[(household_income >= 120000) & (household_income < 140000)]


In [None]:
# One-way ANOVA across the six median household income brackets
st.f_oneway(
    first_sextile,
    second_sextile,
    third_sextile,
    fourth_sextile,
    fifth_sextile,
    sixth_sextile,
)

In [None]:
# Vaccination rates split by per capita income bracket (ANOVA samples).
# NOTE(review): despite the names these are fixed 10,000-wide brackets, not sextiles,
# and counties with a per capita income of 60000 or more fall in no group.
per_capita_income = vaccine_socio_df['per_capita_income']
vaccination_complete = vaccine_socio_df['Vaccination_Complete_Pct']

first_sextile_PC = vaccination_complete[per_capita_income < 10000]
second_sextile_PC = vaccination_complete[(per_capita_income >= 10000) & (per_capita_income < 20000)]
third_sextile_PC = vaccination_complete[(per_capita_income >= 20000) & (per_capita_income < 30000)]
fourth_sextile_PC = vaccination_complete[(per_capita_income >= 30000) & (per_capita_income < 40000)]
fifth_sextile_PC = vaccination_complete[(per_capita_income >= 40000) & (per_capita_income < 50000)]
sixth_sextile_PC = vaccination_complete[(per_capita_income >= 50000) & (per_capita_income < 60000)]

In [None]:
# One-way ANOVA across the six per capita income brackets
st.f_oneway(
    first_sextile_PC,
    second_sextile_PC,
    third_sextile_PC,
    fourth_sextile_PC,
    fifth_sextile_PC,
    sixth_sextile_PC,
)

In [None]:
# Ricardo's codes end here

#### Analysis between Education (Percent Some College) vs. Vaccination Rate

In [None]:
# Eugene's codes start here

In [None]:
# Load the socio-economic/vaccination table for the education analysis.
# The directory is spelled "Clean_data" to match every other read in this notebook;
# the previous "Clean_Data" spelling fails on case-sensitive filesystems unless both exist.
file1 = "Clean_data/vaccine_socio_df.csv"
analysis_df = pd.read_csv(file1)

In [None]:
# Keep only the columns needed for the education analysis
education_columns = ["County", "Vaccination_Complete_Pct", "percent_some_college"]
data_analysis_df = analysis_df[education_columns]

In [None]:
# Linear regression: vaccination rate as a function of the percent with some college education
x_values = data_analysis_df["percent_some_college"]
y_values = data_analysis_df["Vaccination_Complete_Pct"]

# Least-squares fit and the fitted line evaluated at every observed x
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"
print(f"The r value is: {rvalue}")
print(f"The r-squared is: {rvalue**2}")

# Scatter of the raw data with the fitted line and its equation overlaid
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (0,85), fontsize=15, color="red")
plt.title('Percent Some College vs. Vaccination Rate')
plt.xlabel('percent_some_college')
plt.ylabel('Vaccination_Complete_Pct')
plt.savefig('Images/Percent Some College vs. Vaccination Rate.jpg', dpi=300)
plt.show()

In [None]:
# Vaccination rates split by percent-some-college bracket (ANOVA samples).
# Brackets are left-closed and right-open: <30, [30, 50), [50, 75), [75, 90).
# The third bracket previously used <=75 while the fourth used >=75, so a county at
# exactly 75 was counted in both samples; ANOVA groups must not overlap.
# NOTE(review): counties at 90 or above fall in no group — confirm this is intended.
some_college_pct = data_analysis_df['percent_some_college']

first_section = data_analysis_df.loc[some_college_pct < 30]['Vaccination_Complete_Pct']
second_section = data_analysis_df.loc[(some_college_pct >= 30) & (some_college_pct < 50)]['Vaccination_Complete_Pct']
third_section = data_analysis_df.loc[(some_college_pct >= 50) & (some_college_pct < 75)]['Vaccination_Complete_Pct']
fourth_section = data_analysis_df.loc[(some_college_pct >= 75) & (some_college_pct < 90)]['Vaccination_Complete_Pct']

In [None]:
# One-way ANOVA across the four percent-some-college brackets
st.f_oneway(
    first_section,
    second_section,
    third_section,
    fourth_section,
)

In [None]:
# Eugene's codes end here