## Convert your previously written Beam pipeline(s) to Dataflow. Run them on Dataflow over the entire input data; debug and fix as necessary.

In [3]:
# Execute location_dataflow.py from within this notebook session
# (presumably the Dataflow version of the Location Beam pipeline -- confirm against the script).
%run location_dataflow.py

## Verify that each BigQuery output table contains a valid primary and/or foreign key

In [4]:
%%bigquery
-- Number of rows in Location_Dataflow that carry a non-null State value.
-- Compare with the distinct count to judge whether State can serve as a key.
SELECT
    COUNT(State) AS State_Count
FROM datamart.Location_Dataflow

Unnamed: 0,State_Count
0,60


In [5]:
%%bigquery
-- Number of different State values in Location_Dataflow.
-- If this is lower than the plain count, State is repeated and is not unique.
SELECT
    COUNT(DISTINCT State) AS State_Count
FROM datamart.Location_Dataflow

Unnamed: 0,State_Count
0,56


In [13]:
%%bigquery
-- Pair every state found in Cases with the matching state in Location_Dataflow.
-- All Cases states are kept; a NULL state_fk marks a Cases state that has no
-- counterpart in Location_Dataflow.
SELECT DISTINCT
    l.state AS state_fk,
    c.state AS state_pk
FROM datamart.Cases AS c
LEFT JOIN datamart.Location_Dataflow AS l
    ON l.state = c.state

Unnamed: 0,state_fk,state_pk
0,Guam,Guam
1,Iowa,Iowa
2,Ohio,Ohio
3,Utah,Utah
4,Idaho,Idaho
5,Maine,Maine
6,Texas,Texas
7,Alaska,Alaska
8,Hawaii,Hawaii
9,Kansas,Kansas


## Relationship between education level and COVID death cases. Education level for this query is the proportion of people who obtained a higher education degree (a professional degree after going to professional school or graduate school) among those who graduated high school, for each state. A higher proportion means that more people over 25 years old chose to pursue higher education after high school.

In [1]:
%%bigquery
-- Top 10 states by share of high-school graduates (25+) holding a professional degree,
-- shown together with the highest deaths value recorded for each state.
WITH state_deaths AS (
    -- One row per state: the largest deaths value found in Cases.
    SELECT
        c.state,
        MAX(c.deaths) AS deaths
    FROM datamart.Cases AS c
    GROUP BY c.state
)
SELECT
    sd.state,
    sd.deaths,
    e.over_25yo_prof / e.over_25yo_HS_or_higher AS higher_degree_proportion
FROM state_deaths AS sd
INNER JOIN datamart.Education AS e
    ON sd.state = e.state
ORDER BY higher_degree_proportion DESC
LIMIT 10

Unnamed: 0,state,deaths,higher_degree_proportion
0,District of Columbia,1093,0.369855
1,Massachusetts,17445,0.222261
2,Maryland,8528,0.211135
3,Connecticut,7995,0.196673
4,Virginia,10549,0.191426
5,New York,51470,0.189447
6,New Jersey,25094,0.178121
7,Colorado,6186,0.173652
8,Vermont,242,0.171828
9,New Hampshire,1266,0.157099


## Relationship between COVID confirmed and/or death cases for each state and the proportion of noninstitutionalized elders with disabilities (65 y/o or above) for each state ordered from the state with the highest proportion of elders with disabilities to the lowest

In [2]:
%%bigquery
-- Top 10 states by the share of the noninstitutionalized population that is
-- 65 or older with a disability, shown next to each state's highest recorded
-- deaths/confirmed values and its average testing rate.
--
-- Both tables are collapsed to one row per state BEFORE joining, so no SUM is
-- computed over rows that the join has duplicated (Cases holds several rows
-- per state). The state name is a tie-breaker that keeps the top 10 stable.
WITH disability_by_state AS (
    SELECT
        d.state,
        SUM(d.noninstitutionalized_65over_with_disability) AS elders_with_disability,
        SUM(d.noninstitutionalized) AS noninstitutionalized
    FROM datamart.Disability AS d
    GROUP BY d.state
),
cases_by_state AS (
    SELECT
        c.state,
        MAX(c.deaths) AS deaths,
        MAX(c.confirmed) AS confirmed,
        AVG(c.testing_rate) AS avg_testing_rate
    FROM datamart.Cases AS c
    GROUP BY c.state
)
SELECT
    cs.state,
    ds.elders_with_disability / ds.noninstitutionalized AS proportion_elders_w_disabilities,
    cs.deaths,
    cs.confirmed,
    cs.avg_testing_rate
FROM cases_by_state AS cs
INNER JOIN disability_by_state AS ds
    ON cs.state = ds.state
ORDER BY proportion_elders_w_disabilities DESC, cs.state
LIMIT 10

Unnamed: 0,state,proportion_elders_w_disabilities,deaths,confirmed,avg_testing_rate
0,Puerto Rico,0.102215,2194,120571,8849.075095
1,West Virginia,0.084526,2777,148071,53718.041639
2,Arkansas,0.070664,5692,333186,43152.868909
3,Maine,0.069612,763,56525,51785.146599
4,Florida,0.067706,34330,2155744,44225.545261
5,New Mexico,0.067615,4001,194868,61299.580968
6,Mississippi,0.066701,7139,308737,30065.461635
7,Alabama,0.066345,10739,521623,25480.102988
8,Kentucky,0.065577,6317,436445,42104.372385
9,Hawaii,0.065472,474,32548,36024.221704


## States in the eastern half of the United States (longitude greater than -100), showing the COVID death cases of those states ordered by number of confirmed cases, to see if confirmed cases and number of deaths have a direct relationship

In [3]:
%%bigquery
-- Ten states with the most confirmed cases among those whose longitude is
-- greater than -100, with each state's highest recorded deaths value.
WITH state_totals AS (
    -- One row per state: largest deaths and confirmed values found in Cases.
    SELECT
        c.state,
        MAX(c.deaths) AS deaths,
        MAX(c.confirmed) AS confirmed
    FROM datamart.Cases AS c
    GROUP BY c.state
)
SELECT
    st.state,
    st.deaths,
    st.confirmed,
    l.latitude
FROM state_totals AS st
INNER JOIN datamart.Location AS l
    ON st.state = l.state
WHERE l.longitude > -100
ORDER BY st.confirmed DESC
LIMIT 10

Unnamed: 0,state,deaths,confirmed,latitude
0,Texas,49510,2847101,31.0545
1,Florida,34330,2155744,27.7663
2,New York,51470,1986681,42.1657
3,Illinois,23920,1296267,40.3495
4,Pennsylvania,25613,1100857,40.5908
5,Georgia,19718,1081629,33.0406
6,Ohio,18991,1050112,40.3888
7,New Jersey,25094,971782,40.2989
8,North Carolina,12387,943693,35.6301
9,Michigan,17858,867624,43.3266


## Query exploring whether the ancestry distribution (in percentages) in each state has a relationship with confirmed cases of COVID in each state, ordered from the highest to the lowest American ancestry percentage

In [7]:
%%bigquery
-- Ten states with the highest American-ancestry percentage, alongside each
-- state's highest recorded confirmed value and several other ancestry percentages.
-- MAX() collapses the joined rows to a single row per state.
SELECT
    c.state,
    MAX(c.confirmed) AS confirmed,
    MAX(a.percent_American) AS american,
    MAX(a.percent_Arab) AS arabian,
    MAX(a.percent_Danish) AS danish,
    MAX(a.percent_English) AS english,
    MAX(a.percent_German) AS german,
    MAX(a.percent_Portuguese) AS portuguese,
    MAX(a.percent_Russian) AS russian,
    MAX(a.percent_Subsaharan_African) AS subsaharan_african
FROM datamart.Cases AS c
INNER JOIN datamart.Ancestry AS a
    ON c.state = a.state
GROUP BY c.state
ORDER BY MAX(a.percent_American) DESC
LIMIT 10

Unnamed: 0,state,confirmed,american,arabian,danish,english,german,portuguese,russian,subsaharan_african
0,Alabama,521623,17.3,0.2,0.1,7.2,5.3,0.1,0.2,0.9
1,Kentucky,436445,13.8,0.4,0.1,9.8,12.3,0.1,0.3,0.9
2,Tennessee,830484,12.8,0.6,0.1,8.7,8.4,0.1,0.3,1.4
3,North Carolina,943693,11.1,0.4,0.1,9.0,9.0,0.2,0.4,1.3
4,Virginia,643220,10.7,0.9,0.3,8.7,9.7,0.2,0.6,2.6
5,South Carolina,567277,10.5,0.3,0.2,7.9,8.3,0.1,0.4,0.9
6,Mississippi,308737,10.2,0.3,0.1,6.7,4.9,0.1,0.1,0.7
7,West Virginia,148071,9.8,0.4,0.1,9.4,14.3,0.1,0.3,0.5
8,Georgia,1081629,9.7,0.3,0.1,7.1,5.9,0.2,0.4,1.9
9,Arkansas,333186,9.4,0.2,0.2,7.2,8.4,0.1,0.1,1.4


## Query exploring the relationship between citizenship status and confirmed COVID-19 cases in each state, ordered from the state with the highest number of confirmed cases to the lowest.

In [9]:
%%bigquery
-- Ten states with the most confirmed cases, alongside citizenship-related
-- percentages for each state.
-- MAX() collapses the joined rows to a single row per state.
SELECT
    c.state,
    MAX(c.confirmed) AS confirmed,
    MAX(ct.percent_foreign_born) AS percent_foreign_born,
    MAX(ct.percent_noncitizen) AS percent_noncitizen,
    MAX(ct.percent_from_Asia) AS percent_from_Asia,
    MAX(ct.percent_from_Africa) AS percent_from_Africa
FROM datamart.Cases AS c
INNER JOIN datamart.Citizenship AS ct
    ON c.state = ct.state
GROUP BY c.state
ORDER BY confirmed DESC
LIMIT 10

Unnamed: 0,state,confirmed,percent_foreign_born,percent_noncitizen,percent_from_Asia,percent_from_Africa
0,California,3714587,26.7,46.4,40.2,2.1
1,Texas,2847101,17.1,60.4,22.8,5.9
2,Florida,2155744,21.1,42.6,10.6,1.7
3,New York,1986681,22.4,40.5,30.4,4.5
4,Illinois,1296267,13.9,46.1,31.6,4.4
5,Pennsylvania,1100857,7.0,44.3,40.2,8.9
6,Georgia,1081629,10.3,54.0,31.7,9.4
7,Ohio,1050112,4.8,47.6,42.8,17.1
8,New Jersey,971782,23.4,41.6,32.1,6.1
9,North Carolina,943693,8.4,58.2,27.7,9.1


## Wrap the queries into views and create the views in your reports dataset

In [4]:
dataset_id = 'reports'  # name of the BigQuery dataset passed to `bq mk --dataset`

In [5]:
!bq --location=US mk --dataset {dataset_id}

BigQuery error in mk operation: Dataset 'still-primer-302701:reports' already
exists.


In [10]:
%%bigquery
-- View: the 20 states with the most confirmed cases, alongside
-- citizenship-related percentages for each state.
-- CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE VIEW fails
-- once the view already exists.
CREATE OR REPLACE VIEW reports.Citizenship_Status_and_COVID_Confirmed_Cases_by_State AS
SELECT
    c.state,
    MAX(c.confirmed) AS confirmed,
    MAX(ct.percent_foreign_born) AS percent_foreign_born,
    MAX(ct.percent_noncitizen) AS percent_noncitizen,
    MAX(ct.percent_from_Asia) AS percent_from_Asia,
    MAX(ct.percent_from_Africa) AS percent_from_Africa
FROM datamart.Cases AS c
INNER JOIN datamart.Citizenship AS ct
    ON c.state = ct.state
GROUP BY c.state
ORDER BY confirmed DESC
LIMIT 20

In [11]:
%%bigquery
-- View: the 20 states with the highest American-ancestry percentage, alongside
-- each state's highest recorded confirmed value and other ancestry percentages.
-- CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE VIEW fails
-- once the view already exists.
CREATE OR REPLACE VIEW reports.Highest_American_Heritage_and_COVID_Confirmed_Cases_by_State AS
SELECT
    c.state,
    MAX(c.confirmed) AS confirmed,
    MAX(a.percent_American) AS american,
    MAX(a.percent_Arab) AS arabian,
    MAX(a.percent_Danish) AS danish,
    MAX(a.percent_English) AS english,
    MAX(a.percent_German) AS german,
    MAX(a.percent_Portuguese) AS portuguese,
    MAX(a.percent_Russian) AS russian,
    MAX(a.percent_Subsaharan_African) AS subsaharan_african
FROM datamart.Cases AS c
INNER JOIN datamart.Ancestry AS a
    ON c.state = a.state
GROUP BY c.state
ORDER BY MAX(a.percent_American) DESC
LIMIT 20

In [12]:
%%bigquery
-- View: the 10 states with the most confirmed cases among those whose
-- longitude is greater than -100, with each state's highest recorded deaths value.
-- CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE VIEW fails
-- once the view already exists.
CREATE OR REPLACE VIEW reports.Most_Confirmed_Cases_In_Eastern_United_States_by_State AS
SELECT
    c2.state,
    c2.deaths,
    c2.confirmed,
    l.latitude
FROM (
    -- One row per state: largest deaths and confirmed values found in Cases.
    SELECT
        c.state,
        MAX(c.deaths) AS deaths,
        MAX(c.confirmed) AS confirmed
    FROM datamart.Cases AS c
    GROUP BY c.state
) AS c2
INNER JOIN datamart.Location AS l
    ON c2.state = l.state
WHERE l.longitude > -100
ORDER BY c2.confirmed DESC
LIMIT 10