In [1]:
import pyspark
from pyspark.sql import SQLContext, DataFrameWriter
from pyspark.sql.functions import hour, when, col, date_format, to_timestamp, round, trim, to_date, unix_timestamp
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Project"))
# The rest of the notebook reads its CSV inputs through `sqlContext`.
sqlContext = SQLContext(sc)

In [2]:
def load_data(path="uszips.csv"):
    """Read the ZIP-code reference CSV into a Spark DataFrame.

    The file is read with its first line as the header and without schema
    inference, so every column comes back as a string.

    Args:
        path: Location of the CSV file. Defaults to ``"uszips.csv"``, the
            file this notebook has always read.

    Returns:
        A DataFrame with one row per line of the CSV.
    """
    return sqlContext.read.option("header", True).csv(path)

In [3]:
# Load the ZIP table, strip surrounding whitespace from the state and county
# names, and add `int_fips`, an integer copy of the string `county_fips`.
df = (
    load_data()
    .withColumn("state_name", trim(col("state_name")))
    .withColumn("county_name", trim(col("county_name")))
    .withColumn("int_fips", col("county_fips").cast("int"))
)
df.printSchema()
df.count()

root
 |-- zip: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state_id: string (nullable = true)
 |-- state_name: string (nullable = true)
 |-- zcta: string (nullable = true)
 |-- parent_zcta: string (nullable = true)
 |-- population: string (nullable = true)
 |-- density: string (nullable = true)
 |-- county_fips: string (nullable = true)
 |-- county_name: string (nullable = true)
 |-- county_weights: string (nullable = true)
 |-- county_names_all: string (nullable = true)
 |-- county_fips_all: string (nullable = true)
 |-- imprecise: string (nullable = true)
 |-- military: string (nullable = true)
 |-- timezone: string (nullable = true)
 |-- int_fips: integer (nullable = true)



33099

In [4]:
def load_covid_data(path="us-counties.csv"):
    """Read the county-level COVID CSV into a Spark DataFrame.

    The file is read with its first line as the header and without schema
    inference, so every column comes back as a string.

    Args:
        path: Location of the CSV file. Defaults to ``"us-counties.csv"``,
            the file this notebook has always read.

    Returns:
        A DataFrame with one row per line of the CSV.
    """
    return sqlContext.read.option("header", True).csv(path)

In [5]:
# Clean the COVID rows: trim the state/county names first, then drop the
# "Unknown" county bucket. Trimming before filtering means a padded value
# such as "Unknown " is removed as well instead of slipping through.
df1 = (
    load_covid_data()
    .withColumn("state", trim(col("state")))
    .withColumn("county", trim(col("county")))
    .filter(col("county") != "Unknown")
)
df1.count()

622957

In [10]:
 # Per-county population totals built from the ZIP table. Columns get a
 # `pop_` prefix so they stay distinguishable from the ZIP-level columns
 # when this frame is joined back onto the ZIP rows.
 county_pop = df.groupBy("state_name", "county_name", "county_fips").agg({"population": "sum"})
 column_renames = [
     ("sum(population)", "total_pop"),
     ("county_name", "pop_county_name"),
     ("state_name", "pop_state_name"),
     ("county_fips", "pop_fips"),
 ]
 for old_name, new_name in column_renames:
     county_pop = county_pop.withColumnRenamed(old_name, new_name)
 df1 = county_pop.orderBy("pop_state_name", "pop_county_name")

 # Spot checks: first 20 counties, the Baltimore rows, and the schema.
 df1.show(20)
 df1.filter(col("pop_county_name") == "Baltimore").show()
 df1.printSchema()
 

+--------------+---------------+--------+---------+
|pop_state_name|pop_county_name|pop_fips|total_pop|
+--------------+---------------+--------+---------+
|       Alabama|        Autauga|   01001|  57046.0|
|       Alabama|        Baldwin|   01003| 207694.0|
|       Alabama|        Barbour|   01005|  25278.0|
|       Alabama|           Bibb|   01007|  22859.0|
|       Alabama|         Blount|   01009|  45030.0|
|       Alabama|        Bullock|   01011|  10341.0|
|       Alabama|         Butler|   01013|  19770.0|
|       Alabama|        Calhoun|   01015| 111533.0|
|       Alabama|       Chambers|   01017|  35074.0|
|       Alabama|       Cherokee|   01019|  28103.0|
|       Alabama|        Chilton|   01021|  35064.0|
|       Alabama|        Choctaw|   01023|  12712.0|
|       Alabama|         Clarke|   01025|  25689.0|
|       Alabama|           Clay|   01027|  12391.0|
|       Alabama|       Cleburne|   01029|  14500.0|
|       Alabama|         Coffee|   01031|  54456.0|
|       Alab

In [11]:
# Keep only ZIP codes that report a positive population (the string column
# is compared numerically by Spark).
valid_zip = df.filter(col("population") > 0)

# Attach each ZIP's county-level total population, matching on county FIPS
# and state name.
county_match = (valid_zip.county_fips == df1.pop_fips) & (valid_zip.state_name == df1.pop_state_name)
zip_pop_join = valid_zip.join(df1, county_match)
zip_pop_join.printSchema()

# Spot check on the Baltimore rows.
results = zip_pop_join.filter(col("county_name") == "Baltimore")\
         .select("state_name", "zip", "lat", "lng", "city", "county_name", "population", "total_pop" )

# Previously a bare `zip_pop_join.count()` sat in the middle of the cell: it
# ran a full job and the number was thrown away because only a cell's last
# expression is displayed. Print it so the job is not wasted.
print(zip_pop_join.count())
results.show()


root
 |-- zip: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state_id: string (nullable = true)
 |-- state_name: string (nullable = true)
 |-- zcta: string (nullable = true)
 |-- parent_zcta: string (nullable = true)
 |-- population: string (nullable = true)
 |-- density: string (nullable = true)
 |-- county_fips: string (nullable = true)
 |-- county_name: string (nullable = true)
 |-- county_weights: string (nullable = true)
 |-- county_names_all: string (nullable = true)
 |-- county_fips_all: string (nullable = true)
 |-- imprecise: string (nullable = true)
 |-- military: string (nullable = true)
 |-- timezone: string (nullable = true)
 |-- int_fips: integer (nullable = true)
 |-- pop_state_name: string (nullable = true)
 |-- pop_county_name: string (nullable = true)
 |-- pop_fips: string (nullable = true)
 |-- total_pop: double (nullable = true)

+----------+-----+--------+---------+-----------

In [12]:
from pyspark.sql.functions import from_unixtime
# Last reporting date kept in the normalised output.
END_DATE = "2020-09-30"

# Reload the county-level COVID data with numeric types. State and county
# names are trimmed here as well: the join below compares `county` with the
# already-trimmed ZIP-table `county_name`, so stray padding would silently
# drop rows from the inner join.
covid_data = (
    load_covid_data()
    .withColumn("state", trim(col("state")))
    .withColumn("county", trim(col("county")))
    .withColumn("cases", col("cases").cast('float'))
    .withColumn("deaths", col("deaths").cast('float'))
    .withColumn("fips", col("fips").cast('int'))
)
covid_data.printSchema()

# Lazy lookup frames kept for ad-hoc inspection (nothing runs until .show()).
results1 = covid_data.filter(col("county") == "Suffolk")
results2 = zip_pop_join.filter(col("county_name") == "Suffolk")\
                       .select("state_name", "zip","county_fips", "lat", "lng", "city", "county_name", "population", "total_pop" )

# One output row per (COVID county-day, ZIP in that county); matching needs
# both the integer FIPS code and the county name to agree.
county_key = (covid_data.fips == zip_pop_join.int_fips) & (covid_data.county == zip_pop_join.county_name)
norm_covid = covid_data.join(zip_pop_join, county_key, 'inner')

sample = norm_covid.filter(col("county_name") == "Suffolk")\
             .select("date", "state_name","county_fips", "zip", "lat", "lng", "city", "county_name", "population", "total_pop" )
sample.show(10)

# Apportion county counts to each ZIP by its share of the county population:
# county_value * zip_population / county_total_population, rounded and then
# stored as an integer. The county-level columns are dropped and the per-ZIP
# values take over the names `cases` / `deaths`.
norm_covid = (
    norm_covid
    .withColumn('norm_cases', round(col("cases") * col("population") / col("total_pop")).cast('int'))
    .withColumn('norm_deaths', round(col("deaths") * col("population") / col("total_pop")).cast('int'))
    .withColumn("date", col("date").cast('Date'))
    .drop("cases", "deaths")
    .filter(col('date') <= END_DATE)
    .withColumnRenamed('norm_cases', 'cases')
    .withColumnRenamed('norm_deaths', 'deaths')
)

norm_covid.printSchema()
norm_covid.show()
norm_covid.count()


root
 |-- date: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- fips: integer (nullable = true)
 |-- cases: float (nullable = true)
 |-- deaths: float (nullable = true)

+----------+-------------+-----------+-----+--------+---------+------+-----------+----------+---------+
|      date|   state_name|county_fips|  zip|     lat|      lng|  city|county_name|population|total_pop|
+----------+-------------+-----------+-----+--------+---------+------+-----------+----------+---------+
|2020-10-13|Massachusetts|      25025|02108|42.35767|-71.06505|Boston|    Suffolk|      4082| 786312.0|
|2020-10-12|Massachusetts|      25025|02108|42.35767|-71.06505|Boston|    Suffolk|      4082| 786312.0|
|2020-10-11|Massachusetts|      25025|02108|42.35767|-71.06505|Boston|    Suffolk|      4082| 786312.0|
|2020-10-10|Massachusetts|      25025|02108|42.35767|-71.06505|Boston|    Suffolk|      4082| 786312.0|
|2020-10-09|Massachusetts|      25025|02108|4

6197581

In [14]:
# Columns exported per ZIP and day, ordered by date, ZIP and state.
data = norm_covid.select("date", "state_name", "zip", "county_fips", "city", "cases", "deaths")\
      .orderBy("date", "zip", "state_name")

# repartition(1) shuffles the rows into a single partition and does not keep
# the orderBy ordering, so the sort is re-applied inside that partition
# before writing. mode="overwrite" lets the cell be re-run when the output
# directory already exists instead of failing.
(
    data.repartition(1)
    .sortWithinPartitions("date", "zip", "state_name")
    .write.csv('covid_zip1.csv', header=True, mode="overwrite")
)
data.count()

6197581

In [15]:
# Phoenix, AZ slice of the per-ZIP export.
# NOTE(review): `state_id` is not one of the columns selected into `data`;
# the filter presumably resolves against the underlying joined plan - confirm
# this is intended rather than filtering on `state_name`.
data1 = data.filter((col("state_id") == "AZ") & (col("city") == "Phoenix"))\
         .select("date", "state_name", "zip", "county_fips", "city", "cases", "deaths")

# repartition(1) does not keep the upstream ordering, so sort inside the
# single partition before writing; mode="overwrite" keeps the cell re-runnable.
(
    data1.repartition(1)
    .sortWithinPartitions("date", "zip", "state_name")
    .write.csv('phoenix1.csv', header=True, mode="overwrite")
)
data1.count()

10458

In [16]:
# Baltimore, MD slice of the per-ZIP export.
# NOTE(review): `state_id` is not one of the columns selected into `data`;
# the filter presumably resolves against the underlying joined plan - confirm
# this is intended rather than filtering on `state_name`.
data2 = data.filter((col("state_id") == "MD") & (col("city") == "Baltimore"))\
         .select("date", "state_name", "zip", "county_fips", "city", "cases", "deaths")

# repartition(1) does not keep the upstream ordering, so sort inside the
# single partition before writing; mode="overwrite" keeps the cell re-runnable.
(
    data2.repartition(1)
    .sortWithinPartitions("date", "zip", "state_name")
    .write.csv('balt1.csv', header=True, mode="overwrite")
)
data2.count()

408

In [17]:
# Boston, MA slice of the per-ZIP export.
# NOTE(review): `state_id` is not one of the columns selected into `data`;
# the filter presumably resolves against the underlying joined plan - confirm
# this is intended rather than filtering on `state_name`.
data3 = data.filter((col("state_id") == "MA") & (col("city") == "Boston"))\
         .select("date", "state_name", "zip", "county_fips", "city", "cases", "deaths")

# repartition(1) does not keep the upstream ordering, so sort inside the
# single partition before writing; mode="overwrite" keeps the cell re-runnable.
(
    data3.repartition(1)
    .sortWithinPartitions("date", "zip", "state_name")
    .write.csv('boston1.csv', header=True, mode="overwrite")
)
data3.count()

3402

In [18]:
# Denver, CO slice of the per-ZIP export.
# NOTE(review): `state_id` is not one of the columns selected into `data`;
# the filter presumably resolves against the underlying joined plan - confirm
# this is intended rather than filtering on `state_name`.
data4 = data.filter((col("state_id") == "CO") & (col("city") == "Denver"))\
         .select("date", "state_name", "zip", "county_fips", "city", "cases", "deaths")

# repartition(1) does not keep the upstream ordering, so sort inside the
# single partition before writing; mode="overwrite" keeps the cell re-runnable.
(
    data4.repartition(1)
    .sortWithinPartitions("date", "zip", "state_name")
    .write.csv('denver1.csv', header=True, mode="overwrite")
)
data4.count()

7916