In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("RDDAssignment2").getOrCreate()

In [3]:
cases_rdd = spark.sparkContext.textFile("/public/trendytech/covid19/cases/covid_dataset_cases.csv")

<H3> Cases Header: </H3>
      date|state|positive|negative|pending|hospitalizedCurrently|hospitalizedCumulative|inIcuCurrently|inIcuCumulative|onVentilatorCurrently|onVentilatorCumulative|recovered|dataQualityGrade|lastUpdateEt|dateModified|checkTimeEt|death|hospitalized|dateChecked|totalTestsViral|positiveTestsViral|negativeTestsViral|positiveCasesViral|deathConfirmed|deathProbable|fips|positiveIncrease|negativeIncrease|total|totalTestResults|totalTestResultsIncrease|posNeg|deathIncrease|hospitalizedIncrease|hash|commercialScore|negativeRegularScore|negativeScore|positiveScore|score|grade

In [4]:
states_rdd = spark.sparkContext.textFile("/public/trendytech/covid19/states/covid_dataset_states.csv")

<H3> States Header:</H3>
    
    state|notes|covid19Site|covid19SiteSecondary|covid19SiteTertiary|twitter|covid19SiteOld|name|fips|pui|pum

In [5]:
cases_rdd.take(5)

['20200122,AP,2,0,48,26,15,18,2,38,10,34,B,18,19/05/2022,23,24,29,34,19,45,5,44,42,49,53,0,0,2,2,0,2,0,0,8f8db794931706272489cddd51e917a4a69c8c9b,0,0,0,0,0',
 '20200123,AP,2,0,48,41,2,20,30,40,5,50,B,1,08/11/2022,14,7,33,36,14,18,36,37,45,8,53,0,0,2,2,0,2,0,0,e16af2a6a8f060355ff5ba499a28309a262c0b1e,0,0,0,0,0',
 '20200124,HP,2,0,16,14,5,29,43,22,11,11,D,31,17/05/2022,10,37,11,25,45,25,2,32,30,41,53,0,0,2,2,0,2,0,0,094154f68e74bfc30b977cdee888f9c07be4360e,0,0,0,0,0',
 '20200125,HP,2,0,10,13,41,50,26,19,34,8,D,40,07/10/2022,32,5,33,9,50,31,18,38,7,16,53,0,0,2,2,0,2,0,0,9b52ca94dd2a996822542ea5f17a7363e7ad91cf,0,0,0,0,0',
 '20200126,AS,2,0,15,43,23,45,20,46,15,30,D,31,28/12/2022,22,14,1,29,2,24,15,12,9,10,53,0,0,2,2,0,2,0,0,7acb526e14f20a29cc74a0b32a37328bc6eac6c2,0,0,0,0,0']

In [6]:
states_rdd.take(5)

['HP,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@HPCovid,https://arcg.is/0brSGj,null,53,,',
 'AS,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@ASCovid,null,null,6,null,null',
 'HR,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@HRCovid,null,null,9,null,null',
 'KA,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@KACovid,null,null,53,null,null',
 'WA,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@WACovid,null,null,44,null,null']

<H3> 1. Top 10 States with Highest No. of Positive Cases

In [7]:
statewise_positive_cases = cases_rdd.map(lambda x: (x.split(",")[1],int(x.split(",")[2])))

In [8]:
agg_pos_cases = statewise_positive_cases.reduceByKey(lambda x,y: x+y)

In [9]:
agg_pos_cases.sortBy(lambda x: x[1], ascending = False).take(10)

[('WA', 1701),
 ('GA', 1017),
 ('MH', 730),
 ('MI', 61),
 ('CA', 53),
 ('GJ', 35),
 ('BR', 23),
 ('JH', 13),
 ('CG', 8),
 ('RI', 6)]

### 2. Total count of people in ICU currently

In [10]:
patient_in_icu = cases_rdd.map(lambda x: int(x.split(",")[7])).sum()

In [11]:
print("Patient in ICU: {}".format(patient_in_icu))

Patient in ICU: 1344


### 3. Top 15 States having maximum no. of recovered patients

In [12]:
statewise_recovered_patients = cases_rdd.map(lambda x: (x.split(",")[1],int(x.split(",")[11])))

In [13]:
statewise_recovered_patients.reduceByKey(lambda x,y: x+y). \
                                sortBy(lambda x: x[1],ascending = False). \
                                take(15)

[('WA', 451),
 ('MH', 165),
 ('MI', 101),
 ('GA', 87),
 ('AP', 84),
 ('RI', 72),
 ('BR', 68),
 ('JH', 50),
 ('KA', 43),
 ('AZ', 38),
 ('AS', 30),
 ('GJ', 27),
 ('CA', 23),
 ('HR', 20),
 ('HP', 19)]

### 4. Three States with least no. of confirmed deaths

In [14]:
statewise_deaths = cases_rdd.map(lambda x: (x.split(",")[1],int(x.split(",")[23])))

In [15]:
statewise_deaths.reduceByKey(lambda x,y: x+y).sortBy(lambda x: x[1], ascending = True).take(3)

[('AS', 9), ('JH', 10), ('CG', 31)]

### 5. Total No. of people hospitalized currently

In [16]:
cases_rdd.map(lambda x: int(x.split(",")[5])).sum()

1319

### 6. Print the Twitter handle and fips code of the top 15 states with highest no. of total cases

In [17]:
statewise_total_patients = cases_rdd.map(lambda x: (x.split(",")[1],int(x.split(",")[28])))

In [20]:
top15_by_total_count = statewise_total_patients.reduceByKey(lambda x,y: x+y). \
                          sortBy(lambda x: x[1],ascending = False)

In [32]:
top15_by_total_count.take(15)

[('WA', 2100),
 ('GA', 1034),
 ('MH', 730),
 ('CA', 515),
 ('MI', 61),
 ('GJ', 35),
 ('AZ', 34),
 ('BR', 23),
 ('RI', 16),
 ('JH', 13),
 ('CG', 8),
 ('KA', 5),
 ('AP', 4),
 ('HP', 4),
 ('AS', 2)]

In [33]:
# Getting the twitter handle and fips code of states

twitter_fips = states_rdd.map(lambda x: (x.split(",")[0],(x.split(",")[5],x.split(",")[8])))

In [34]:
states_rdd.map(lambda x: (x.split(",")[0],(x.split(",")[5],x.split(",")[8]))).take(10)

[('HP', ('@HPCovid', '53')),
 ('AS', ('@ASCovid', '6')),
 ('HR', ('@HRCovid', '9')),
 ('KA', ('@KACovid', '53')),
 ('WA', ('@WACovid', '44')),
 ('CG', ('@CGCovid', '53')),
 ('BR', ('@BRCovid', '53')),
 ('JH', ('@JHCovid', '53')),
 ('GJ', ('@GJCovid', '44')),
 ('MH', ('@MHCovid', '26'))]

In [36]:
top15_by_total_count.join(twitter_fips).sortBy(lambda x: x[1][0], False).collect()

[('WA', (2100, ('@WACovid', '44'))),
 ('GA', (1034, ('@GACovid', '44'))),
 ('MH', (730, ('@MHCovid', '26'))),
 ('CA', (515, ('@CACovid', '4'))),
 ('MI', (61, ('@MICovid', '53'))),
 ('GJ', (35, ('@GJCovid', '44'))),
 ('AZ', (34, ('@AZCovid', '53'))),
 ('BR', (23, ('@BRCovid', '53'))),
 ('RI', (16, ('@RICovid', '26'))),
 ('JH', (13, ('@JHCovid', '53'))),
 ('CG', (8, ('@CGCovid', '53'))),
 ('KA', (5, ('@KACovid', '53'))),
 ('HP', (4, ('@HPCovid', '53'))),
 ('AS', (2, ('@ASCovid', '6'))),
 ('HR', (2, ('@HRCovid', '9')))]