# Spark

### Download dataset

In [None]:
# Download the CSV that every script below reads from the working directory.
# The URL is quoted so the shell never treats the `?` in the query string as a glob character.
!wget -O natural_disasters_BIG.csv "https://www.dropbox.com/s/b74bpkbr89l8k43/natural_disasters_BIG.csv?dl=0"

### 1. How many disasters occurred in continent C?

In [1]:
%%file spark_1.py
"""Count disaster occurrences per continent.

Reads ``natural_disasters_BIG.csv`` from the working directory, sums the
occurrences column for every continent and prints one ``(continent, total)``
tuple per continent.
"""
import pyspark
from operator import add

# Column positions used by this script (continent and occurrences).
CONTINENT_COL = 5
OCCURRENCES_COL = 9

sc = pyspark.SparkContext('local[*]')
try:
    # import input CSV file
    lines = sc.textFile('natural_disasters_BIG.csv')
    # separate each line by its columns
    # NOTE(review): a plain split assumes no field contains a quoted comma - confirm against the data
    columns = lines.map(lambda line: line.split(","))
    # keep (continent, occurrences); converting here (not inside the reducer)
    # guarantees an int total even for a continent that appears in a single row
    continents = columns.map(lambda cols: (cols[CONTINENT_COL], int(cols[OCCURRENCES_COL])))
    # for each continent sum the number of occurrences
    occurrences = continents.reduceByKey(add)
    # print all results
    for oc in occurrences.collect():
        print(f"{oc}\n")
finally:
    # always release the Spark context; any failure propagates with its
    # traceback and a non-zero exit status instead of being swallowed
    sc.stop()

Overwriting spark_1.py


In [2]:
# Run the script through bash so its `time` keyword reports real/user/sys timings for the whole run.
!bash -c "time python spark_1.py"

19/11/12 14:57:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
('Europe', 362135)                                                              

('Oceania', 89256)

('Africa', 576686)

('Americas', 626963)

('Asia', 1191797)


real	0m13.909s
user	0m0.585s
sys	0m0.201s


### 2. In which regions were there disasters of type X?

In [3]:
%%file spark_2.py
"""List, for every disaster type, the distinct regions in which it occurred.

Reads ``natural_disasters_BIG.csv`` from the working directory and prints one
``(disaster_type, [regions])`` tuple per disaster type.
"""
import pyspark

# Column positions used by this script (disaster type and region).
TYPE_COL = 3
REGION_COL = 6

sc = pyspark.SparkContext('local[*]')
try:
    # import input CSV file
    lines = sc.textFile('natural_disasters_BIG.csv')
    # separate each line by its columns
    # NOTE(review): a plain split assumes no field contains a quoted comma - confirm against the data
    columns = lines.map(lambda line: line.split(","))
    # retrieve only the columns with disaster type and regions
    type_region = columns.map(lambda cols: (cols[TYPE_COL], cols[REGION_COL]))
    # drop repeated (type, region) pairs so each region is listed once per type
    distinct_pairs = type_region.distinct()
    # group the regions by disaster type and aggregate them in a list
    regions_by_type = distinct_pairs.groupByKey().mapValues(list)
    # print all results
    for type_x in regions_by_type.collect():
        print(f"{type_x}\n")
finally:
    # always release the Spark context; any failure propagates with its
    # traceback and a non-zero exit status instead of being swallowed
    sc.stop()

Overwriting spark_2.py


In [5]:
# Run the script through bash so its `time` keyword times the whole run.
!bash -c "time python spark_2.py"

19/11/12 14:58:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
('Transport accident', ['Western Europe', 'Western Africa', 'Southern Africa', 'Middle Africa', 'South-Eastern Asia', 'Northern America', 'South America', 'Northern Africa', 'Central America', 'Western Asia', 'Polynesia', 'Northern Europe', 'Eastern Europe', 'Southern Europe', 'Southern Asia', 'Eastern Africa', 'Melanesia', 'Eastern Asia', 'Australia and New Zealand', 'Caribbean', 'Russian Federation', 'Central Asia', 'Micronesia'])

('Drought', ['Eastern Asia', 'South America', 'Australia and New Zealand', 'Northern Africa', 'Western Asia', 'Southern Africa', 'Middle Africa', 'Russian Federation', 'South-Eastern Asia', 'Central Asia', 'No

### 3. What are the probabilities of getting injured or dying in a natural disaster of type T in the continent C during decade D (190x, 191x, 192x, ..., 199x, 200x, 201x)?

In [4]:
%%file spark_3.py
"""Death and injury ratios per (decade, disaster type, continent).

Reads ``natural_disasters_BIG.csv`` from the working directory. For every
combination of decade, disaster type and continent it sums deaths, injured
and total affected, then prints
``('<decade>,<type>,<continent>', (deaths / affected, injured / affected))``.

NOTE(review): the ratios are not capped at 1 - a group whose summed deaths or
injured exceed its summed total affected yields a value above 1. Confirm that
"total affected" is the intended denominator.
"""
import pyspark

# Column positions used by this script.
YEAR_COL = 0
TYPE_COL = 3
CONTINENT_COL = 5
DEATHS_COL = 10
INJURED_COL = 11
AFFECTED_COL = 14
REQUIRED_COLS = (TYPE_COL, CONTINENT_COL, DEATHS_COL, INJURED_COL, AFFECTED_COL)


def has_required_fields(cols):
    """Return True when the row is long enough and every column of interest is non-empty."""
    return len(cols) > AFFECTED_COL and all(cols[i] for i in REQUIRED_COLS)


def to_keyed_counts(cols):
    """Map a row to ``(key, (deaths, injured, affected))`` with integer counts.

    The key is ``"<decade>,<type>,<continent>"``; the decade is the year with
    its last digit replaced by ``0`` (e.g. ``1987`` -> ``1980``).
    """
    decade = f"{cols[YEAR_COL][:-1]}0"
    key = f"{decade},{cols[TYPE_COL]},{cols[CONTINENT_COL]}"
    return key, (int(cols[DEATHS_COL]), int(cols[INJURED_COL]), int(cols[AFFECTED_COL]))


def add_counts(left, right):
    """Element-wise sum of two ``(deaths, injured, affected)`` tuples."""
    return tuple(a + b for a, b in zip(left, right))


def to_ratios(totals):
    """Turn summed ``(deaths, injured, affected)`` into ``(death_ratio, injured_ratio)``."""
    deaths, injured, affected = totals
    return deaths / affected, injured / affected


sc = pyspark.SparkContext('local[*]')
try:
    # import input CSV file
    lines = sc.textFile('natural_disasters_BIG.csv')
    # separate each line by its columns and drop rows with an empty column of interest
    # NOTE(review): a plain split assumes no field contains a quoted comma - confirm against the data
    rows = lines.map(lambda line: line.split(",")).filter(has_required_fields)

    # one pass and one shuffle: sum deaths, injured and total affected together
    # for each combination of decade, disaster type and continent
    totals = rows.map(to_keyed_counts).reduceByKey(add_counts)

    # a group whose total affected is 0 has no defined ratio; skip it rather
    # than abort the whole job with a ZeroDivisionError
    joint_probs = totals.filter(lambda kv: kv[1][2] != 0).mapValues(to_ratios)

    # print all results
    for prob in joint_probs.collect():
        print(f"{prob}\n")
finally:
    # always release the Spark context; any failure propagates with its
    # traceback and a non-zero exit status instead of being swallowed
    sc.stop()
    


Overwriting spark_3.py


In [6]:
# Run the script through bash so its `time` keyword times the whole run.
!bash -c "time python spark_3.py"

19/11/12 14:58:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
('1920,Earthquake,Asia', (0.6777652879787639, 0.5234847966759577))              

('1930,Industrial accident,Europe', (1.57791932059448, 1.0339702760084926))

('1960,Storm,Americas', (0.002320734689153696, 0.007446561348323535))

('1970,Miscellaneous accident,Europe', (0.4377650851099926, 0.9932986484193859))

('1970,Landslide,Asia', (0.06241701579116984, 0.004357073799548823))

('1970,Volcanic activity,Europe', (0.3935860058309038, 0.9628279883381924))

('1980,Earthquake,Europe', (0.010266170106388674, 0.008806389354177605))

('1980,Flood,Americas', (0.0003943165465470656, 0.0036641335095017203))

('1980,Storm,Asia', (0.000168270389238912

### 4. Optional Exercise. What are the mean total damage and the most costly disaster's expenses for a certain disaster subgroup in each country?

In [5]:
%%file spark_4.py
"""Mean and maximum damage per (country, disaster subgroup).

Reads ``natural_disasters_BIG.csv`` from the working directory. For every
combination of country and disaster subgroup it prints
``('<country>,<subgroup>', (mean_damage, max_damage))`` where ``mean_damage``
is the summed total damage divided by the summed occurrences and
``max_damage`` is the largest single-row total damage.
"""
import pyspark

# Column positions used by this script.
SUBGROUP_COL = 2
COUNTRY_COL = 8
OCCURRENCES_COL = 9
DAMAGE_COL = 15


def has_damage(cols):
    """Return True when the row is long enough and its total-damage column is non-empty."""
    return len(cols) > DAMAGE_COL and len(cols[DAMAGE_COL]) > 0


def to_keyed_stats(cols):
    """Map a row to ``(key, (damage_sum, occurrences_sum, damage_max))``.

    Values are converted to numbers here, so a group made of a single row
    still ends up with numeric (not string) results.
    """
    damage = float(cols[DAMAGE_COL])
    key = f"{cols[COUNTRY_COL]},{cols[SUBGROUP_COL]}"
    return key, (damage, int(cols[OCCURRENCES_COL]), damage)


def merge_stats(left, right):
    """Combine two partial ``(damage_sum, occurrences_sum, damage_max)`` tuples."""
    return left[0] + right[0], left[1] + right[1], max(left[2], right[2])


def to_mean_and_max(stats):
    """Turn ``(damage_sum, occurrences_sum, damage_max)`` into ``(mean_damage, damage_max)``."""
    damage_sum, occurrences_sum, damage_max = stats
    return damage_sum / occurrences_sum, damage_max


sc = pyspark.SparkContext('local[*]')
try:
    # import input CSV file
    lines = sc.textFile('natural_disasters_BIG.csv')
    # separate each line by its columns and drop rows without a total damage value
    # NOTE(review): a plain split assumes no field contains a quoted comma - confirm against the data
    rows = lines.map(lambda line: line.split(",")).filter(has_damage)

    # one pass and one shuffle: total damage, total occurrences and maximum
    # damage for each combination of country and disaster subgroup
    stats = rows.map(to_keyed_stats).reduceByKey(merge_stats)

    # a group with 0 summed occurrences has no defined mean; skip it rather
    # than abort the whole job with a ZeroDivisionError
    joint_rdd = stats.filter(lambda kv: kv[1][1] != 0).mapValues(to_mean_and_max)

    # print all results
    for damage in joint_rdd.collect():
        print(f"{damage}\n")
finally:
    # always release the Spark context; any failure propagates with its
    # traceback and a non-zero exit status instead of being swallowed
    sc.stop()


Overwriting spark_4.py


In [6]:
# Run the script through bash so its `time` keyword times the whole run.
!bash -c "time python spark_4.py"

19/11/14 11:48:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
('United States of America (the),Hydrological', (571690.463576159, 12600000.0)) 

('Ethiopia,Geophysical', (3535.0, 6750.0))

('Chile,Geophysical', (1290252.5, 30000000.0))

('Hong Kong,Meteorological', (58578.71428571428, 755500.0))

('Italy,Geophysical', (2108498.153846154, 20000000.0))

('Taiwan (Province of China),Meteorological', (100978.39344262295, 1300000.0))

('Philippines (the),Meteorological', (68841.74198757764, 10136563.0))

('Canada,Climatological', (1388409.0909090908, 4200000.0))

('Turkey,Technological', (139000.0, 178000.0))

('Germany Fed Rep,Meteorological', (348300.0, 1300000.0))

('New Zealand,Geophysical', (3585958.6