In [2]:
import pyspark.sql.functions as fn
import pyspark.sql.types
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [57]:
# 1. Keeps the year columns from 2005-2020
# 2. Unpivots the year columns into (Year, Value) rows
# 3. Filters to the chosen countries
def filterCountryData(df, countries_chosen, start_year=2005, end_year=2020):
    """Filter the wide indicator frame to the chosen countries and unpivot its year columns.

    Args:
        df: Spark DataFrame with "Country Name", "Country Code",
            "Indicator Name", "Indicator Code" and one column per year, each
            named by the year as a string (e.g. "2005").
        countries_chosen: list of "Country Name" values to keep.
        start_year: first year column to keep (inclusive). Defaults to 2005.
        end_year: last year column to keep (inclusive). Defaults to 2020.

    Returns:
        Spark DataFrame with columns "Country Name", "Country Code",
        "Indicator Name", "Year" (the year as a string literal) and "Value",
        holding one row per input row and year.

    Raises:
        ValueError: if ``end_year`` is earlier than ``start_year``.
    """
    if end_year < start_year:
        # An empty range would produce an invalid `stack(0, )` expression below.
        raise ValueError("end_year must not be earlier than start_year")

    # Year columns are addressed by their string names.
    years = [str(year) for year in range(start_year, end_year + 1)]

    cols = ["Country Name", "Country Code", "Indicator Name", "Indicator Code"] + years
    selected = df.select(cols)

    # Keep only the chosen countries and replace missing year values with 0.00.
    # NOTE(review): after this step a missing observation is indistinguishable
    # from a genuine zero - confirm that is acceptable downstream.
    chosen = (
        selected
        .filter(fn.col("Country Name").isin(countries_chosen))
        .fillna(0.00, subset=years)
    )

    # Unpivot: stack(n, 'y1', `y1`, 'y2', `y2`, ...) emits one (Year, Value)
    # row per year column; backticks are needed because the names are numeric.
    unpivotStr = [" '{t}',`{t}`".format(t=year) for year in years]
    unpivotExpr = "stack(" + str(len(years)) + ", " + ",".join(unpivotStr) + ") as (Year, Value)"
    res = chosen.select("Country Name", "Country Code", "Indicator Name", fn.expr(unpivotExpr))

    #TODO: join the dimensions to make a fact table

    return res

In [55]:
# Date Dimension

def dateDimension(start_year=2005, end_year=2020):
    """Build the date dimension with one row per year.

    Uses the notebook-level ``spark`` session, which must exist before this
    function is called.

    Args:
        start_year: first year in the dimension (inclusive). Defaults to 2005.
        end_year: last year in the dimension (inclusive). Defaults to 2020.

    Returns:
        Spark DataFrame with columns ``year_code``, ``decade``, ``year`` in
        that fixed order.
    """
    year_df = spark.range(start_year, end_year + 1).withColumnRenamed('id', 'year')

    year_in_decade = fn.col("year") % 10
    decade_start = fn.col("year") - year_in_decade

    keyed = (year_df
             # Years ending in 5-9 are assigned to the following decade,
             # years ending in 0-4 to the decade they start (2005 -> 2010, 2014 -> 2010).
             .withColumn("decade",
                         fn.when(year_in_decade >= 5, decade_start + 10)
                           .otherwise(decade_start))
             # Surrogate key: unique and increasing, but not consecutive.
             .withColumn("year_code", fn.monotonically_increasing_id())
            )

    # Select columns explicitly: the previous set-difference made the column
    # order depend on set iteration order, which can vary between kernels.
    date_dim = keyed.select("year_code", "decade", "year")

    return date_dim

In [100]:
def naturalDisasterDim(df,filePath,countries_chosen):
    """Build the natural-disaster dimension and its (key, country, year) lookup.

    Uses the notebook-level ``spark`` session to read the disaster CSV.

    Args:
        df: Spark DataFrame joined to the disaster data on ``Year``; only
            disaster rows whose year matches a row in ``df`` are kept (the
            notebook passes the date dimension here).
        filePath: path of the disaster CSV (header row, schema inferred).
        countries_chosen: country names to keep; compared case-insensitively.

    Returns:
        Tuple ``(natural_disaster_dimension, lookup)``: the dimension holds
        ``NaturalDisasterKey`` plus the renamed descriptive columns, and
        ``lookup`` holds ``NaturalDisasterKey``, ``Country`` and ``Year``.
    """
    natural_disaster_df = (spark
                           .read
                           .format('csv')
                           .option("inferSchema", True)
                           .option("header", True)
                           .load(filePath))

    # Lower-case country names and collapse every name containing
    # "united states" into the single value "united states".
    country_lower = fn.lower(fn.col("Country"))
    tmp_nd = natural_disaster_df.withColumn(
        "Country",
        fn.when(country_lower.contains("united states"), "united states")
          .otherwise(country_lower),
    )

    # Inner join restricts disasters to the years present in `df`; only the
    # disaster columns are carried forward.
    nd_j_on_date = df.join(tmp_nd, ['Year']).select(*tmp_nd.columns)

    chosen_lower = [name.lower() for name in countries_chosen]
    tmp = (nd_j_on_date
           .filter(fn.col("Country").isin(chosen_lower))
           # Surrogate key: unique and increasing, but not consecutive.
           .withColumn("NaturalDisasterKey", fn.monotonically_increasing_id())
          )

    # NOTE(review): `lookup` and the dimension are both derived lazily from the
    # same uncached frame, and the generated key depends on partitioning -
    # confirm the keys stay aligned between the two results (persisting `tmp`
    # first is one option).
    lookup = tmp.select("NaturalDisasterKey","Country","Year")

    natural_disaster_dimension = tmp.select(
        fn.col("NaturalDisasterKey"),
        fn.col("country").alias("region"),
        fn.col("Disaster Type").alias("disaster_type"),
        # NOTE(review): "diaster_subtype" is misspelled; kept as-is because
        # renaming it would change the returned schema.
        fn.col("disaster subtype").alias("diaster_subtype"),
        fn.col("disaster subsubtype").alias("disaster_nestedsubtype"),
        fn.col("disaster subgroup").alias("disaster_subgroup"),
        fn.col("event name").alias("event_name"),
        fn.col("no injured").alias("ttl_injured"),
        fn.col("no affected").alias("ttl_affected"),
        fn.col("no homeless").alias("ttl_homeless"),
        fn.col("Total Damages ('000 US$)").alias("ttl_damaged_usd_thousands"),
        fn.col("ofda response").alias("ofda_response"),
    )

    return natural_disaster_dimension,lookup

In [101]:
#MAIN block
# Countries kept in every derived table.
countries_chosen = [
    "United States",
    "Canada",
    "Mexico",
    "Thailand",
    "Finland",
    "Nigeria",
    "Somalia",
    "Norway",
    "Japan",
]
spark = SparkSession.builder.appName("ds_datastage").getOrCreate()

# Load the HNP stats CSV with a header row and inferred column types.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("AssignmentData/HNP_StatsData.csv")
)

# Filtered, unpivoted country data and the two dimensions built from it.
filterdCountryDf = filterCountryData(df, countries_chosen)
dateDim = dateDimension()
naturalDisasterDimension, nd_lookup = naturalDisasterDim(
    df=dateDim,
    filePath="AssignmentData/ExternalSources/DISASTERS/1900_2021_DISASTERS.xlsx - emdat data.csv",
    countries_chosen=countries_chosen,
)

# Collect each result to pandas for rich display in the notebook.
display(filterdCountryDf.toPandas())
display(dateDim.toPandas())
display(naturalDisasterDimension.toPandas())
display(nd_lookup.toPandas())

['Year', 'Seq', 'Glide', 'Disaster Group', 'Disaster Subgroup', 'Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype', 'Event Name', 'Country', 'ISO', 'Region', 'Continent', 'Location', 'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response', 'Appeal', 'Declaration', 'Aid Contribution', 'Dis Mag Value', 'Dis Mag Scale', 'Latitude', 'Longitude', 'Local Time', 'River Basin', 'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month', 'End Day', 'Total Deaths', 'No Injured', 'No Affected', 'No Homeless', 'Total Affected', "Insured Damages ('000 US$)", "Total Damages ('000 US$)", 'CPI', 'Adm Level', 'Admin1 Code', 'Admin2 Code', 'Geo Locations', 'NaturalDisasterKey']


Unnamed: 0,Country Name,Country Code,Indicator Name,Year,Value
0,Canada,CAN,"Adolescent fertility rate (births per 1,000 wo...",2005,14.0642
1,Canada,CAN,"Adolescent fertility rate (births per 1,000 wo...",2006,13.8706
2,Canada,CAN,"Adolescent fertility rate (births per 1,000 wo...",2007,13.6770
3,Canada,CAN,"Adolescent fertility rate (births per 1,000 wo...",2008,13.2404
4,Canada,CAN,"Adolescent fertility rate (births per 1,000 wo...",2009,12.8038
...,...,...,...,...,...
65803,United States,USA,Young people (ages 15-24) newly infected with HIV,2016,6800.0000
65804,United States,USA,Young people (ages 15-24) newly infected with HIV,2017,6500.0000
65805,United States,USA,Young people (ages 15-24) newly infected with HIV,2018,6400.0000
65806,United States,USA,Young people (ages 15-24) newly infected with HIV,2019,6100.0000


Unnamed: 0,year_code,decade,year
0,0,2010,2005
1,8589934592,2010,2006
2,17179869184,2010,2007
3,25769803776,2010,2008
4,34359738368,2010,2009
5,42949672960,2010,2010
6,51539607552,2010,2011
7,60129542144,2010,2012
8,68719476736,2010,2013
9,77309411328,2010,2014


Unnamed: 0,NaturalDisasterKey,region,disaster_type,diaster_subtype,disaster_nestedsubtype,disaster_subgroup,event_name,ttl_injured,ttl_affected,ttl_homeless,ttl_damaged_usd_thousands,ofda_response
0,0,japan,Earthquake,Ground movement,,Geophysical,,735.0,2800.0,,400000.0,
1,1,japan,Earthquake,Ground movement,,Geophysical,,58.0,837.0,,,
2,2,united states,Flood,Riverine flood,,Hydrological,,,150.0,,250000.0,
3,3,united states,Flood,Riverine flood,,Hydrological,,8.0,500.0,,200000.0,
4,4,united states,Storm,Convective storm,Winter storm/Blizzard,Meteorological,,,,,350000.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
803,8589934777,united states,Storm,Convective storm,Severe storm,Meteorological,,,,,2900000.0,
804,8589934778,united states,Flood,Riverine flood,,Hydrological,,,3000.0,,175000.0,
805,8589934779,united states,Storm,Tropical cyclone,,Meteorological,Tropical storm 'Cristobal',,,,325000.0,
806,8589934780,united states,Storm,Tropical cyclone,,Meteorological,Hurricane 'Isaias',,,,4800000.0,


Unnamed: 0,NaturalDisasterKey,Country,Year
0,0,japan,2005
1,1,japan,2005
2,2,united states,2005
3,3,united states,2005
4,4,united states,2005
...,...,...,...
803,8589934777,united states,2020
804,8589934778,united states,2020
805,8589934779,united states,2020
806,8589934780,united states,2020


In [None]:
# NOTE(review): unexecuted scratch cell - the load path is an empty string, so
# it needs a real CSV path before it can be run; remove it if no longer needed.
nd = spark.read.format("csv").option("inferSchema",True).option("header",True).load("")