In [3]:
import pyspark.sql.functions as fn
import pyspark.sql.types
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [113]:
# 1. Filters years from 2005-2020
# 2. Unbucketize the columns
# 3. Filters chosen countries
def filterCountryData(df,countries_chosen):
    
    # range b/w 2005-2020
    years = list(map(lambda x: str(x),list(range(2005,2021,1)))) 
    
    cols =["Country Name","Country Code","Indicator Name","Indicator Code"]+years
    country_2005_20 = df.select(cols)
    
    
    # filters countries chosen and fills any missing year values with 0.00
    ts = "2020-04-01"
    countries_chosen_2005_20 = (country_2005_20
                                .filter(fn.col("Country Name").isin(countries_chosen)).fillna(0.00, subset=years)
                                .withColumn("date",fn.date_format(fn.lit(ts),"yyyy-MM-dd"))
                               )
    
    #unbucketize the data
    unpivotStr= list(map(lambda x: " '{t}',`{t}`".format(t=x),years))
    sep = ','
    unpivotExpr = "stack("+str(len(years))+", "+sep.join(unpivotStr)+") as (Year, Value)"
    columns_without_years= set(countries_chosen_2005_20.columns ) - set(years)
    
    res = countries_chosen_2005_20.select(
        "Country Name",
        "Country Code",
        "Indicator Name",
        fn.expr(unpivotExpr),
        fn.month("date").alias("month"),
        fn.dayofmonth("date").alias("day"),
        fn.quarter("date").alias("quarter")            
    ).groupBy("Country Name","Year").pivot("Indicator Name").sum("Value")
    
    #TODO: join the dimensions to make a fact table

    return res

In [2]:
# Date Dimension
def generate_dates(spark,range_list,interval=60*60*24,dt_col="date_time_ref"): # TODO: attention to sparkSession
     """
     Create a Spark DataFrame with a single column named dt_col and a range of date within a specified interval (start and stop included).
     With hourly data, dates end at 23 of stop day

     :param spark: SparkSession or sqlContext depending on environment (server vs local)
     :param range_list: array of strings formatted as "2018-01-20" or "2018-01-20 00:00:00"
     :param interval: number of seconds (frequency), output from get_freq()
     :param dt_col: string with date column name. Date column must be TimestampType

     :returns: df from range
     """
     start,stop = range_list
     temp_df = spark.createDataFrame([(start, stop)], ("start", "stop"))
     temp_df = temp_df.select([fn.col(c).cast("timestamp") for c in ("start", "stop")])
     temp_df = temp_df.withColumn("stop",fn.date_add("stop",1).cast("timestamp"))
     temp_df = temp_df.select([fn.col(c).cast("long") for c in ("start", "stop")])
     start, stop = temp_df.first()
     return spark.range(start,stop,interval).select(fn.col("id").cast("timestamp").alias(dt_col))


def dateDimension():
    time_rng = ["2005-01-01","2020-12-31"]
    year_df= generate_dates(spark,time_rng)
    tmp = (year_df
           .withColumn("year",fn.year("date_time_ref"))
           .withColumn("month",fn.month("date_time_ref"))
           .withColumn("day",fn.dayofmonth("date_time_ref"))
           .withColumn("quarter",fn.quarter("date_time_ref"))
           .withColumn("decade",
                          fn.when(fn.col("year") % 10 >=5,fn.col("year")-fn.col("year")%10+10)
                              .otherwise(fn.col("year")- fn.col("year") % 10))
           .withColumn("year_code",fn.monotonically_increasing_id())

          )
    date_dim = (tmp
                   .select(tmp.year_code,*set(tmp.columns)-set(["year_code"]))
               )
    
    return date_dim

In [5]:
def naturalDisasterDim(df,filePath,countries_chosen):
    """
        creates natural disaster dimension + look up table
    
        df - date dataframe
        filePath - filePath to natural disaster csv
        countries_chosen - list of strings of countries to work on
    """
    
    columns = ["total deaths","Total Damages ('000 US$)"]
    
    # reads csv
    natural_disaster_df = (spark
                       .read
                       .format('csv')
                           .option("inferSchema",True)
                           .option("header",True)
                           .load(filePath)
                           .fillna(0.00,subset=columns)).dropDuplicates()
    

    # reconfigures column names + banding
    tmp_nd = (natural_disaster_df
                  # replaces United States of America -> united states
              .withColumn("Country",fn.when(fn.lower(fn.col("Country")).contains("united states"),"united states").otherwise(fn.lower(fn.col("Country"))))
              .withColumn("start_month",fn.col("Start Month"))
                  .withColumn("start_year",fn.col("Start Year"))
                  .withColumn("start_day",fn.col("Start Day"))
                  .withColumn("end_month",fn.col("End Month"))
                  .withColumn("end_year",fn.col("End Year"))
                  .withColumn("end_day",fn.col("End Day"))
              .withColumn("disaster_type",fn.col("Disaster Type"))
              .withColumn("disaster_subtype",fn.col("disaster subtype"))
              .withColumn("disaster_nestedsubtype",fn.col("disaster subsubtype"))
              .withColumn("disaster_subgroup",fn.col("disaster subgroup"))
              .withColumn("event_name",fn.col("event name"))
              .withColumn("ofda_response",fn.col("ofda response"))
              .fillna(1.0,["start_day","start_month","start_year","end_day","end_month","end_year"])
              # TODO figure out what to do about start and end year
              .fillna("Not Available",["disaster_type","disaster_subtype","disaster_nestedsubtype","disaster_subgroup","event_name","ofda_response"])
              .withColumn("ttl_death",
                          # range (low,medium, high)
                          fn.when(fn.col("total deaths")>7000,
                                  fn.when(fn.col("total deaths")>14000,"high").otherwise("medium")).otherwise("low")
                         )
              .withColumn("ttl_damages",
                          # 
                          fn.when(fn.col("Total Damages ('000 US$)")>1000000,
                                  fn.when(fn.col("Total Damages ('000 US$)")>100000000,"high").otherwise("medium")).otherwise("low")
                         )
              
                  .drop("year")
             )

    # join on start year
    max_year = df.select(fn.max("year")).limit(1).collect()[0][0]
    min_year = df.select(fn.min("year")).limit(1).collect()[0][0]
    
    nd_j_on_date = tmp_nd.filter(fn.col("start_year")>=min_year).filter(fn.col("end_year")<=max_year)

    # filter countries chosen
    filtered_byCountry_date = (nd_j_on_date
           .filter(fn.col("Country").isin(list(map(lambda x: x.lower(),countries_chosen))))
           
    )
    
    # distinct banded rows with key
    res = (filtered_byCountry_date  
                                  .select([
                                           "disaster_type",
                                           "disaster_subtype",
                                           "disaster_nestedsubtype",
                                           "disaster_subgroup",
                                           "event_name",
                                           "ttl_death",
                                           "ttl_damages",
                                           "ofda_response"])                                   
                                 ).distinct().withColumn("natural_disaster_key",fn.monotonically_increasing_id())
    
    
    lookup=(res.join(
        filtered_byCountry_date,
        on = [
            "disaster_type","disaster_subtype","disaster_nestedsubtype","disaster_subgroup","event_name","ofda_response","ttl_damages","ttl_death"
        ])
        .select("natural_disaster_key","Country","start_year","start_month","start_day","end_year","end_month","end_day")
        )
    
    
    # dimension, lookup
    return res,lookup



In [88]:
def countryDimension(time_df,indicators,countries_chosen,filePath):
    max_year = time_df.select(fn.max("year")).limit(1).collect()[0][0]
    min_year = time_df.select(fn.min("year")).limit(1).collect()[0][0]
    
    countries = (spark
                       .read
                       .format('csv')
                           .option("inferSchema",True)
                           .option("header",True)
                           .load(filePath)
                        )

    indicators = (indicators
                  .withColumn("country_name",fn.lower("Country Name"))
                  .drop("Country Name")
                  .withColumn("age_dependency_ratio_workingage",
                              fn.when(fn.col("Age dependency ratio (% of working-age population)")>100.00, 100.00)
                              .otherwise(fn.col("Age dependency ratio (% of working-age population)")),)
                  .withColumn("labor_force_total",
                              fn.when(fn.col("Labor force, total")>30000000,
                                     fn.when(fn.col("Labor force, total")>80000000,"high").otherwise("medium")
                                     ).otherwise("low"))
                  .select(
                      fn.col("country_name"),
                      fn.col("Population, total").alias("population_total"),
                      fn.col("Population growth (annual %)").alias("population_growth"),
                      fn.col("Urban population growth (annual %)").alias("urban_population_growth"),
                      fn.col("Urban population").alias("urban_population"),
                      fn.col("Rural population").alias("rural_population"),
                      fn.col("Unemployment, total (% of total labor force)").alias("unemployment_rate"),
                      fn.col("age_dependency_ratio_workingage"),
                      fn.col("Poverty headcount ratio at national poverty line (% of population)").alias("poverty_headcount_percentage"),
                      fn.col("labor_force_total"),
                      fn.col("Net migration").alias("net_migration"),
                      fn.col("year")
                  )
#                   .fillna(
#                       indicators.select(fn.avg("Age dependency ratio (% of working-age population)")).collect()[0][0],
#                       subset=["age_dependency_ratio_workingage"]
#                   )
                 )
    tmp = (countries
               .filter(fn.lower(fn.col("short name")).isin(list(map(lambda x: x.lower(),countries_chosen))))
               .select(
                   fn.lower("Currency Unit").alias("currency"),
                   fn.lower("short name").alias("country_name"),
                   fn.col("region"),
               )
          )
    
    res = (tmp.join(indicators,on=["country_name"]).withColumn("country_key",fn.monotonically_increasing_id()))
    
    lookup = (res.select(
        "country_name",
        "year",
        "country_key"
    ))

           
    return res.drop("year"), lookup
               
dateDim = dateDimension()
df,tmp = countryDimension(  
    dateDim,
    filterdCountryDf,
    countries_chosen=countries_chosen,
    filePath="AssignmentData/HNP_StatsCountry.csv")

display(df.toPandas())
display(tmp.toPandas())

Unnamed: 0,country_name,currency,region,population_total,population_growth,urban_population_growth,urban_population,rural_population,unemployment_rate,age_dependency_ratio_workingage,poverty_headcount_percentage,labor_force_total,net_migration,country_key
0,nigeria,nigerian naira,Sub-Saharan Africa,176404931.0,2.665007,4.521129,82878565.0,93526366.0,4.56,88.498487,0.0,medium,0.0,0
1,somalia,somali shilling,Sub-Saharan Africa,13423571.0,2.717396,4.042955,5729046.0,7694525.0,13.32,100.000000,0.0,low,0.0,1
2,finland,euro,Europe & Central Asia,5313399.0,0.465549,0.622925,4426008.0,887391.0,6.37,50.079271,13.8,low,0.0,17179869184
3,mexico,mexican peso,Latin America & Caribbean,121858251.0,1.241165,1.610136,96615314.0,25242937.0,4.31,52.347721,0.0,medium,0.0,34359738368
4,thailand,thai baht,East Asia & Pacific,68971313.0,0.373015,1.941560,33415222.0,35556091.0,0.69,40.157546,8.6,medium,0.0,51539607552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,canada,canadian dollar,North America,35702908.0,0.746339,0.796808,29011826.0,6691082.0,6.91,47.156456,0.0,low,0.0,1666447310848
140,norway,norwegian krone,Europe & Central Asia,5379475.0,0.588757,1.021155,4463566.0,915909.0,4.62,53.335836,0.0,low,0.0,1666447310849
141,mexico,mexican peso,Latin America & Caribbean,107560155.0,1.456213,1.859028,82408288.0,25151867.0,3.57,59.071730,0.0,medium,0.0,1675037245440
142,somalia,somali shilling,Sub-Saharan Africa,10763904.0,2.989724,5.139336,3993408.0,6770496.0,13.19,100.000000,0.0,low,0.0,1675037245441


Unnamed: 0,country_name,year,country_key
0,nigeria,2014,0
1,somalia,2014,1
2,finland,2008,17179869184
3,mexico,2015,34359738368
4,thailand,2016,51539607552
...,...,...,...
139,canada,2015,1666447310848
140,norway,2020,1666447310849
141,mexico,2006,1675037245440
142,somalia,2006,1675037245441


In [6]:
spark = SparkSession.builder.appName("ds_datastage").getOrCreate()

In [114]:
#MAIN block
countries_chosen = ["United States", "Canada","Mexico","Thailand","China","India","Niger","Madagascar","Guinea"]

df=spark.read.format("csv").option("header",True).option("inferSchema",True).load("AssignmentData/HNP_StatsData.csv")

#filtered data
filterdCountryDf=filterCountryData(df,countries_chosen)

dateDim = dateDimension()
naturalDisasterDimension, nd_lookup=naturalDisasterDim(
    dateDim,
    countries_chosen=countries_chosen,
    filePath="AssignmentData/ExternalSources/DISASTERS/1900_2021_DISASTERS.xlsx - emdat data.csv"
)

# countryDimension = 
# display(naturalDisasterDimension.toPandas())
# display(nd_lookup.toPandas())
display(filterdCountryDf.toPandas())
# display(dateDim.toPandas())
filterdCountryDf.count()

  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series
  df[column_name] = series


Unnamed: 0,Country Name,Year,AIDS estimated deaths (UNAIDS estimates),ARI treatment (% of children under 5 taken to a health provider),"Adolescent fertility rate (births per 1,000 women ages 15-19)",Adults (ages 15+) and children (0-14 years) living with HIV,Adults (ages 15+) and children (ages 0-14) newly infected with HIV,Adults (ages 15+) living with HIV,Adults (ages 15-49) newly infected with HIV,"Age at first marriage, female",...,Urban population (% of total population),Urban population growth (annual %),Urban poverty headcount ratio at national poverty lines (% of urban population),Use of insecticide-treated bed nets (% of under-5 population),Vitamin A supplementation coverage rate (% of children ages 6-59 months),Wanted fertility rate (births per woman),Women who were first married by age 15 (% of women ages 20-24),Women who were first married by age 18 (% of women ages 20-24),Women's share of population ages 15+ living with HIV (%),Young people (ages 15-24) newly infected with HIV
0,Madagascar,2015,1100.0,0.0,113.7668,24000.0,4400.0,23000.0,3800.0,0.0,...,35.193,4.584426,0.0,0.0,97.0,0.0,0.0,0.0,46.5,1000.0
1,Mexico,2015,5900.0,73.1,62.6070,280000.0,19000.0,280000.0,17000.0,23.2,...,79.285,1.610136,0.0,0.0,0.0,0.0,3.8,26.1,18.8,4000.0
2,Madagascar,2019,1700.0,0.0,105.8848,38000.0,5800.0,36000.0,5000.0,0.0,...,37.861,4.441921,0.0,0.0,0.0,0.0,0.0,0.0,46.7,1000.0
3,Thailand,2016,20000.0,79.5,46.2390,540000.0,9500.0,530000.0,9300.0,0.0,...,48.448,1.941560,0.0,0.0,0.0,0.0,0.0,0.0,44.9,4900.0
4,Thailand,2010,29000.0,0.0,47.9138,620000.0,15000.0,610000.0,14000.0,24.9,...,43.856,3.529244,0.0,0.0,0.0,0.0,0.0,0.0,45.8,6900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,Mexico,2006,6700.0,0.0,71.4640,200000.0,15000.0,190000.0,14000.0,0.0,...,76.616,1.859028,0.0,0.0,0.0,0.0,0.0,22.7,18.3,3200.0
140,Niger,2012,1900.0,53.1,200.7430,35000.0,1300.0,30000.0,1000.0,17.2,...,16.212,3.874067,0.0,20.1,98.0,7.4,28.0,76.3,51.8,200.0
141,China,2012,0.0,0.0,7.3770,0.0,0.0,0.0,0.0,0.0,...,51.765,3.130657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142,India,2020,0.0,0.0,0.0000,2300000.0,0.0,0.0,0.0,0.0,...,34.926,2.297828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


144

In [48]:

### TESTING BLOCK


# ## LOOKUP TABLE LOGIC
# # 2006-2010
# tmp = dateDim.filter(fn.col("year")==2006).select(fn.col("year").alias("year_2"))
# dateDim_a = dateDim.alias("a")
# tmp_b = tmp.alias("b")

# test2 = dateDim_a.join(tmp_b.alias("b"),tmp_b.year_2<dateDim_a.year)
# test3 = dateDim_a.join(tmp_b.alias("b"),2010>dateDim_a.year)

# test2.show()
# test3.show()

# test3.intersect(test2).show()


AttributeError: 'SparkSession' object has no attribute 'parallelize'

In [66]:
### TEST FUNCTIONS

import matplotlib.pyplot as plt 
import numpy as np

# df - dataframe
# col - column to observe
def nullCount(df,cl):
    non_null =(df
     .filter(fn.col(cl).isNotNull())
    )
    
    null = (df
        .filter(fn.col(cl).isNull()))
    
    print("Number of non null values: "+str(non_null.count()))
    print("Number of null values: "+str(null.count()))
    
    
def summary_df(df,cl,bns = 10):
    """
        returns null counts, basic statistics & plot of current values in a column
        
        df - dataframe you wish to observer these statistics
        cl - column of which you wish to observe
        bns - bins (number of bars) histogram will try to bucketize data in
    """
    nullCount(df,cl)
    
#     df.groupBy(fn.col(cl)).count().orderBy(fn.asc(fn.col(cl))).show()
#     df.groupBy(fn.col(cl)).count().orderBy(fn.desc(fn.col(cl))).show()
    
    tmp = df.filter(fn.col(cl).isNotNull())
    tmp.select(cl).describe().show()

    pd_data = tmp.select(fn.col(cl)).toPandas()
    # display(pd_data)
    plt.hist(pd_data,bins = bns)
    plt.title("Histogram of " +str(cl))
    plt.xlabel(cl)
    plt.ylabel("count")

In [115]:
#"Total Damages ('000 US$)"
summary_df(tmp,"net_migration",bns=3)

Number of non null values: 144
Number of null values: 0


AnalysisException: cannot resolve '`net_migration`' given input columns: [country_key, country_name, year];
'Project ['net_migration]
+- Project [country_name#63837381, year#11767306, country_key#63837399L]
   +- Filter isnotnull(net_migration#63837367)
      +- Project [country_name#63837381, year#11767306, country_key#63837399L, net_migration#63837367]
         +- Project [country_name#63837381, currency#63837380, region#63835462, population_total#63837360, population_growth#63837361, urban_population_growth#63837362, urban_population#63837363, rural_population#63837364, unemployment_rate#63837365, age_dependency_ratio_workingage#63836437, poverty_headcount_percentage#63837366, labor_force_total#63836898, net_migration#63837367, year#11767306, monotonically_increasing_id() AS country_key#63837399L]
            +- Project [country_name#63837381, currency#63837380, region#63835462, population_total#63837360, population_growth#63837361, urban_population_growth#63837362, urban_population#63837363, rural_population#63837364, unemployment_rate#63837365, age_dependency_ratio_workingage#63836437, poverty_headcount_percentage#63837366, labor_force_total#63836898, net_migration#63837367, year#11767306]
               +- Join Inner, (country_name#63837381 = country_name#63835517)
                  :- Project [lower(Currency Unit#63835460) AS currency#63837380, lower(short name#63835456) AS country_name#63837381, region#63835462]
                  :  +- Filter lower(short name#63835456) IN (united states,canada,mexico,thailand,finland,nigeria,somalia,norway,japan)
                  :     +- Relation[Country Code#63835455,Short Name#63835456,Table Name#63835457,Long Name#63835458,2-alpha code#63835459,Currency Unit#63835460,Special Notes#63835461,Region#63835462,Income Group#63835463,WB-2 code#63835464,National accounts base year#63835465,National accounts reference year#63835466,SNA price valuation#63835467,Lending category#63835468,Other groups#63835469,System of National Accounts#63835470,Alternative conversion factor#63835471,PPP survey year#63835472,Balance of Payments Manual in use#63835473,External debt Reporting status#63835474,System of trade#63835475,Government Accounting concept#63835476,IMF data dissemination standard#63835477,Latest population census#63835478,... 7 more fields] csv
                  +- Project [country_name#63835517, Population, total#11768599 AS population_total#63837360, Population growth (annual %)#11768594 AS population_growth#63837361, Urban population growth (annual %)#11768712 AS urban_population_growth#63837362, Urban population#11768710 AS urban_population#63837363, Rural population#11768668 AS rural_population#63837364, Unemployment, total (% of total labor force)#11768708 AS unemployment_rate#63837365, age_dependency_ratio_workingage#63836437, Poverty headcount ratio at national poverty line (% of population)#11768601 AS poverty_headcount_percentage#63837366, labor_force_total#63836898, Net migration#11768451 AS net_migration#63837367, year#11767306]
                     +- Project [Year#11767306, AIDS estimated deaths (UNAIDS estimates)#11768264, ARI treatment (% of children under 5 taken to a health provider)#11768265, Adolescent fertility rate (births per 1,000 women ages 15-19)#11768266, Adults (ages 15+) and children (0-14 years) living with HIV#11768267, Adults (ages 15+) and children (ages 0-14) newly infected with HIV#11768268, Adults (ages 15+) living with HIV#11768269, Adults (ages 15-49) newly infected with HIV#11768270, Age at first marriage, female#11768271, Age at first marriage, male#11768272, Age dependency ratio (% of working-age population)#11768273, Age dependency ratio, old#11768274, Age dependency ratio, young#11768275, Age population, age 00, female, interpolated#11768276, Age population, age 00, male, interpolated#11768277, Age population, age 01, female, interpolated#11768278, Age population, age 01, male, interpolated#11768279, Age population, age 02, female, interpolated#11768280, Age population, age 02, male, interpolated#11768281, Age population, age 03, female, interpolated#11768282, Age population, age 03, male, interpolated#11768283, Age population, age 04, female, interpolated#11768284, Age population, age 04, male, interpolated#11768285, Age population, age 05, female, interpolated#11768286, ... 437 more fields]
                        +- Project [Year#11767306, AIDS estimated deaths (UNAIDS estimates)#11768264, ARI treatment (% of children under 5 taken to a health provider)#11768265, Adolescent fertility rate (births per 1,000 women ages 15-19)#11768266, Adults (ages 15+) and children (0-14 years) living with HIV#11768267, Adults (ages 15+) and children (ages 0-14) newly infected with HIV#11768268, Adults (ages 15+) living with HIV#11768269, Adults (ages 15-49) newly infected with HIV#11768270, Age at first marriage, female#11768271, Age at first marriage, male#11768272, Age dependency ratio (% of working-age population)#11768273, Age dependency ratio, old#11768274, Age dependency ratio, young#11768275, Age population, age 00, female, interpolated#11768276, Age population, age 00, male, interpolated#11768277, Age population, age 01, female, interpolated#11768278, Age population, age 01, male, interpolated#11768279, Age population, age 02, female, interpolated#11768280, Age population, age 02, male, interpolated#11768281, Age population, age 03, female, interpolated#11768282, Age population, age 03, male, interpolated#11768283, Age population, age 04, female, interpolated#11768284, Age population, age 04, male, interpolated#11768285, Age population, age 05, female, interpolated#11768286, ... 436 more fields]
                           +- Project [Year#11767306, AIDS estimated deaths (UNAIDS estimates)#11768264, ARI treatment (% of children under 5 taken to a health provider)#11768265, Adolescent fertility rate (births per 1,000 women ages 15-19)#11768266, Adults (ages 15+) and children (0-14 years) living with HIV#11768267, Adults (ages 15+) and children (ages 0-14) newly infected with HIV#11768268, Adults (ages 15+) living with HIV#11768269, Adults (ages 15-49) newly infected with HIV#11768270, Age at first marriage, female#11768271, Age at first marriage, male#11768272, Age dependency ratio (% of working-age population)#11768273, Age dependency ratio, old#11768274, Age dependency ratio, young#11768275, Age population, age 00, female, interpolated#11768276, Age population, age 00, male, interpolated#11768277, Age population, age 01, female, interpolated#11768278, Age population, age 01, male, interpolated#11768279, Age population, age 02, female, interpolated#11768280, Age population, age 02, male, interpolated#11768281, Age population, age 03, female, interpolated#11768282, Age population, age 03, male, interpolated#11768283, Age population, age 04, female, interpolated#11768284, Age population, age 04, male, interpolated#11768285, Age population, age 05, female, interpolated#11768286, ... 435 more fields]
                              +- Project [Country Name#11767071, Year#11767306, AIDS estimated deaths (UNAIDS estimates)#11768264, ARI treatment (% of children under 5 taken to a health provider)#11768265, Adolescent fertility rate (births per 1,000 women ages 15-19)#11768266, Adults (ages 15+) and children (0-14 years) living with HIV#11768267, Adults (ages 15+) and children (ages 0-14) newly infected with HIV#11768268, Adults (ages 15+) living with HIV#11768269, Adults (ages 15-49) newly infected with HIV#11768270, Age at first marriage, female#11768271, Age at first marriage, male#11768272, Age dependency ratio (% of working-age population)#11768273, Age dependency ratio, old#11768274, Age dependency ratio, young#11768275, Age population, age 00, female, interpolated#11768276, Age population, age 00, male, interpolated#11768277, Age population, age 01, female, interpolated#11768278, Age population, age 01, male, interpolated#11768279, Age population, age 02, female, interpolated#11768280, Age population, age 02, male, interpolated#11768281, Age population, age 03, female, interpolated#11768282, Age population, age 03, male, interpolated#11768283, Age population, age 04, female, interpolated#11768284, Age population, age 04, male, interpolated#11768285, ... 436 more fields]
                                 +- Project [Country Name#11767071, Year#11767306, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[0] AS AIDS estimated deaths (UNAIDS estimates)#11768264, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[1] AS ARI treatment (% of children under 5 taken to a health provider)#11768265, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[2] AS Adolescent fertility rate (births per 1,000 women ages 15-19)#11768266, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[3] AS Adults (ages 15+) and children (0-14 years) living with HIV#11768267, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[4] AS Adults (ages 15+) and children (ages 0-14) newly infected with HIV#11768268, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[5] AS Adults (ages 15+) living with HIV#11768269, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[6] AS Adults (ages 15-49) newly infected with HIV#11768270, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[7] AS Age at first marriage, female#11768271, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[8] AS Age at first marriage, male#11768272, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[9] AS Age dependency ratio (% of working-age population)#11768273, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[10] AS Age dependency ratio, old#11768274, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[11] AS Age dependency ratio, young#11768275, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[12] AS Age population, age 00, female, interpolated#11768276, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[13] AS Age population, age 00, male, interpolated#11768277, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[14] AS Age population, age 01, female, interpolated#11768278, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[15] AS Age population, age 01, male, interpolated#11768279, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[16] AS Age population, age 02, female, interpolated#11768280, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[17] AS Age population, age 02, male, interpolated#11768281, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[18] AS Age population, age 03, female, interpolated#11768282, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[19] AS Age population, age 03, male, interpolated#11768283, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[20] AS Age population, age 04, female, interpolated#11768284, __pivot_sum(`Value`) AS `sum(``Value``)`#11768263[21] AS Age population, age 04, male, interpolated#11768285, ... 435 more fields]
                                    +- Aggregate [Country Name#11767071, Year#11767306], [Country Name#11767071, Year#11767306, pivotfirst(Indicator Name#11767073, sum(`Value`)#11767347, AIDS estimated deaths (UNAIDS estimates), ARI treatment (% of children under 5 taken to a health provider), Adolescent fertility rate (births per 1,000 women ages 15-19), Adults (ages 15+) and children (0-14 years) living with HIV, Adults (ages 15+) and children (ages 0-14) newly infected with HIV, Adults (ages 15+) living with HIV, Adults (ages 15-49) newly infected with HIV, Age at first marriage, female, Age at first marriage, male, Age dependency ratio (% of working-age population), Age dependency ratio, old, Age dependency ratio, young, Age population, age 00, female, interpolated, Age population, age 00, male, interpolated, Age population, age 01, female, interpolated, Age population, age 01, male, interpolated, Age population, age 02, female, interpolated, Age population, age 02, male, interpolated, Age population, age 03, female, interpolated, Age population, age 03, male, interpolated, Age population, age 04, female, interpolated, Age population, age 04, male, interpolated, Age population, age 05, female, interpolated, Age population, age 05, male, interpolated, Age population, age 06, female, interpolated, Age population, age 06, male, interpolated, Age population, age 07, female, interpolated, Age population, age 07, male, interpolated, Age population, age 08, female, interpolated, Age population, age 08, male, interpolated, Age population, age 09, female, interpolated, Age population, age 09, male, interpolated, Age population, age 10, female, interpolated, Age population, age 10, male, interpolated, Age population, age 11, female, interpolated, Age population, age 11, male, interpolated, Age population, age 12, female, interpolated, Age population, age 12, male, interpolated, Age population, age 13, female, interpolated, Age population, age 13, male, interpolated, Age population, age 14, female, interpolated, Age population, age 14, male, interpolated, Age population, age 15, female, interpolated, Age population, age 15, male, interpolated, Age population, age 16, female, interpolated, Age population, age 16, male, interpolated, Age population, age 17, female, interpolated, Age population, age 17, male, interpolated, Age population, age 18, female, interpolated, Age population, age 18, male, interpolated, Age population, age 19, female, interpolated, Age population, age 19, male, interpolated, Age population, age 20, female, interpolated, Age population, age 20, male, interpolated, Age population, age 21, female, interpolated, Age population, age 21, male, interpolated, Age population, age 22, female, interpolated, Age population, age 22, male, interpolated, Age population, age 23, female, interpolated, Age population, age 23, male, interpolated, Age population, age 24, female, interpolated, Age population, age 24, male, interpolated, Age population, age 25, female, interpolated, Age population, age 25, male, interpolated, Antiretroviral therapy coverage (% of people living with HIV), Antiretroviral therapy coverage for PMTCT (% of pregnant women living with HIV), Birth rate, crude (per 1,000 people), Births attended by skilled health staff (% of total), Capital health expenditure (% of GDP), Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total), Cause of death, by injury (% of total), Cause of death, by non-communicable diseases (% of total), Children (0-14) living with HIV, Children (ages 0-14) newly infected with HIV, Children orphaned by HIV/AIDS, Children with fever receiving antimalarial drugs (% of children under age 5 with fever), Community health workers (per 1,000 people), Completeness of birth registration (%), Completeness of birth registration, female (%), Completeness of birth registration, male (%), Completeness of birth registration, rural (%), Completeness of birth registration, urban (%), Completeness of death registration with cause-of-death information (%), Comprehensive correct knowledge of HIV/AIDS, ages 15-24, female (2 prevent ways and reject 3 misconceptions), Comprehensive correct knowledge of HIV/AIDS, ages 15-24, male (2 prevent ways and reject 3 misconceptions), Comprehensive correct knowledge of HIV/AIDS, ages 15-49, female (2 prevent ways and reject 3 misconceptions), Comprehensive correct knowledge of HIV/AIDS, ages 15-49, male (2 prevent ways and reject 3 misconceptions), Condom use at last high-risk sex, adult female (% ages 15-49), Condom use at last high-risk sex, adult male (% ages 15-49), Condom use, population ages 15-24, female (% of females ages 15-24), Condom use, population ages 15-24, male (% of males ages 15-24), Consumption of iodized salt (% of households), Contraceptive prevalence, any method (% of married women ages 15-49), Contraceptive prevalence, any method (% of sexually active unmarried women ages 15-49), Contraceptive prevalence, any modern method (% of married women ages 15-49), Contraceptive prevalence, any modern method (% of sexually active unmarried women ages 15-49), Current health expenditure (% of GDP), Current health expenditure per capita (current US$), Current health expenditure per capita, PPP (current international $), Death rate, crude (per 1,000 people), Demand for family planning satisfied by any methods (% of married women with demand for family planning), Demand for family planning satisfied by modern methods (% of married women with demand for family planning), Diabetes prevalence (% of population ages 20 to 79), Diarrhea treatment (% of children under 5 receiving oral rehydration and continued feeding), Diarrhea treatment (% of children under 5 who received ORS packet), Domestic general government health expenditure (% of GDP), Domestic general government health expenditure (% of current health expenditure), Domestic general government health expenditure (% of general government expenditure), Domestic general government health expenditure per capita (current US$), Domestic general government health expenditure per capita, PPP (current international $), Domestic private health expenditure (% of current health expenditure), Domestic private health expenditure per capita (current US$), Domestic private health expenditure per capita, PPP  (current international $), Exclusive breastfeeding (% of children under 6 months), External health expenditure (% of current health expenditure), External health expenditure channeled through government (% of external health expenditure), External health expenditure per capita (current US$), External health expenditure per capita, PPP (current international $), Female headed households (% of households with a female head), Fertility rate, total (births per woman), GNI per capita, Atlas method (current US$), Hospital beds (per 1,000 people), Human capital index (HCI) (scale 0-1), Human capital index (HCI), female (scale 0-1), Human capital index (HCI), female, lower bound (scale 0-1), Human capital index (HCI), female, upper bound (scale 0-1), Human capital index (HCI), lower bound (scale 0-1), Human capital index (HCI), male (scale 0-1), Human capital index (HCI), male, lower bound (scale 0-1), Human capital index (HCI), male, upper bound (scale 0-1), Human capital index (HCI), upper bound (scale 0-1), Immunization, BCG (% of one-year-old children), Immunization, DPT (% of children ages 12-23 months), Immunization, HepB3 (% of one-year-old children), Immunization, Hib3 (% of children ages 12-23 months), Immunization, Pol3 (% of one-year-old children), Immunization, measles (% of children ages 12-23 months), Immunization, measles second dose (% of children by the nationally recommended age), Incidence of HIV, ages 15-24 (per 1,000 uninfected population ages 15-24), Incidence of HIV, ages 15-49 (per 1,000 uninfected population ages 15-49), Incidence of HIV, ages 50+ (per 1,000 uninfected population ages 50+), Incidence of HIV, all (per 1,000 uninfected population), Incidence of malaria (per 1,000 population at risk), Incidence of tuberculosis (per 100,000 people), Infant and young child feeding practices, all 3 IYCF (% children ages 6-23 months), Intermittent preventive treatment (IPT) of malaria in pregnancy (% of pregnant women), Labor force, female (% of total labor force), Labor force, total, Life expectancy at birth, female (years), Life expectancy at birth, male (years), Life expectancy at birth, total (years), Lifetime risk of maternal death (%), Lifetime risk of maternal death (1 in: rate varies by country), Literacy rate, adult female (% of females ages 15 and above), Literacy rate, adult male (% of males ages 15 and above), Literacy rate, adult total (% of people ages 15 and above), Literacy rate, youth male (% of males ages 15-24), Literacy rate, youth total (% of people ages 15-24), Low-birthweight babies (% of births), Malaria cases reported, Maternal leave benefits (% of wages paid in covered period), Maternal mortality ratio (modeled estimate, per 100,000 live births), Maternal mortality ratio (national estimate, per 100,000 live births), Mortality caused by road traffic injury (per 100,000 people), Mortality caused by road traffic injury, female (per 100,000 female population), Mortality caused by road traffic injury, male (per 100,000 male population), Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70 (%), Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70, female (%), Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70, male (%), Mortality rate attributed to household and ambient air pollution (per 100,000 population), Mortality rate attributed to household and ambient air pollution, age-standardized, female (per 100,000 female population), Mortality rate attributed to household and ambient air pollution, age-standardized, male (per 100,000 male population), Mortality rate attributed to unintentional poisoning (per 100,000 population), Mortality rate attributed to unintentional poisoning, female (per 100,000 female population), Mortality rate attributed to unintentional poisoning, male (per 100,000 male population), Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (per 100,000 population), Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene, female (per 100,000 female population), Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene, male (per 100,000 male population), Mortality rate, adult, female (per 1,000 female adults), Mortality rate, adult, male (per 1,000 male adults), Mortality rate, infant (per 1,000 live births), Mortality rate, infant, female (per 1,000 live births), Mortality rate, infant, male (per 1,000 live births), Mortality rate, neonatal (per 1,000 live births), Mortality rate, under-5 (per 1,000), Mortality rate, under-5, female (per 1,000), Mortality rate, under-5, male (per 1,000), Net migration, Newborns protected against tetanus (%), Number of deaths ages 10-14 years, Number of deaths ages 10-14 years, female, Number of deaths ages 10-14 years, male, Number of deaths ages 10-19 years, Number of deaths ages 10-19 years, female, Number of deaths ages 10-19 years, male, Number of deaths ages 15-19 years, Number of deaths ages 15-19 years, female, Number of deaths ages 15-19 years, male, Number of deaths ages 20-24 years, Number of deaths ages 20-24 years, female, Number of deaths ages 20-24 years, male, Number of deaths ages 5-9 years, Number of deaths ages 5-9 years, female, Number of deaths ages 5-9 years, male, Number of infant deaths, Number of infant deaths, female, Number of infant deaths, male, Number of maternal deaths, Number of neonatal deaths, Number of people pushed below the $1.90 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure, Number of people pushed below the $3.20 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure, Number of people pushed further below the $1.90 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure, Number of people pushed further below the $3.20 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure, Number of people spending more than 10% of household consumption or income on out-of-pocket health care expenditure, Number of people spending more than 25% of household consumption or income on out-of-pocket health care expenditure, Number of people who are undernourished, Number of stillbirths, Number of surgical procedures (per 100,000 population), Number of under-five deaths, Number of under-five deaths, female, Number of under-five deaths, male, Nurses and midwives (per 1,000 people), Out-of-pocket expenditure (% of current health expenditure), Out-of-pocket expenditure per capita (current US$), Out-of-pocket expenditure per capita, PPP (current international $), People practicing open defecation (% of population), People practicing open defecation, rural (% of rural population), People practicing open defecation, urban (% of urban population), People using at least basic drinking water services (% of population), People using at least basic drinking water services, rural (% of rural population), People using at least basic drinking water services, urban (% of urban population), People using at least basic sanitation services (% of population), People using at least basic sanitation services, rural (% of rural population), People using at least basic sanitation services, urban  (% of urban population), People using safely managed drinking water services (% of population), People using safely managed drinking water services, rural (% of rural population), People using safely managed drinking water services, urban (% of urban population), People using safely managed sanitation services (% of population), People using safely managed sanitation services, rural (% of rural population), People using safely managed sanitation services, urban  (% of urban population), People with basic handwashing facilities including soap and water (% of population), People with basic handwashing facilities including soap and water, rural (% of rural population), People with basic handwashing facilities including soap and water, urban (% of urban population), Physicians (per 1,000 people), Population ages 0-14 (% of total population), Population ages 0-14, female, Population ages 0-14, female (% of female population), Population ages 0-14, male, Population ages 0-14, male (% of male population), Population ages 00-04, female, Population ages 00-04, female (% of female population), Population ages 00-04, male, Population ages 00-04, male (% of male population), Population ages 00-14, total, Population ages 05-09, female, Population ages 05-09, female (% of female population), Population ages 05-09, male, Population ages 05-09, male (% of male population), Population ages 10-14, female, Population ages 10-14, female (% of female population), Population ages 10-14, male, Population ages 10-14, male (% of male population), Population ages 15-19, female, Population ages 15-19, female (% of female population), Population ages 15-19, male, Population ages 15-19, male (% of male population), Population ages 15-64 (% of total population), Population ages 15-64, female, Population ages 15-64, female (% of female population), Population ages 15-64, male, Population ages 15-64, male (% of male population), Population ages 15-64, total, Population ages 20-24, female, Population ages 20-24, female (% of female population), Population ages 20-24, male, Population ages 20-24, male (% of male population), Population ages 25-29, female, Population ages 25-29, female (% of female population), Population ages 25-29, male, Population ages 25-29, male (% of male population), Population ages 30-34, female, Population ages 30-34, female (% of female population), Population ages 30-34, male, Population ages 30-34, male (% of male population), Population ages 35-39, female, Population ages 35-39, female (% of female population), Population ages 35-39, male, Population ages 35-39, male (% of male population), Population ages 40-44, female, Population ages 40-44, female (% of female population), Population ages 40-44, male, Population ages 40-44, male (% of male population), Population ages 45-49, female, Population ages 45-49, female (% of female population), Population ages 45-49, male, Population ages 45-49, male (% of male population), Population ages 50-54, female, Population ages 50-54, female (% of female population), Population ages 50-54, male, Population ages 50-54, male (% of male population), Population ages 55-59, female, Population ages 55-59, female (% of female population), Population ages 55-59, male, Population ages 55-59, male (% of male population), Population ages 60-64, female, Population ages 60-64, female (% of female population), Population ages 60-64, male, Population ages 60-64, male (% of male population), Population ages 65 and above (% of total population), Population ages 65 and above, female, Population ages 65 and above, female (% of female population), Population ages 65 and above, male, Population ages 65 and above, male (% of male population), Population ages 65 and above, total, Population ages 65-69, female, Population ages 65-69, female (% of female population), Population ages 65-69, male, Population ages 65-69, male (% of male population), Population ages 70-74, female, Population ages 70-74, female (% of female population), Population ages 70-74, male, Population ages 70-74, male (% of male population), Population ages 75-79, female, Population ages 75-79, female (% of female population), Population ages 75-79, male, Population ages 75-79, male (% of male population), Population ages 80 and above, female, Population ages 80 and above, male, Population ages 80 and above, male (% of male population), Population ages 80 and older, female (% of female population), Population growth (annual %), Population, female, Population, female (% of total population), Population, male, Population, male (% of total population), Population, total, Postnatal care coverage (% mothers), Poverty headcount ratio at national poverty line (% of population), Pregnant women receiving prenatal care (%), Pregnant women receiving prenatal care of at least four visits (% of pregnant women), Prevalence of HIV, female (% ages 15-24), Prevalence of HIV, male (% ages 15-24), Prevalence of HIV, total (% of population ages 15-49), Prevalence of anemia among children (% of children ages 6-59 months), Prevalence of anemia among non-pregnant women (% of women ages 15-49), Prevalence of anemia among pregnant women (%), Prevalence of anemia among women of reproductive age (% of women ages 15-49), Prevalence of current tobacco use (% of adults), Prevalence of current tobacco use, females (% of female adults), Prevalence of current tobacco use, males (% of male adults), Prevalence of hypertension (% of adults ages 30-79), Prevalence of hypertension, female (% of female adults ages 30-79), Prevalence of hypertension, male (% of male adults ages 30-79), Prevalence of overweight (% of adults), Prevalence of overweight (% of children under 5), Prevalence of overweight (modeled estimate, % of children under 5), Prevalence of overweight, female (% of children under 5), Prevalence of overweight, female (% of female adults), Prevalence of overweight, male (% of children under 5), Prevalence of overweight, male (% of male adults), Prevalence of severe wasting, weight for height (% of children under 5), Prevalence of severe wasting, weight for height, female (% of children under 5), Prevalence of severe wasting, weight for height, male (% of children under 5), Prevalence of stunting, height for age (% of children under 5), Prevalence of stunting, height for age (modeled estimate, % of children under 5), Prevalence of stunting, height for age, female (% of children under 5), Prevalence of stunting, height for age, male (% of children under 5), Prevalence of syphilis (% of women attending antenatal care), Prevalence of undernourishment (% of population), Prevalence of underweight, weight for age (% of children under 5), Prevalence of underweight, weight for age, female (% of children under 5), Prevalence of underweight, weight for age, male (% of children under 5), Prevalence of wasting, weight for height (% of children under 5), Prevalence of wasting, weight for height, female (% of children under 5), Prevalence of wasting, weight for height, male (% of children under 5), Primary completion rate, female (% of relevant age group), Primary completion rate, male (% of relevant age group), Primary completion rate, total (% of relevant age group), Probability of dying among adolescents ages 10-14 years (per 1,000), Probability of dying among adolescents ages 10-14 years, female (per 1,000), Probability of dying among adolescents ages 10-14 years, male (per 1,000), Probability of dying among adolescents ages 10-19 years (per 1,000), Probability of dying among adolescents ages 10-19 years, female (per 1,000), Probability of dying among adolescents ages 10-19 years, male (per 1,000), Probability of dying among adolescents ages 15-19 years (per 1,000), Probability of dying among adolescents ages 15-19 years, female (per 1,000), Probability of dying among adolescents ages 15-19 years, male (per 1,000), Probability of dying among children ages 5-9 years (per 1,000), Probability of dying among children ages 5-9 years, female (per 1,000), Probability of dying among children ages 5-9 years, male (per 1,000), Probability of dying among youth ages 20-24 years (per 1,000), Probability of dying among youth ages 20-24 years, female (per 1,000), Probability of dying among youth ages 20-24 years, male (per 1,000), Proportion of population pushed below the $1.90 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure (%), Proportion of population pushed below the $3.20 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure (%), Proportion of population pushed further below the $1.90 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure (%), Proportion of population pushed further below the $3.20 ($ 2011 PPP) poverty line by out-of-pocket health care expenditure (%), Proportion of population spending more than 10% of household consumption or income on out-of-pocket health care expenditure (%), Proportion of population spending more than 25% of household consumption or income on out-of-pocket health care expenditure (%), Public spending on education, total (% of GDP), Ratio of school attendance of orphans to school attendance of non-orphans ages 10-14, Ratio of young literate females to males (% ages 15-24), Risk of catastrophic expenditure for surgical care (% of people at risk), Risk of impoverishing expenditure for surgical care (% of people at risk), Rural population, Rural population (% of total population), Rural population growth (annual %), Rural poverty headcount ratio at national poverty lines (% of rural population), School enrollment, primary (% gross), School enrollment, primary (% net), School enrollment, primary, female (% gross), School enrollment, primary, female (% net), School enrollment, primary, male (% gross), School enrollment, primary, male (% net), School enrollment, secondary (% gross), School enrollment, secondary (% net), School enrollment, secondary, female (% gross), School enrollment, secondary, female (% net), School enrollment, secondary, male (% gross), School enrollment, secondary, male (% net), School enrollment, tertiary (% gross), School enrollment, tertiary, female (% gross), Sex ratio at birth (male births per female births), Share of women employed in the nonagricultural sector (% of total nonagricultural employment), Specialist surgical workforce (per 100,000 population), Stillbirth rate (per 1,000 total births), Suicide mortality rate (per 100,000 population), Suicide mortality rate, female (per 100,000 female population), Suicide mortality rate, male (per 100,000 male population), Survival to age 65, female (% of cohort), Survival to age 65, male (% of cohort), Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant), Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age), Total alcohol consumption per capita, female (liters of pure alcohol, projected estimates, female 15+ years of age), Total alcohol consumption per capita, male (liters of pure alcohol, projected estimates, male 15+ years of age), Treatment for hypertension (% of adults ages 30-79 with hypertension), Treatment for hypertension, female (% of female adults ages 30-79 with hypertension), Treatment for hypertension, male (% of male adults ages 30-79 with hypertension), Tuberculosis case detection rate (%, all forms), Tuberculosis death rate (per 100,000 people), Tuberculosis treatment success rate (% of new cases), UHC service coverage index, Unemployment, female (% of female labor force), Unemployment, male (% of male labor force), Unemployment, total (% of total labor force), Unmet need for contraception (% of married women ages 15-49), Urban population, Urban population (% of total population), Urban population growth (annual %), Urban poverty headcount ratio at national poverty lines (% of urban population), Use of insecticide-treated bed nets (% of under-5 population), Vitamin A supplementation coverage rate (% of children ages 6-59 months), Wanted fertility rate (births per woman), Women who were first married by age 15 (% of women ages 20-24), Women who were first married by age 18 (% of women ages 20-24), Women's share of population ages 15+ living with HIV (%), Young people (ages 15-24) newly infected with HIV, 0, 0) AS __pivot_sum(`Value`) AS `sum(``Value``)`#11768263]
                                       +- Aggregate [Country Name#11767071, Year#11767306, Indicator Name#11767073], [Country Name#11767071, Year#11767306, Indicator Name#11767073, sum(Value#11767307) AS sum(`Value`)#11767347]
                                          +- Project [Country Name#11767071, Country Code#11767072, Indicator Name#11767073, Year#11767306, Value#11767307, month(cast(date#11767281 as date)) AS month#11767303, dayofmonth(cast(date#11767281 as date)) AS day#11767304, quarter(cast(date#11767281 as date)) AS quarter#11767305]
                                             +- Generate stack(16, 2005, 2005#11767245, 2006, 2006#11767246, 2007, 2007#11767247, 2008, 2008#11767248, 2009, 2009#11767249, 2010, 2010#11767250, 2011, 2011#11767251, 2012, 2012#11767252, 2013, 2013#11767253, 2014, 2014#11767254, 2015, 2015#11767255, 2016, ... 9 more fields), false, [Year#11767306, Value#11767307]
                                                +- Project [Country Name#11767071, Country Code#11767072, Indicator Name#11767073, Indicator Code#11767074, 2005#11767245, 2006#11767246, 2007#11767247, 2008#11767248, 2009#11767249, 2010#11767250, 2011#11767251, 2012#11767252, 2013#11767253, 2014#11767254, 2015#11767255, 2016#11767256, 2017#11767257, 2018#11767258, 2019#11767259, 2020#11767260, date_format(cast(2020-04-01 as timestamp), yyyy-MM-dd, Some(America/Toronto)) AS date#11767281]
                                                   +- Project [Country Name#11767071, Country Code#11767072, Indicator Name#11767073, Indicator Code#11767074, coalesce(nanvl(2005#11767120, cast(null as double)), cast(0.0 as double)) AS 2005#11767245, coalesce(nanvl(2006#11767121, cast(null as double)), cast(0.0 as double)) AS 2006#11767246, coalesce(nanvl(2007#11767122, cast(null as double)), cast(0.0 as double)) AS 2007#11767247, coalesce(nanvl(2008#11767123, cast(null as double)), cast(0.0 as double)) AS 2008#11767248, coalesce(nanvl(2009#11767124, cast(null as double)), cast(0.0 as double)) AS 2009#11767249, coalesce(nanvl(2010#11767125, cast(null as double)), cast(0.0 as double)) AS 2010#11767250, coalesce(nanvl(2011#11767126, cast(null as double)), cast(0.0 as double)) AS 2011#11767251, coalesce(nanvl(2012#11767127, cast(null as double)), cast(0.0 as double)) AS 2012#11767252, coalesce(nanvl(2013#11767128, cast(null as double)), cast(0.0 as double)) AS 2013#11767253, coalesce(nanvl(2014#11767129, cast(null as double)), cast(0.0 as double)) AS 2014#11767254, coalesce(nanvl(2015#11767130, cast(null as double)), cast(0.0 as double)) AS 2015#11767255, coalesce(nanvl(2016#11767131, cast(null as double)), cast(0.0 as double)) AS 2016#11767256, coalesce(nanvl(2017#11767132, cast(null as double)), cast(0.0 as double)) AS 2017#11767257, coalesce(nanvl(2018#11767133, cast(null as double)), cast(0.0 as double)) AS 2018#11767258, coalesce(nanvl(2019#11767134, cast(null as double)), cast(0.0 as double)) AS 2019#11767259, coalesce(nanvl(2020#11767135, cast(null as double)), cast(0.0 as double)) AS 2020#11767260]
                                                      +- Filter Country Name#11767071 IN (United States,Canada,Mexico,Thailand,Finland,Nigeria,Somalia,Norway,Japan)
                                                         +- Project [Country Name#11767071, Country Code#11767072, Indicator Name#11767073, Indicator Code#11767074, 2005#11767120, 2006#11767121, 2007#11767122, 2008#11767123, 2009#11767124, 2010#11767125, 2011#11767126, 2012#11767127, 2013#11767128, 2014#11767129, 2015#11767130, 2016#11767131, 2017#11767132, 2018#11767133, 2019#11767134, 2020#11767135]
                                                            +- Relation[Country Name#11767071,Country Code#11767072,Indicator Name#11767073,Indicator Code#11767074,1960#11767075,1961#11767076,1962#11767077,1963#11767078,1964#11767079,1965#11767080,1966#11767081,1967#11767082,1968#11767083,1969#11767084,1970#11767085,1971#11767086,1972#11767087,1973#11767088,1974#11767089,1975#11767090,1976#11767091,1977#11767092,1978#11767093,1979#11767094,... 43 more fields] csv


In [104]:
filterdCountryDf.columns

['Country Name',
 'Year',
 'AIDS estimated deaths (UNAIDS estimates)',
 'ARI treatment (% of children under 5 taken to a health provider)',
 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Adults (ages 15+) and children (0-14 years) living with HIV',
 'Adults (ages 15+) and children (ages 0-14) newly infected with HIV',
 'Adults (ages 15+) living with HIV',
 'Adults (ages 15-49) newly infected with HIV',
 'Age at first marriage, female',
 'Age at first marriage, male',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, old',
 'Age dependency ratio, young',
 'Age population, age 00, female, interpolated',
 'Age population, age 00, male, interpolated',
 'Age population, age 01, female, interpolated',
 'Age population, age 01, male, interpolated',
 'Age population, age 02, female, interpolated',
 'Age population, age 02, male, interpolated',
 'Age population, age 03, female, interpolated',
 'Age population, age 03, male, interpolated',
 'Age 

In [109]:
cl = 'Primary completion rate, total (% of relevant age group)'
first = filterdCountryDf.filter(fn.col(cl).isNotNull()).groupBy("Country Name").agg(fn.count("*").alias("nonnull"))
second = filterdCountryDf.filter(fn.col(cl).isNull()).groupBy("Country Name").agg(fn.count("*").alias("null"))

In [112]:
second.join(first,["Country Name"],'right').withColumn("ratio",fn.col("nonnull")/fn.col("null")).orderBy(fn.desc("ratio")).show(300,False)

+-------------+----+-------+-----+
|Country Name |null|nonnull|ratio|
+-------------+----+-------+-----+
|Canada       |null|16     |null |
|Thailand     |null|16     |null |
|Mexico       |null|16     |null |
|Madagascar   |null|16     |null |
|Guinea       |null|16     |null |
|India        |null|16     |null |
|China        |null|16     |null |
|United States|null|16     |null |
|Niger        |null|16     |null |
+-------------+----+-------+-----+

