## Read Parquet and Normalize

#### Imports

In [None]:
import os 

# Run the notebook from the project root (one level above this notebook's
# folder) so that the `etl` package imported below and the relative
# `datasets/...` paths resolve.
# The move is guarded: an unconditional os.chdir('..') climbs one more level
# every time this cell is re-run, which breaks the import and the data paths.
# NOTE(review): assumes the `etl` package directory lives in the project root
# and that no `etl` folder exists next to this notebook - confirm.
if not os.path.isdir('etl'):
    os.chdir('..')

from pyspark.sql import SparkSession
from etl.read_normalize import ingest_parquet, create_country_ids, create_country_code_ids, join_country_and_olympics, \
    fuzzy_match



In [2]:
# Obtain the Spark session for this pipeline (getOrCreate reuses an active
# session if one already exists), then restrict Spark's logging to errors so
# the cell outputs stay readable.
spark = (
    SparkSession.builder
    .appName("OlympicCountryDataPipeline")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

25/01/15 20:25:12 WARN Utils: Your hostname, Coles-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.235 instead (on interface en0)
25/01/15 20:25:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/15 20:25:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Reading parquet files

In [3]:
# Locations of the two source datasets, relative to the current working directory
countries_input = "datasets/countries.parquet"
olympics_input = "datasets/olympic_combined.parquet"

# Load each parquet file through the project's ingest_parquet helper
df_countries = ingest_parquet(spark=spark, input_path=countries_input)
df_olympics = ingest_parquet(spark=spark, input_path=olympics_input)

                                                                                

In [4]:
# Preview the countries data (20 rows is Spark's default for show())
df_countries.show(n=20)

                                                                                

+------------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+
|      Country_Name|              Region|  Population|Area_sq_mi|Pop_Density_per_sq_mi|Coastline_coast_area_ratio|Net_migration|Infant_mortality_per_1000_births|GDP_per_capita|Literacy_percent|Phones_per_1000|Arable_percent|Crops_percent|Other_percent|Climate|Birthrate|Deathrate|Agriculture|Industry|Service|
+------------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+
|      Afghanistan |ASIA (EX. NEAR EA...| 3.1056997E7|  647500.0|     

#### Country Code Mapping

In [5]:
# Olympic side: build the Country_Code -> Country_Code_ID mapping from df_olympics
df_olympics_id = create_country_code_ids(df_olympics)

# Country side: derive the Country_ID version of df_countries
df_countries_id = create_country_ids(df_countries)


In [6]:
# Inspect the generated country-code IDs (20 rows is Spark's default for show())
df_olympics_id.show(n=20)



+------------+---------------+
|Country_Code|Country_Code_ID|
+------------+---------------+
|         AFG|              1|
|         AIN|              2|
|         ALB|              3|
|         ALG|              4|
|         ARG|              5|
|         ARM|              6|
|         AUS|              7|
|         AUT|              8|
|         AZE|              9|
|         BAH|             10|
|         BAR|             11|
|         BDI|             12|
|         BEL|             13|
|         BER|             14|
|         BLR|             15|
|         BOT|             16|
|         BRA|             17|
|         BRN|             18|
|         BUL|             19|
|         BUR|             20|
+------------+---------------+
only showing top 20 rows



                                                                                

## Joining Countries and Olympics

In [7]:
# Combine the country data with the Olympic medal data.
# NOTE(review): the raw frames are passed here, not df_countries_id /
# df_olympics_id from the mapping cell. The displayed result still contains
# Country_Code_ID and Country_ID, so the helper presumably derives the IDs
# itself - confirm against etl.read_normalize.
df_countries_olympics = join_country_and_olympics(
    df_countries,
    df_olympics,
)

In [8]:
# Preview the joined result (20 rows is Spark's default for show())
df_countries_olympics.show(n=20)

+------------+----+------+------+-----+----+---------------+-----------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+----------+
|Country_Code|Gold|Silver|Bronze|Total|Year|Country_Code_ID|     Country_Name|              Region|  Population|Area_sq_mi|Pop_Density_per_sq_mi|Coastline_coast_area_ratio|Net_migration|Infant_mortality_per_1000_births|GDP_per_capita|Literacy_percent|Phones_per_1000|Arable_percent|Crops_percent|Other_percent|Climate|Birthrate|Deathrate|Agriculture|Industry|Service|Country_ID|
+------------+----+------+------+-----+----+---------------+-----------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+--------

## Extra - Fuzzy Matching! For Fun

The code below shows how you could build your own dictionary that pairs three-letter country codes with their country names. It is not perfect, but it gets closer to the expected result!

In [10]:
# Alternative join: use the project's fuzzy_match helper to pair Olympic
# country codes with country names, which gets closer to the real
# country combinations.
df_countries_olympics_fuzzy = fuzzy_match(spark, df_countries, df_olympics)

# Preview the fuzzy-matched result (20 rows is Spark's default for show())
df_countries_olympics_fuzzy.show(n=20)

                                                                                

+------------+----+------+------+-----+----+------------+--------------------+-----------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+
|Country_Code|Gold|Silver|Bronze|Total|Year|Country_Name|              Region| Population|Area_sq_mi|Pop_Density_per_sq_mi|Coastline_coast_area_ratio|Net_migration|Infant_mortality_per_1000_births|GDP_per_capita|Literacy_percent|Phones_per_1000|Arable_percent|Crops_percent|Other_percent|Climate|Birthrate|Deathrate|Agriculture|Industry|Service|
+------------+----+------+------+-----+----+------------+--------------------+-----------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+--

In [11]:
# Write the fuzzy-joined data as parquet, relative to the current working
# directory.
# The save mode is set to "overwrite" explicitly: Spark's default mode raises
# an error when the target path already exists, so without it this cell fails
# on every run after the first (e.g. Restart Kernel -> Run All).
# Note that any previous output at this path is replaced.
df_countries_olympics_fuzzy.write.mode("overwrite").parquet("country_olympics_join.parquet")

                                                                                