## Read Parquet and Normalize

#### Imports

In [1]:
import os 

# Move the working directory up one level (presumably to the project root, so
# the `etl` import and the relative `datasets/` paths below resolve).
# Guarded with a sentinel so that re-running this cell in the same kernel does
# not climb another directory each time; a fresh kernel still moves up once.
if 'PROJECT_ROOT' not in globals():
    os.chdir('..')
    PROJECT_ROOT = os.getcwd()

from pyspark.sql import SparkSession
from etl.read_normalize import ingest_parquet, create_country_ids, create_country_code_ids, join_country_and_olympics
from fuzzywuzzy import process



In [2]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("OlympicCountryDataPipeline")
    .getOrCreate()
)

# Only surface ERROR-level Spark log messages in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

25/01/15 15:21:47 WARN Utils: Your hostname, Coles-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.235 instead (on interface en0)
25/01/15 15:21:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/15 15:21:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Reading parquet files

In [4]:
# Input parquet locations, relative to the current working directory.
countries_input = "datasets/countries.parquet"
olympics_input = "datasets/olympic_combined.parquet"

# Load both datasets through the project's ingest_parquet helper.
df_countries = ingest_parquet(input_path=countries_input, spark=spark)
df_olympics = ingest_parquet(input_path=olympics_input, spark=spark)

                                                                                

In [8]:
# Preview the countries data (first 20 rows, long cell values truncated).
df_countries.show(n=20, truncate=True)

+------------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+
|      Country_Name|              Region|  Population|Area_sq_mi|Pop_Density_per_sq_mi|Coastline_coast_area_ratio|Net_migration|Infant_mortality_per_1000_births|GDP_per_capita|Literacy_percent|Phones_per_1000|Arable_percent|Crops_percent|Other_percent|Climate|Birthrate|Deathrate|Agriculture|Industry|Service|
+------------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+
|      Afghanistan |ASIA (EX. NEAR EA...| 3.1056997E7|  647500.0|     

#### Country Code Mapping

In [9]:
# Build the country-side frame with IDs from df_countries
# (expected to carry a Country_ID column — confirm in etl.read_normalize).
df_countries_id = create_country_ids(df_countries)

# Build the Olympic-side ID frame from df_olympics. Per the output of the next
# cell, this is a two-column lookup (Country_Code, Country_Code_ID) rather than
# the full Olympics frame with an extra column.
df_olympics_id = create_country_code_ids(df_olympics)


In [10]:
# Preview the country-code lookup (first 20 rows, long cell values truncated).
df_olympics_id.show(n=20, truncate=True)

+------------+---------------+
|Country_Code|Country_Code_ID|
+------------+---------------+
|         AFG|              1|
|         AIN|              2|
|         ALB|              3|
|         ALG|              4|
|         ARG|              5|
|         ARM|              6|
|         AUS|              7|
|         AUT|              8|
|         AZE|              9|
|         BAH|             10|
|         BAR|             11|
|         BDI|             12|
|         BEL|             13|
|         BER|             14|
|         BLR|             15|
|         BOT|             16|
|         BRA|             17|
|         BRN|             18|
|         BUL|             19|
|         BUR|             20|
+------------+---------------+
only showing top 20 rows



## Joining Countries and Olympics

In [11]:
# Combine the country attributes with the Olympic medal data.
# NOTE(review): the raw frames are passed here, not df_countries_id /
# df_olympics_id from the cells above; the joined output still shows
# Country_Code_ID and Country_ID columns, so the helper presumably derives the
# IDs itself — confirm in etl.read_normalize.
df_countries_olympics = join_country_and_olympics(df_countries, df_olympics)

In [16]:
# Preview the joined result (first 20 rows, long cell values truncated).
df_countries_olympics.show(n=20, truncate=True)

+------------+----+------+------+-----+----+---------------+-------------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+----------+
|Country_Code|Gold|Silver|Bronze|Total|Year|Country_Code_ID|       Country_Name|              Region|  Population|Area_sq_mi|Pop_Density_per_sq_mi|Coastline_coast_area_ratio|Net_migration|Infant_mortality_per_1000_births|GDP_per_capita|Literacy_percent|Phones_per_1000|Arable_percent|Crops_percent|Other_percent|Climate|Birthrate|Deathrate|Agriculture|Industry|Service|Country_ID|
+------------+----+------+------+-----+----+---------------+-------------------+--------------------+------------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+--

## Extra - Fuzzy Matching! For Fun

The code below shows how you could build your own mapping from three-letter country codes to country names using fuzzy string matching. It is not perfect (for example, `AUT` is paired with Guinea-Bissau rather than Austria), but it gets closer to the expected result!

In [11]:
## Example of a way to use fuzzy match to get closer to the real country combinations

# Distinct Olympic country codes, sorted alphabetically, as a plain Python list.
country_codes = (df_olympics
                 .select("Country_Code")
                 .distinct()
                 .orderBy("Country_Code")
                 .rdd.map(lambda x: x[0]).collect()
                 )

# Distinct country names, sorted alphabetically, as a plain Python list.
country_names = (df_countries
                 .select("Country_Name")
                 .distinct()
                 .orderBy("Country_Name")
                 .rdd.map(lambda x: x[0]).collect()
                 )

matches = []  # (Country Code, Best Match Country Name) pairs
matched_countries = set()  # A set to keep track of matched country names

# Candidate names not yet claimed by a code. Maintained incrementally instead of
# being rebuilt from country_names on every iteration; the names are distinct,
# so removing the matched entry is equivalent to filtering it out.
remaining_countries = list(country_names)

# Greedy, order-dependent matching: each code takes the best-scoring name still
# available, and that name is then unavailable to every later code.
for code in country_codes:
    # Nothing left to match against, so no later code can be paired either.
    if not remaining_countries:
        break

    # Find the best match from the remaining countries; with a list of choices
    # the result is a (name, score) tuple, or None when there is no match.
    best_match = process.extractOne(code, remaining_countries)

    # If a best match is found, record it and retire the name
    if best_match:
        matched_name = best_match[0]
        matched_countries.add(matched_name)
        remaining_countries.remove(matched_name)
        matches.append((code, matched_name))

matches

                                                                                

[('AFG', 'Afghanistan '),
 ('AIN', 'Bahrain '),
 ('ALB', 'Albania '),
 ('ALG', 'Algeria '),
 ('ARG', 'Argentina '),
 ('ARM', 'Armenia '),
 ('AUS', 'Australia '),
 ('AUT', 'Guinea-Bissau '),
 ('AZE', 'Azerbaijan '),
 ('BAH', 'Bahamas, The '),
 ('BAR', 'Antigua & Barbuda '),
 ('BDI', 'British Virgin Is. '),
 ('BEL', 'Belarus '),
 ('BER', 'Bermuda '),
 ('BLR', 'Barbados '),
 ('BOT', 'Botswana '),
 ('BRA', 'Brazil '),
 ('BRN', 'Brunei '),
 ('BUL', 'Bulgaria '),
 ('BUR', 'Burkina Faso '),
 ('CAN', 'American Samoa '),
 ('CHI', 'Chile '),
 ('CHN', 'China '),
 ('CIV', 'Bolivia '),
 ('CMR', 'Cambodia '),
 ('COL', 'Colombia '),
 ('CPV', 'Cape Verde '),
 ('CRC', 'Croatia '),
 ('CRO', 'Micronesia, Fed. St. '),
 ('CUB', 'Cuba '),
 ('CYP', 'Cyprus '),
 ('CZE', 'Czech Republic '),
 ('DEN', 'Denmark '),
 ('DMA', 'Canada '),
 ('DOM', 'Dominica '),
 ('ECU', 'Ecuador '),
 ('EGY', 'Egypt '),
 ('ERI', 'Eritrea '),
 ('ESP', 'Maldives '),
 ('EST', 'Estonia '),
 ('ETH', 'Ethiopia '),
 ('FIJ', 'Fiji '),
 ('FIN