## Analytics Queries

In [1]:
import os 

# Move the execution of the folder up one directory
os.chdir('..')

from pyspark.sql import SparkSession
from etl.read_normalize import ingest_parquet
from pyspark.sql.functions import sum, col



In [2]:
spark = SparkSession.builder.appName("OlympicCountryDataPipeline").getOrCreate()

25/01/16 07:15:38 WARN Utils: Your hostname, Coles-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.235 instead (on interface en0)
25/01/16 07:15:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/16 07:15:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/16 07:15:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
df_denormalized = ingest_parquet(
    input_path = "datasets/countries_olympics_join.parquet"
    , spark = spark
)

                                                                                

In [4]:
df_denormalized.show(10)

25/01/16 07:15:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------+----+------+------+-----+----+------------+--------------------+-----------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+---------+-----------+--------+-------+
|Country_Code|Gold|Silver|Bronze|Total|Year|Country_Name|              Region| Population|Area_sq_mi|Pop_Density_per_sq_mi|Coastline_coast_area_ratio|Net_migration|Infant_mortality_per_1000_births|GDP_per_capita|Literacy_percent|Phones_per_1000|Arable_percent|Crops_percent|Other_percent|Climate|Birthrate|Deathrate|Agriculture|Industry|Service|
+------------+----+------+------+-----+----+------------+--------------------+-----------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+----------------+---------------+--------------+-------------+-------------+-------+---------+--

#### Question 1
Who has won the most silver medals across all years of data?

A: Looks to be USA. In my fuzzy text matching, this ended up getting joined to Bhutan.

In [8]:
q1_work = (df_denormalized
           .groupBy("Country_Name", "Country_Code")
           .agg(sum("Silver").alias("silver_total"))
           .orderBy("silver_total", ascending = False)
           )

q1_work.head(5)

                                                                                

[Row(Country_Name='Bhutan ', Country_Code='USA', silver_total=315),
 Row(Country_Name='China ', Country_Code='CHN', silver_total=192),
 Row(Country_Name='Germany ', Country_Code='GER', silver_total=188),
 Row(Country_Name='Aruba ', Country_Code='RUS', silver_total=151),
 Row(Country_Name='France ', Country_Code='FRA', silver_total=124)]

#### Question 2
Which year did that country win the most total medals?

A: 2024

In [12]:
q2_work = (df_denormalized
           .filter(col("Country_Code") == "USA")
           .orderBy("Total", ascending = False)
           ).select("Country_Code", "Total", "Year")

q2_work.head(5)

[Row(Country_Code='USA', Total=126, Year='2024'),
 Row(Country_Code='USA', Total=121, Year='2016'),
 Row(Country_Code='USA', Total=113, Year='2020'),
 Row(Country_Code='USA', Total=104, Year='2012'),
 Row(Country_Code='USA', Total=101, Year='2004')]

#### Question 3 
Is there a correlation between Population Density and winning medals?

A: From the extremely low correlation value, we can confidently say no. Note that the lack of integrity of the join undoubtedly impacted this result.

In [14]:
q3_work = (df_denormalized
           .groupBy("Country_Name", "Country_Code", "Pop_Density_per_sq_mi")
           .agg(sum("Total").alias("total_agg"))
           )

correlation = q3_work.stat.corr("total_agg", "Pop_Density_per_sq_mi")

print(correlation)

-0.05221361513497098


#### Question 4
Is there a correlation between GDP and winning gold medals?

A: Yes, it is weak but significant. Note that the lack of integrity of the join undoubtedly impacted this result. 

In [15]:
q4_work = (df_denormalized
           .groupBy("Country_Name", "Country_Code", "GDP_per_capita")
           .agg(sum("Gold").alias("gold_total"))
           )

correlation = q4_work.stat.corr("gold_total", "GDP_per_capita")

print(correlation)

0.20548406822263218


Despite this being a weak correlation, we can still measure if it is significant! Running a t-test to see how significant this value is.

Given the value is below 0.05, we can say this is a significant correlation

In [16]:
import math

# Function to compute the cumulative distribution function (CDF) of the t-distribution
# Using an approximation based on the error function (erf), which is commonly used in statistics
def t_cdf(t, df):
    # Using the approximation of the CDF for the t-distribution
    return 0.5 * (1 + math.erf(t / math.sqrt(2)))

# Given values
r = correlation # Correlation coefficient
n = q4_work.count()   # Sample size

# Calculate the t-statistic
t_statistic = r * math.sqrt(n - 2) / math.sqrt(1 - r**2)

# Degrees of freedom (n - 2)
df = n - 2

# Calculate the two-tailed p-value for the t-distribution
# Multiply by 2 because it's a two-tailed test (for both positive and negative correlations)
p_value = 2 * (1 - t_cdf(abs(t_statistic), df))

# Output the results
print(f"P-value: {p_value}")


P-value: 0.014704819101295286
