# E2E Pyspark (Ting Wang)

## Prerequisites

Install Java and Spark in VM

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.3
!wget -q https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz

In [2]:
# Unpack the Spark tarball into the current working directory
!tar xf spark-3.5.3-bin-hadoop3.tgz

In [3]:
!pip install -q findspark

In [4]:
import os

# Tell the Spark tooling where Java and the unpacked Spark distribution live,
# and have PySpark launch against a local master.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.3-bin-hadoop3",
        "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
    }
)

Start Spark Session

---

In [5]:
import os

import findspark

# Initialise findspark from the absolute SPARK_HOME configured earlier. The
# previous relative directory name only resolved when the kernel's working
# directory was /content, and findspark.init() overwrites SPARK_HOME with
# whatever path it is given.
findspark.init(os.environ["SPARK_HOME"])

from pyspark.sql import SparkSession

# Create (or reuse) the local Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("Joins")
    .master("local[*]")
    .config("spark.ui.port", "4500")
    .getOrCreate()
)

# Show the Spark version as the cell output
spark.version

'3.5.3'

In [6]:
# Display the session object as the cell output
spark

In [7]:
# Import the Spark SQL functions used below (col, count, lit, mean, ...).
# NOTE(review): the wildcard import also rebinds Python built-ins such as
# sum, min, max and round to their Spark column counterparts; consider
# `from pyspark.sql import functions as F` once every usage is known.
from pyspark.sql.functions import *

In [8]:
# Create the directory the CSV files are loaded from (no error if it already exists)
!mkdir -p /content/dataset


In [9]:
ls -l dataset

total 0


## Load the datasets

In [11]:
# Load the country metadata: the first row holds the column names and the
# column types are inferred from the data.
countryDF = spark.read.csv(
    "/content/dataset/Country.csv",
    header=True,
    inferSchema=True,
)
countryDF.printSchema()

root
 |-- CountryCode: string (nullable = true)
 |-- ShortName: string (nullable = true)
 |-- TableName: string (nullable = true)
 |-- LongName: string (nullable = true)
 |-- Alpha2Code: string (nullable = true)
 |-- CurrencyUnit: string (nullable = true)
 |-- SpecialNotes: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- IncomeGroup: string (nullable = true)
 |-- Wb2Code: string (nullable = true)
 |-- NationalAccountsBaseYear: string (nullable = true)
 |-- NationalAccountsReferenceYear: string (nullable = true)
 |-- SnaPriceValuation: string (nullable = true)
 |-- LendingCategory: string (nullable = true)
 |-- OtherGroups: string (nullable = true)
 |-- SystemOfNationalAccounts: string (nullable = true)
 |-- AlternativeConversionFactor: string (nullable = true)
 |-- PppSurveyYear: string (nullable = true)
 |-- BalanceOfPaymentsManualInUse: string (nullable = true)
 |-- ExternalDebtReportingStatus: string (nullable = true)
 |-- SystemOfTrade: string (nullable = true)


In [37]:
# Preview the first three rows of the country metadata
countryDF.show(3)

+-----------+-----------+-----------+--------------------+----------+--------------+--------------------+--------------------+-------------------+-------+------------------------+-----------------------------+--------------------+---------------+-----------+------------------------+---------------------------+-------------+----------------------------+---------------------------+--------------------+---------------------------+----------------------------+----------------------+---------------------+------------------------------------------+-------------------------+------------------------+--------------------+---------------+-------------------------+
|CountryCode|  ShortName|  TableName|            LongName|Alpha2Code|  CurrencyUnit|        SpecialNotes|              Region|        IncomeGroup|Wb2Code|NationalAccountsBaseYear|NationalAccountsReferenceYear|   SnaPriceValuation|LendingCategory|OtherGroups|SystemOfNationalAccounts|AlternativeConversionFactor|PppSurveyYear|BalanceOfPay

In [147]:
# Load the HDI table (one row per country, one column per year)
indicatorDF = spark.read.csv(
    "/content/dataset/HDI.csv",
    header=True,
    inferSchema=True,
)
indicatorDF.printSchema()

root
 |-- HDI Rank: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- 1990: string (nullable = true)
 |-- 1991: string (nullable = true)
 |-- 1992: string (nullable = true)
 |-- 1993: string (nullable = true)
 |-- 1994: string (nullable = true)
 |-- 1995: string (nullable = true)
 |-- 1996: string (nullable = true)
 |-- 1997: string (nullable = true)
 |-- 1998: string (nullable = true)
 |-- 1999: string (nullable = true)
 |-- 2000: string (nullable = true)
 |-- 2001: string (nullable = true)
 |-- 2002: string (nullable = true)
 |-- 2003: string (nullable = true)
 |-- 2004: string (nullable = true)
 |-- 2005: string (nullable = true)
 |-- 2006: string (nullable = true)
 |-- 2007: string (nullable = true)
 |-- 2008: string (nullable = true)
 |-- 2009: string (nullable = true)
 |-- 2010: string (nullable = true)
 |-- 2011: string (nullable = true)
 |-- 2012: string (nullable = true)
 |-- 2013: string (nullable = true)
 |-- 2014: string (nullable = true)
 |-- 2015: stri

In [148]:
# Preview the first three rows of the HDI table
indicatorDF.show(3)

+--------+-----------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|HDI Rank|    Country| 1990| 1991| 1992| 1993| 1994| 1995| 1996| 1997| 1998| 1999| 2000| 2001| 2002| 2003| 2004| 2005| 2006| 2007| 2008| 2009| 2010| 2011| 2012| 2013| 2014| 2015| 2016| 2017| 2018| 2019|
+--------+-----------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|     169|Afghanistan|0.302|0.307|0.316|0.312|0.307|0.331|0.335|0.339|0.344|0.348| 0.35|0.353|0.384|0.393|0.409|0.418|0.429|0.447|0.447| 0.46|0.472|0.477|0.489|0.496|  0.5|  0.5|0.502|0.506|0.509|0.511|
|      69|    Albania| 0.65|0.631|0.615|0.618|0.624|0.637|0.646|0.645|0.655|0.665|0.671|0.678|0.684|0.691|0.696|0.706|0.713|0.722|0.728|0.733|0.745|0.764|0.775|0.782|0.787|0.788|0.788| 0.7

In [150]:
# Keep only the rank, the country name and the 2019 value, sorted by HDI rank
indicator2019DF = (
    indicatorDF
    .select("HDI Rank", "Country", "2019")
    .orderBy("HDI Rank")
)
indicator2019DF.show(3)

+--------+-----------+-----+
|HDI Rank|    Country| 2019|
+--------+-----------+-----+
|       1|     Norway|0.957|
|       2|    Ireland|0.955|
|       2|Switzerland|0.955|
+--------+-----------+-----+
only showing top 3 rows



In [119]:
# Load the 2019 happiness report with inferred column types
happinessDF = spark.read.csv(
    "/content/dataset/happiness_2019.csv",
    header=True,
    inferSchema=True,
)
happinessDF.printSchema()

root
 |-- Overall rank: integer (nullable = true)
 |-- Country or region: string (nullable = true)
 |-- Score: double (nullable = true)
 |-- GDP per capita: double (nullable = true)
 |-- Social support: double (nullable = true)
 |-- Healthy life expectancy: double (nullable = true)
 |-- Freedom to make life choices: double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Perceptions of corruption: double (nullable = true)



In [14]:
# Preview the first three rows of the happiness report
happinessDF.show(3)

+------------+-----------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|Overall rank|Country or region|Score|GDP per capita|Social support|Healthy life expectancy|Freedom to make life choices|Generosity|Perceptions of corruption|
+------------+-----------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|           1|          Finland|7.769|          1.34|         1.587|                  0.986|                       0.596|     0.153|                    0.393|
|           2|          Denmark|  7.6|         1.383|         1.573|                  0.996|                       0.592|     0.252|                     0.41|
|           3|           Norway|7.554|         1.488|         1.582|                  1.028|                       0.603|     0.271|                    0.341|
+------------+-----------------+-----+--------

In [15]:
# Load the terrorism events table with inferred column types
terrorismDF = spark.read.csv(
    "/content/dataset/terrorism.csv",
    header=True,
    inferSchema=True,
)
terrorismDF.printSchema()

root
 |-- eventid: long (nullable = true)
 |-- iyear: integer (nullable = true)
 |-- imonth: integer (nullable = true)
 |-- iday: integer (nullable = true)
 |-- approxdate: string (nullable = true)
 |-- extended: integer (nullable = true)
 |-- resolution: string (nullable = true)
 |-- country: integer (nullable = true)
 |-- country_txt: string (nullable = true)
 |-- region: integer (nullable = true)
 |-- region_txt: string (nullable = true)
 |-- provstate: string (nullable = true)
 |-- city: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- specificity: integer (nullable = true)
 |-- vicinity: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- crit1: string (nullable = true)
 |-- crit2: string (nullable = true)
 |-- crit3: string (nullable = true)
 |-- doubtterr: string (nullable = true)
 |-- alternative: string (nullable = true)
 |-- alternative_txt: string (nullable

In [16]:
# Narrow the wide events table down to the columns used in the analysis
terrorismColumnsDF = terrorismDF.select(
    "eventid", "iyear", "imonth", "iday", "extended",
    "country", "country_txt",
    "region", "region_txt",
    "city",
    "multiple", "success", "suicide",
    "attacktype1", "attacktype1_txt",
    "targtype1", "targtype1_txt",
)
terrorismColumnsDF.show(3)

+------------+-----+------+----+--------+-------+------------------+------+--------------------+-------------+--------+-------+-------+-----------+--------------------+---------+--------------------+
|     eventid|iyear|imonth|iday|extended|country|       country_txt|region|          region_txt|         city|multiple|success|suicide|attacktype1|     attacktype1_txt|targtype1|       targtype1_txt|
+------------+-----+------+----+--------+-------+------------------+------+--------------------+-------------+--------+-------+-------+-----------+--------------------+---------+--------------------+
|197000000001| 1970|     7|   2|       0|     58|Dominican Republic|     2|Central America &...|Santo Domingo|       0|      1|      0|          1|       Assassination|       14|Private Citizens ...|
|197000000002| 1970|     0|   0|       0|    130|            Mexico|     1|       North America|  Mexico city|       0|      1|      0|          6|Hostage Taking (K...|        7|Government (Diplo...|


## Transformations

Countries with most cases of terrorism

In [57]:
# Count the events per country and rank countries from most to fewest events
WorstCountryTerrorismDF = (
    terrorismColumnsDF
    .groupBy("country_txt")
    .agg(count("*").alias("totalEvents"))
    .orderBy("totalEvents", ascending=False)
)
WorstCountryTerrorismDF.show(5)

+--------------+-----------+
|   country_txt|totalEvents|
+--------------+-----------+
|      Colombia|       7022|
|          Peru|       6043|
|   El Salvador|       5320|
|         India|       4741|
|United Kingdom|       4468|
+--------------+-----------+
only showing top 5 rows



Safest countries (fewest cases of terrorism, among countries with at least one recorded event)

In [59]:
# Same per-country event count, ranked from fewest to most events.
# Only countries that appear in the events table are present here.
SafestCountryDF = (
    terrorismColumnsDF
    .groupBy("country_txt")
    .agg(count("*").alias("totalEvents"))
    .orderBy("totalEvents")
)
SafestCountryDF.show(10)

+-------------------+-----------+
|        country_txt|totalEvents|
+-------------------+-----------+
|            Andorra|          1|
|      South Vietnam|          1|
|  Equatorial Guinea|          1|
|       New Hebrides|          1|
|  Wallis and Futuna|          1|
|        North Korea|          1|
|Antigua and Barbuda|          1|
|   Falkland Islands|          1|
|       Vatican City|          1|
|      International|          1|
+-------------------+-----------+
only showing top 10 rows



Terrorism per region

In [27]:
# Count the events per region, most affected regions first
RegionTerrorismDF = (
    terrorismColumnsDF
    .groupBy("region_txt")
    .agg(count("*").alias("totalEvents"))
    .orderBy("totalEvents", ascending=False)
)
RegionTerrorismDF.show(truncate=False)

+---------------------------+-----------+
|region_txt                 |totalEvents|
+---------------------------+-----------+
|South America              |17400      |
|Middle East & North Africa |15810      |
|Western Europe             |14615      |
|South Asia                 |13218      |
|Central America & Caribbean|10305      |
|Sub-Saharan Africa         |5831       |
|Southeast Asia             |4928       |
|North America              |3055       |
|Eastern Europe             |2217       |
|East Asia                  |681        |
|Central Asia               |446        |
|Australasia & Oceania      |232        |
+---------------------------+-----------+



Terrorism in European countries

In [30]:
# Keep only the two European regions from the per-region totals
RegionTerrorismDF.filter(
    col("region_txt").isin("Western Europe", "Eastern Europe")
).show()

+--------------+-----------+
|    region_txt|totalEvents|
+--------------+-----------+
|Western Europe|      14615|
|Eastern Europe|       2217|
+--------------+-----------+



In [35]:
# Per-country event counts restricted to the two European regions,
# most affected countries first
EuropeTerrorismDF = (
    terrorismColumnsDF
    .filter(col("region_txt").isin("Western Europe", "Eastern Europe"))
    .groupBy("region_txt", "country_txt")
    .agg(count("*").alias("totalEvents"))
    .orderBy("totalEvents", ascending=False)
)
EuropeTerrorismDF.show(10)

+--------------+------------------+-----------+
|    region_txt|       country_txt|totalEvents|
+--------------+------------------+-----------+
|Western Europe|    United Kingdom|       4468|
|Western Europe|             Spain|       3195|
|Western Europe|            France|       2468|
|Western Europe|             Italy|       1498|
|Eastern Europe|            Russia|       1085|
|Western Europe|            Greece|        874|
|Western Europe|           Germany|        567|
|Western Europe|West Germany (FRG)|        541|
|Eastern Europe|        Yugoslavia|        203|
|Eastern Europe|            Kosovo|        158|
+--------------+------------------+-----------+
only showing top 10 rows



Yearly evolution of terrorism cases

In [62]:
# Count the events per year, most recent year first
yearEvolutionDF = (
    terrorismColumnsDF
    .groupBy("iyear")
    .agg(count("*").alias("totalEvents"))
    .orderBy("iyear", ascending=False)
)
yearEvolutionDF.show()

+-----+-----------+
|iyear|totalEvents|
+-----+-----------+
| 2008|       3388|
| 2007|       3242|
| 2006|       2758|
| 2005|       2017|
| 2004|       1166|
| 2003|       1278|
| 2002|       1333|
| 2001|       1906|
| 2000|       1814|
| 1999|       1395|
| 1998|        934|
| 1997|       3197|
| 1996|       3058|
| 1995|       3081|
| 1994|       3456|
| 1992|       5071|
| 1991|       4683|
| 1990|       3887|
| 1989|       4324|
| 1988|       3721|
+-----+-----------+
only showing top 20 rows



Year with most cases

In [71]:
# Re-rank the yearly totals by event count and show only the top year
yearEvolutionDF.orderBy("totalEvents", ascending=False).show(1)

+-----+-----------+
|iyear|totalEvents|
+-----+-----------+
| 1992|       5071|
+-----+-----------+
only showing top 1 row



For the happiness index, the relationship between GDP per capita and each of the other indicators, taken individually, is not strictly linear:

In [45]:
# Happiness report ordered by GDP per capita, highest first
happinessDF.orderBy("GDP per capita", ascending=False).show(15)

+------------+--------------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|Overall rank|   Country or region|Score|GDP per capita|Social support|Healthy life expectancy|Freedom to make life choices|Generosity|Perceptions of corruption|
+------------+--------------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|          29|               Qatar|6.374|         1.684|         1.313|                  0.871|                       0.555|      0.22|                    0.167|
|          14|          Luxembourg| 7.09|         1.609|         1.479|                  1.012|                       0.526|     0.194|                    0.316|
|          34|           Singapore|6.262|         1.572|         1.463|                  1.141|                       0.556|     0.271|                    0.453|
|          21|United Arab Em

In [48]:
# Countries whose healthy-life-expectancy value is at least 1, highest first
(
    happinessDF
    .where(col("Healthy life expectancy") >= 1)
    .orderBy("Healthy life expectancy", ascending=False)
    .show(15)
)

+------------+-----------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|Overall rank|Country or region|Score|GDP per capita|Social support|Healthy life expectancy|Freedom to make life choices|Generosity|Perceptions of corruption|
+------------+-----------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|          34|        Singapore|6.262|         1.572|         1.463|                  1.141|                       0.556|     0.271|                    0.453|
|          76|        Hong Kong| 5.43|         1.438|         1.277|                  1.122|                        0.44|     0.258|                    0.287|
|          58|            Japan|5.886|         1.327|         1.419|                  1.088|                       0.445|     0.069|                     0.14|
|          30|            Spain|6.354|        

Relation between happiness index and terrorism: there is no clear correlation. Nevertheless, happier countries tend to have fewer terrorism cases.

In [95]:
# Attach the per-country event count to each country in the happiness report
# (inner join on the country name, so countries missing from either side are
# dropped) and list the result by happiness rank.
# The earlier descending sort was removed: it was immediately overridden by
# the ascending orderBy that followed, so it only added a wasted sort.
happinessTerrorismDF = (
    happinessDF
    .join(SafestCountryDF, col("Country or region") == col("country_txt"), "inner")
    .orderBy("Overall rank")
    .select("Overall rank", "Country or region", "Score", "totalEvents")
)
happinessTerrorismDF.show(10)

+------------+-----------------+-----+-----------+
|Overall rank|Country or region|Score|totalEvents|
+------------+-----------------+-----+-----------+
|           1|          Finland|7.769|          5|
|           2|          Denmark|  7.6|         33|
|           3|           Norway|7.554|         14|
|           4|          Iceland|7.494|          2|
|           5|      Netherlands|7.488|        114|
|           6|      Switzerland| 7.48|        105|
|           7|           Sweden|7.343|         54|
|           8|      New Zealand|7.307|         18|
|           9|           Canada|7.278|         54|
|          10|          Austria|7.246|        100|
+------------+-----------------+-----+-----------+
only showing top 10 rows



In [97]:
# Countries with the most recorded events, then those with the fewest
happinessTerrorismDF.orderBy("totalEvents", ascending=False).show(10)
happinessTerrorismDF.orderBy("totalEvents").show(10)

+------------+-----------------+-----+-----------+
|Overall rank|Country or region|Score|totalEvents|
+------------+-----------------+-----+-----------+
|          43|         Colombia|6.125|       7022|
|          65|             Peru|5.697|       6043|
|          35|      El Salvador|6.253|       5320|
|         140|            India|4.015|       4741|
|          15|   United Kingdom|7.054|       4468|
|         126|             Iraq|4.437|       3948|
|          30|            Spain|6.354|       3195|
|          67|         Pakistan|5.653|       2879|
|         130|        Sri Lanka|4.366|       2862|
|          79|           Turkey|5.373|       2732|
+------------+-----------------+-----+-----------+
only showing top 10 rows

+------------+-----------------+-----+-----------+
|Overall rank|Country or region|Score|totalEvents|
+------------+-----------------+-----+-----------+
|          87|     Turkmenistan|5.247|          1|
|           4|          Iceland|7.494|          2|
|    

Relation between corruption and terrorism: there is not a significant correlation

In [99]:
# Join the happiness report with the per-country event counts and keep the
# GDP and corruption-perception columns next to the event total
corruptionTerrorismDF = (
    happinessDF
    .join(SafestCountryDF, col("Country or region") == col("country_txt"), "inner")
    .select(
        "Overall rank",
        "Country or region",
        "GDP per capita",
        "Perceptions of corruption",
        "totalEvents",
    )
)
# Lowest corruption-perception values first, then highest first
corruptionTerrorismDF.orderBy("Perceptions of corruption").show(10)
corruptionTerrorismDF.orderBy("Perceptions of corruption", ascending=False).show(10)

+------------+-----------------+--------------+-------------------------+-----------+
|Overall rank|Country or region|GDP per capita|Perceptions of corruption|totalEvents|
+------------+-----------------+--------------+-------------------------+-----------+
|          71|          Moldova|         0.685|                      0.0|         18|
|          97|         Bulgaria|         1.092|                    0.004|         41|
|          48|          Romania|         1.162|                    0.005|          6|
|          46|           Kosovo|         0.882|                    0.006|        158|
|         133|          Ukraine|          0.82|                     0.01|         33|
|          62|          Hungary|         1.201|                     0.02|         42|
|          75|          Croatia|         1.155|                    0.022|         54|
|          86|       Kyrgyzstan|         0.551|                    0.023|         22|
|          80|         Malaysia|         1.221|       

Is GDP per capita correlated with terrorism? Overall, countries with higher GDP per capita tend to have fewer terrorism cases (with some exceptions, such as the USA).

In [110]:
# Drop the corruption column and rank the countries by GDP per capita, richest first
GDPTerrorismDF = (
    corruptionTerrorismDF
    .drop("Perceptions of corruption")
    .orderBy("GDP per capita", ascending=False)
)
GDPTerrorismDF.show(15)

+------------+--------------------+--------------+-----------+
|Overall rank|   Country or region|GDP per capita|totalEvents|
+------------+--------------------+--------------+-----------+
|          29|               Qatar|         1.684|          6|
|          14|          Luxembourg|         1.609|         16|
|          34|           Singapore|         1.572|          7|
|          21|United Arab Emirates|         1.503|         17|
|          51|              Kuwait|           1.5|         72|
|          16|             Ireland|         1.499|        150|
|           3|              Norway|         1.488|         14|
|           6|         Switzerland|         1.452|        105|
|          76|           Hong Kong|         1.438|         25|
|          19|       United States|         1.433|       2560|
|          28|        Saudi Arabia|         1.403|         59|
|           5|         Netherlands|         1.396|        114|
|           7|              Sweden|         1.387|     

However, if we look at the countries with the fewest cases, we see a mix of high-income and low-income countries. In low-income countries, social unrest may focus more on basic survival and needs, while wealthier countries often experience more political or ideological violence due to stronger institutions and more divided politics.


In [112]:
# Same table, re-ordered so the countries with the fewest events come first
GDPTerrorismDF.sort("totalEvents").show(15)

+------------+-----------------+--------------+-----------+
|Overall rank|Country or region|GDP per capita|totalEvents|
+------------+-----------------+--------------+-----------+
|          87|     Turkmenistan|         1.052|          1|
|           4|          Iceland|          1.38|          2|
|          57|        Mauritius|          1.12|          2|
|          73|       Montenegro|         1.051|          2|
|         120|           Gambia|         0.308|          3|
|         115|     Burkina Faso|         0.331|          3|
|          95|           Bhutan|         0.813|          3|
|          70|           Serbia|         1.004|          3|
|         104|            Gabon|         1.057|          4|
|         150|           Malawi|         0.191|          4|
|           1|          Finland|          1.34|          5|
|         142|          Comoros|         0.274|          5|
|          48|          Romania|         1.162|          6|
|          29|            Qatar|        

Upper/lower middle income countries have the most cases of terrorism. They often face significant challenges such as political instability, economic inequality, and weak institutions.

In [117]:
# Add the income group from the country metadata. The inner join matches on
# the country name, so countries spelled differently in the two tables are dropped.
IncomeGroupTerrorDP = (
    GDPTerrorismDF
    .join(countryDF, col("Country or region") == col("ShortName"), "inner")
    .select(
        "Overall rank",
        "Country or region",
        "GDP per capita",
        "totalEvents",
        "IncomeGroup",
    )
)
# Most affected countries first, then least affected first
IncomeGroupTerrorDP.orderBy("totalEvents", ascending=False).show(n=10, truncate=False)
IncomeGroupTerrorDP.orderBy("totalEvents").show(n=10, truncate=False)

+------------+-----------------+--------------+-----------+-------------------+
|Overall rank|Country or region|GDP per capita|totalEvents|IncomeGroup        |
+------------+-----------------+--------------+-----------+-------------------+
|43          |Colombia         |0.985         |7022       |Upper middle income|
|65          |Peru             |0.96          |6043       |Upper middle income|
|35          |El Salvador      |0.794         |5320       |Lower middle income|
|140         |India            |0.755         |4741       |Lower middle income|
|15          |United Kingdom   |1.333         |4468       |High income: OECD  |
|126         |Iraq             |1.043         |3948       |Upper middle income|
|30          |Spain            |1.286         |3195       |High income: OECD  |
|67          |Pakistan         |0.677         |2879       |Lower middle income|
|130         |Sri Lanka        |0.949         |2862       |Lower middle income|
|79          |Turkey           |1.183   

Correlation between happiness, HDI and terrorism: happiness and HDI are highly related.

In [163]:
# Combine happiness, event counts and the 2019 HDI value per country, giving
# the rank and HDI columns unambiguous names
joinedDF = (
    happinessTerrorismDF
    .join(indicator2019DF, col("Country or region") == col("Country"), "inner")
    .drop("Country")
    .withColumnRenamed("Overall rank", "Happiness rank")
    .withColumnRenamed("2019", "HDI_2019")
    .orderBy("Happiness rank")
)
joinedDF.show(20)

+--------------+--------------------+-----+-----------+--------+--------+
|Happiness rank|   Country or region|Score|totalEvents|HDI Rank|HDI_2019|
+--------------+--------------------+-----+-----------+--------+--------+
|             1|             Finland|7.769|          5|      11|   0.938|
|             2|             Denmark|  7.6|         33|      10|    0.94|
|             3|              Norway|7.554|         14|       1|   0.957|
|             4|             Iceland|7.494|          2|       4|   0.949|
|             5|         Netherlands|7.488|        114|       8|   0.944|
|             6|         Switzerland| 7.48|        105|       2|   0.955|
|             7|              Sweden|7.343|         54|       7|   0.945|
|             8|         New Zealand|7.307|         18|      14|   0.931|
|             9|              Canada|7.278|         54|      16|   0.929|
|            10|             Austria|7.246|        100|      18|   0.922|
|            11|           Australia|7

In [164]:
# Best HDI ranks first, then worst HDI ranks first
joinedDF.orderBy("HDI Rank").show(10)
joinedDF.orderBy("HDI Rank", ascending=False).show(10)

+--------------+-----------------+-----+-----------+--------+--------+
|Happiness rank|Country or region|Score|totalEvents|HDI Rank|HDI_2019|
+--------------+-----------------+-----+-----------+--------+--------+
|             3|           Norway|7.554|         14|       1|   0.957|
|            16|          Ireland|7.021|        150|       2|   0.955|
|             6|      Switzerland| 7.48|        105|       2|   0.955|
|             4|          Iceland|7.494|          2|       4|   0.949|
|            17|          Germany|6.985|        567|       6|   0.947|
|             7|           Sweden|7.343|         54|       7|   0.945|
|            11|        Australia|7.228|         74|       8|   0.944|
|             5|      Netherlands|7.488|        114|       8|   0.944|
|             2|          Denmark|  7.6|         33|      10|    0.94|
|            34|        Singapore|6.262|          7|      11|   0.938|
+--------------+-----------------+-----+-----------+--------+--------+
only s

Stats: average Happiness score and HDI.
We establish a constant (threshold) for both indexes based on:



*   HDI = "The cutoff-points are HDI of less than 0.550 for low human development, 0.550–0.699 for medium human development, 0.700–0.799 for high human development and 0.800 or greater for very high human development."
*   Happiness =  score above 5 can be considered neutral or happy





In [178]:
# One-row summary: number of countries, the two fixed thresholds described
# above, and the mean happiness score and mean 2019 HDI of the joined table
StatsDF = joinedDF.select(
    count("*").alias("count"),
    lit(5).alias("Score_threshold"),
    lit(0.550).alias("HDI_threshold"),
    mean("Score").alias("Score_mean"),
    mean("HDI_2019").alias("HDI_mean"),
)
StatsDF.show()

+-----+---------------+-------------+------------------+------------------+
|count|Score_threshold|HDI_threshold|        Score_mean|          HDI_mean|
+-----+---------------+-------------+------------------+------------------+
|  129|              5|         0.55|5.4670775193798455|0.7321782945736437|
+-----+---------------+-------------+------------------+------------------+



In [179]:
# Flag countries whose happiness score reaches the fixed threshold of 5.
# NOTE(review): despite its name, the "Happier than average" column compares
# against the constant 5, not against the computed mean score.
avgHappinessDF = joinedDF.withColumn(
    "Happier than average",
    col("Score") >= 5,
)
# Most affected countries first, then least affected first
avgHappinessDF.orderBy("totalEvents", ascending=False).show(10)
avgHappinessDF.orderBy("totalEvents").show(10)

+--------------+-----------------+-----+-----------+--------+--------+--------------------+
|Happiness rank|Country or region|Score|totalEvents|HDI Rank|HDI_2019|Happier than average|
+--------------+-----------------+-----+-----------+--------+--------+--------------------+
|            43|         Colombia|6.125|       7022|      83|   0.767|                true|
|            65|             Peru|5.697|       6043|      79|   0.777|                true|
|            35|      El Salvador|6.253|       5320|     124|   0.673|                true|
|           140|            India|4.015|       4741|     131|   0.645|               false|
|            15|   United Kingdom|7.054|       4468|      13|   0.932|                true|
|           126|             Iraq|4.437|       3948|     123|   0.674|               false|
|            30|            Spain|6.354|       3195|      25|   0.904|                true|
|            67|         Pakistan|5.653|       2879|     154|   0.557|          

In [174]:
# Flag countries whose 2019 HDI reaches the fixed threshold of 0.55.
# NOTE(review): despite its name, the "More developed than average" column
# compares against the constant 0.55, not against the computed mean HDI.
avgHDI = joinedDF.withColumn(
    "More developed than average",
    col("HDI_2019") >= 0.55,
)
# Most affected countries first, then least affected first
avgHDI.orderBy("totalEvents", ascending=False).show(10)
avgHDI.orderBy("totalEvents").show(10)

+--------------+-----------------+-----+-----------+--------+--------+---------------------------+
|Happiness rank|Country or region|Score|totalEvents|HDI Rank|HDI_2019|More developed than average|
+--------------+-----------------+-----+-----------+--------+--------+---------------------------+
|            43|         Colombia|6.125|       7022|      83|   0.767|                       true|
|            65|             Peru|5.697|       6043|      79|   0.777|                       true|
|            35|      El Salvador|6.253|       5320|     124|   0.673|                       true|
|           140|            India|4.015|       4741|     131|   0.645|                       true|
|            15|   United Kingdom|7.054|       4468|      13|   0.932|                       true|
|           126|             Iraq|4.437|       3948|     123|   0.674|                       true|
|            30|            Spain|6.354|       3195|      25|   0.904|                       true|
|         