## Install Pyspark

In [1]:
# Uncomment to install PySpark into the kernel's environment:
# %pip install pyspark

## Import required libraries

In [2]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import HTML, display
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Configure Spark Session and Properties

In [3]:
%matplotlib inline
warnings.filterwarnings("ignore")

spark = SparkSession.builder \
          .master("local[*]") \
          .appName("Facts Analysis") \
          .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/13 13:54:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Inject a CSS rule so <pre> output is never wrapped: wide Spark `.show()`
# tables then scroll horizontally instead of breaking across lines.
# `HTML` and `display` come from the public `IPython.display` module imported
# in the notebook's import cell.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [5]:
# Display the active SparkSession object as the cell's output.
spark

## Prepare Data

### Creating Pandas Dataframe

In [6]:
# Alternative input paths for running this notebook on Kaggle:
#pandas_df_15=pd.read_csv("/kaggle/input/world-happiness/2015.csv")
#pandas_df_16=pd.read_csv("/kaggle/input/world-happiness/2016.csv")
#pandas_df_17=pd.read_csv("/kaggle/input/world-happiness/2017.csv")
#pandas_df_18=pd.read_csv("/kaggle/input/world-happiness/2018.csv")
#pandas_df_19=pd.read_csv("/kaggle/input/world-happiness/2019.csv")

In [7]:
# Directory containing the yearly World Happiness Report CSV files
# (2015.csv ... 2019.csv). Set the WORLD_HAPPINESS_DATA_DIR environment
# variable to point at another copy of the dataset; the default keeps the
# location this notebook was originally run against.
DATA_DIR = Path(os.environ.get(
    "WORLD_HAPPINESS_DATA_DIR",
    "/Users/abhi011097/Workspace/Dataset_Analysis/4_World_Happiness_Report/Dataset",
))

# One pandas DataFrame per report year.
pandas_df_15 = pd.read_csv(DATA_DIR / "2015.csv")
pandas_df_16 = pd.read_csv(DATA_DIR / "2016.csv")
pandas_df_17 = pd.read_csv(DATA_DIR / "2017.csv")
pandas_df_18 = pd.read_csv(DATA_DIR / "2018.csv")
pandas_df_19 = pd.read_csv(DATA_DIR / "2019.csv")

### Creating Spark Dataframe

In [8]:
# Convert each yearly pandas frame into a Spark DataFrame, in year order.
spark_df_15, spark_df_16, spark_df_17, spark_df_18, spark_df_19 = map(
    spark.createDataFrame,
    (pandas_df_15, pandas_df_16, pandas_df_17, pandas_df_18, pandas_df_19),
)

### Creating Spark Table 

In [9]:
# Expose each yearly DataFrame to Spark SQL under the name `spark_tbl_<yy>`.
# `createOrReplaceTempView` replaces `registerTempTable`, which has been
# deprecated since Spark 2.0; it also makes the cell safe to re-run because an
# existing view of the same name is simply replaced.
spark_df_15.createOrReplaceTempView("spark_tbl_15")
spark_df_16.createOrReplaceTempView("spark_tbl_16")
spark_df_17.createOrReplaceTempView("spark_tbl_17")
spark_df_18.createOrReplaceTempView("spark_tbl_18")
spark_df_19.createOrReplaceTempView("spark_tbl_19")

## Investigate Data

### No of partitions of Spark Dataframe

In [10]:
# Number of partitions of the RDD underlying the 2015 Spark DataFrame.
spark_df_15.rdd.getNumPartitions() # 8 in my local system as it has 8 cores

8

### Dataframe Types

In [11]:
# Confirm the pandas object's type, for comparison with the Spark one below.
type(pandas_df_15)

pandas.core.frame.DataFrame

In [12]:
# Confirm the type of the Spark counterpart built from the same data.
type(spark_df_15)

pyspark.sql.dataframe.DataFrame

### Check Schema

#### Using Pandas

In [13]:
# Print the column names, non-null counts and dtypes of the 2015 pandas frame.
pandas_df_15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

#### Using Spark Dataframe

In [14]:
# Print the Spark schema (column name, type, nullability) of the 2015 frame.
spark_df_15.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Happiness Rank: long (nullable = true)
 |-- Happiness Score: double (nullable = true)
 |-- Standard Error: double (nullable = true)
 |-- Economy (GDP per Capita): double (nullable = true)
 |-- Family: double (nullable = true)
 |-- Health (Life Expectancy): double (nullable = true)
 |-- Freedom: double (nullable = true)
 |-- Trust (Government Corruption): double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Dystopia Residual: double (nullable = true)



### Describe Dataframe

#### For Spark Dataframe

In [15]:
# Summary statistics for every column; truncate=False prints full cell values
# instead of cutting long strings short.
spark_df_15.describe().show(truncate=False)

                                                                                

+-------+-----------+-------------------------+-----------------+------------------+--------------------+------------------------+------------------+------------------------+-------------------+-----------------------------+-------------------+------------------+
|summary|Country    |Region                   |Happiness Rank   |Happiness Score   |Standard Error      |Economy (GDP per Capita)|Family            |Health (Life Expectancy)|Freedom            |Trust (Government Corruption)|Generosity         |Dystopia Residual |
+-------+-----------+-------------------------+-----------------+------------------+--------------------+------------------------+------------------+------------------------+-------------------+-----------------------------+-------------------+------------------+
|count  |158        |158                      |158              |158               |158                 |158                     |158               |158                     |158                |158           

#### For Pandas Dataframe

In [16]:
# Summary statistics (count, mean, std, quartiles, min/max) of numeric columns.
pandas_df_15.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [17]:
# Summary of the object (string) columns: count, unique values, mode, frequency.
pandas_df_15.describe(include='O')

Unnamed: 0,Country,Region
count,158,158
unique,158,10
top,Switzerland,Sub-Saharan Africa
freq,1,40


### Check Data

In [18]:
# Preview the first two rows of the 2015 pandas frame.
pandas_df_15.head(2)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201


In [19]:
# Preview the first two rows of the 2015 Spark DataFrame.
spark_df_15.show(2)

+-----------+--------------+--------------+---------------+--------------+------------------------+-------+------------------------+-------+-----------------------------+----------+-----------------+
|    Country|        Region|Happiness Rank|Happiness Score|Standard Error|Economy (GDP per Capita)| Family|Health (Life Expectancy)|Freedom|Trust (Government Corruption)|Generosity|Dystopia Residual|
+-----------+--------------+--------------+---------------+--------------+------------------------+-------+------------------------+-------+-----------------------------+----------+-----------------+
|Switzerland|Western Europe|             1|          7.587|       0.03411|                 1.39651|1.34951|                 0.94143|0.66557|                      0.41978|   0.29678|          2.51738|
|    Iceland|Western Europe|             2|          7.561|       0.04884|                 1.30232|1.40223|                 0.94784|0.62877|                      0.14145|    0.4363|          2.70201|


In [20]:
# Same two-row preview, this time queried through the `spark_tbl_15` SQL table.
spark.sql("Select * from spark_tbl_15 limit 2").show(2)

+-----------+--------------+--------------+---------------+--------------+------------------------+-------+------------------------+-------+-----------------------------+----------+-----------------+
|    Country|        Region|Happiness Rank|Happiness Score|Standard Error|Economy (GDP per Capita)| Family|Health (Life Expectancy)|Freedom|Trust (Government Corruption)|Generosity|Dystopia Residual|
+-----------+--------------+--------------+---------------+--------------+------------------------+-------+------------------------+-------+-----------------------------+----------+-----------------+
|Switzerland|Western Europe|             1|          7.587|       0.03411|                 1.39651|1.34951|                 0.94143|0.66557|                      0.41978|   0.29678|          2.51738|
|    Iceland|Western Europe|             2|          7.561|       0.04884|                 1.30232|1.40223|                 0.94784|0.62877|                      0.14145|    0.4363|          2.70201|


## Understand features available for reporting in all files 

In [21]:
# List the column names of each yearly file side by side; the naming scheme
# changes from year to year, which the later queries have to account for.
print("pandas_df_15 -> ", list(pandas_df_15.columns))
print("pandas_df_16 -> ", list(pandas_df_16.columns))
print("pandas_df_17 -> ", list(pandas_df_17.columns))
print("pandas_df_18 -> ", list(pandas_df_18.columns))
print("pandas_df_19 -> ", list(pandas_df_19.columns))

pandas_df_15 ->  ['Country', 'Region', 'Happiness Rank', 'Happiness Score', 'Standard Error', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual']
pandas_df_16 ->  ['Country', 'Region', 'Happiness Rank', 'Happiness Score', 'Lower Confidence Interval', 'Upper Confidence Interval', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual']
pandas_df_17 ->  ['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high', 'Whisker.low', 'Economy..GDP.per.Capita.', 'Family', 'Health..Life.Expectancy.', 'Freedom', 'Generosity', 'Trust..Government.Corruption.', 'Dystopia.Residual']
pandas_df_18 ->  ['Overall rank', 'Country or region', 'Score', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']
pandas_df_19 ->  ['Overall rank', 'Cou

## Querying Data with different methods

### Find the top 5 countries in each year
* Sorted by year, then by rank

* Output Format Using Pivot

    *  2015  | rank1 | rank2 | rank3 | rank4 | rank5
    *  2016  | rank1 | rank2 | rank3 | rank4 | rank5
    * ...
    

#### Using Spark Sql

In [22]:
# Top-5 countries per year, pivoted to one row per year with one column per
# rank (Rank_1 ... Rank_5).
#
# The CTE stacks the top-5 rows of every yearly table into a common
# (year, rank, country) shape; column names differ between files, hence the
# per-year aliases. UNION ALL is used because the year literal already makes
# rows from different tables distinct, so a de-duplicating UNION only adds
# work. Ordering is applied once, on the final pivoted result: an ORDER BY
# inside the CTE would be discarded by the pivot's aggregation.
spark.sql("""
            with ref_tbl as (
                select '2015' as year, `Happiness Rank` as rank, country from spark_tbl_15 where `Happiness Rank` <= 5
                union all
                select '2016' as year, `Happiness Rank` as rank, country from spark_tbl_16 where `Happiness Rank` <= 5
                union all
                select '2017' as year, `Happiness.Rank` as rank, country from spark_tbl_17 where `Happiness.Rank` <= 5
                union all
                select '2018' as year, `Overall rank` as rank, `Country or region` as country from spark_tbl_18 where `Overall rank` <= 5
                union all
                select '2019' as year, `Overall rank` as rank, `Country or region` as country from spark_tbl_19 where `Overall rank` <= 5
            )

            select * from ref_tbl
            PIVOT (
                      max(country)
                      FOR rank in (
                        1 Rank_1, 2 Rank_2, 3 Rank_3, 4 Rank_4, 5 Rank_5
                      )
                    )
            ORDER BY year
          """).show()

[Stage 9:>                                                          (0 + 1) / 1]

+----+-----------+-----------+-------+-----------+-----------+
|year|     Rank_1|     Rank_2| Rank_3|     Rank_4|     Rank_5|
+----+-----------+-----------+-------+-----------+-----------+
|2015|Switzerland|    Iceland|Denmark|     Norway|     Canada|
|2016|    Denmark|Switzerland|Iceland|     Norway|    Finland|
|2017|     Norway|    Denmark|Iceland|Switzerland|    Finland|
|2018|    Finland|     Norway|Denmark|    Iceland|Switzerland|
|2019|    Finland|    Denmark| Norway|    Iceland|Netherlands|
+----+-----------+-----------+-------+-----------+-----------+



                                                                                

#### Using Pandas

In [23]:
# Stack (Country, Rank, Year) from every yearly file. Each tuple names the
# frame, its country column, its rank column and the year label, so the
# differing column names are normalised in one place.
temp_res = pd.concat([
    frame[[country_col, rank_col]]
    .assign(Year=year)
    .rename(columns={country_col: 'Country', rank_col: 'Rank'})
    for frame, country_col, rank_col, year in (
        (pandas_df_15, 'Country', 'Happiness Rank', '2015'),
        (pandas_df_16, 'Country', 'Happiness Rank', '2016'),
        (pandas_df_17, 'Country', 'Happiness.Rank', '2017'),
        (pandas_df_18, 'Country or region', 'Overall rank', '2018'),
        (pandas_df_19, 'Country or region', 'Overall rank', '2019'),
    )
])

# Keep ranks 1-5 and pivot to one row per year, one column per rank.
(
    temp_res.loc[temp_res['Rank'] <= 5]
    .pivot(index='Year', columns='Rank', values='Country')
    .rename(columns={rank: f'Rank_{rank}' for rank in range(1, 6)})
    .rename_axis(None, axis=1)
)

Unnamed: 0_level_0,Rank_1,Rank_2,Rank_3,Rank_4,Rank_5
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015,Switzerland,Iceland,Denmark,Norway,Canada
2016,Denmark,Switzerland,Iceland,Norway,Finland
2017,Norway,Denmark,Iceland,Switzerland,Finland
2018,Finland,Norway,Denmark,Iceland,Switzerland
2019,Finland,Denmark,Norway,Iceland,Netherlands


#### Using Spark Dataframe

In [24]:
# Top-5 countries per year via the DataFrame API.
#
# Each yearly frame is projected to a common (Country, Rank, Year) shape,
# the frames are unioned by position, filtered to ranks 1-5 and pivoted to one
# row per year. The pivot values are listed explicitly so Spark does not need
# an extra job to discover the distinct ranks, and the result is ordered by
# Year so the row order is deterministic and matches the Spark SQL version.
(
    spark_df_15.select('Country', F.col('Happiness Rank').alias('Rank'), F.lit(2015).alias('Year'))
    .union(spark_df_16.select('Country', F.col('Happiness Rank').alias('Rank'), F.lit(2016).alias('Year')))
    .union(spark_df_17.select('Country', F.col('`Happiness.Rank`').alias('Rank'), F.lit(2017).alias('Year')))
    .union(spark_df_18.select(F.col('Country or region').alias('Country'), F.col('`Overall rank`').alias('Rank'), F.lit(2018).alias('Year')))
    .union(spark_df_19.select(F.col('Country or region').alias('Country'), F.col('`Overall rank`').alias('Rank'), F.lit(2019).alias('Year')))
    .filter(F.col('Rank') <= 5)
    .groupBy("Year")
    .pivot("Rank", [1, 2, 3, 4, 5])
    .agg(F.first('Country'))
    .withColumnRenamed("1", "Rank_1")
    .withColumnRenamed("2", "Rank_2")
    .withColumnRenamed("3", "Rank_3")
    .withColumnRenamed("4", "Rank_4")
    .withColumnRenamed("5", "Rank_5")
    .orderBy("Year")
    .show()
)

+----+-----------+-----------+-------+-----------+-----------+
|Year|     Rank_1|     Rank_2| Rank_3|     Rank_4|     Rank_5|
+----+-----------+-----------+-------+-----------+-----------+
|2015|Switzerland|    Iceland|Denmark|     Norway|     Canada|
|2016|    Denmark|Switzerland|Iceland|     Norway|    Finland|
|2017|     Norway|    Denmark|Iceland|Switzerland|    Finland|
|2018|    Finland|     Norway|Denmark|    Iceland|Switzerland|
|2019|    Finland|    Denmark| Norway|    Iceland|Netherlands|
+----+-----------+-----------+-------+-----------+-----------+



### Find the countries which never observed a drop in rank, along with year-wise rank and difference in rank from 2015 to 2019
* Sort the data by biggest to lowest change between 2015 to 2019

#### Using Spark Sql

In [25]:
# Countries whose rank never got worse from one year to the next.
#
# The five yearly tables are inner-joined on country name, so a country that
# is missing from any year (or spelled differently there) is excluded. Each
# ON clause also requires the earlier year's rank number to be >= the later
# year's, i.e. the rank stayed the same or improved (rank 1 is the best).
# Increment_Factor is the 2015 rank minus the 2019 rank; rows are sorted by
# it in descending order. show(200) prints up to 200 rows.
spark.sql("""Select t1.Country ,t1.`Happiness Rank` as Rank_2015,t2.`Happiness Rank` Rank_2016,t3.`Happiness.Rank`  Rank_2017, 
            t4.`Overall rank` Rank_2018 ,t5.`Overall rank` Rank_2019,
            (t1.`Happiness Rank`-t5.`Overall rank`) Increment_Factor
            from  spark_tbl_15  t1
            inner join spark_tbl_16 t2
            on t1.Country = t2.Country and t1.`Happiness Rank` >= t2.`Happiness Rank`
            inner join spark_tbl_17 t3
            on t1.Country = t3.Country and t2.`Happiness Rank` >= t3.`Happiness.Rank`
            inner join spark_tbl_18 t4
            on t1.Country = t4.`Country or region` and t3.`Happiness.Rank` >= t4.`Overall rank`
            inner join spark_tbl_19 t5
            on t1.Country = t5.`Country or region` and t4.`Overall rank` >= t5.`Overall rank`
            order by Increment_Factor desc
          
          """).show(200)

                                                                                

+-------------------+---------+---------+---------+---------+---------+----------------+
|            Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Increment_Factor|
+-------------------+---------+---------+---------+---------+---------+----------------+
|              Benin|      155|      153|      143|      136|      102|              53|
|        Ivory Coast|      151|      139|      128|      107|       99|              52|
|           Honduras|      105|      104|       91|       72|       59|              46|
|            Hungary|      104|       91|       75|       69|       62|              42|
|            Romania|       86|       71|       57|       52|       48|              38|
|       Burkina Faso|      152|      145|      134|      121|      115|              37|
|           Cameroon|      133|      114|      107|       99|       96|              37|
|           Bulgaria|      134|      129|      105|      100|       97|              37|
|Congo (Brazzaville)|

#### Using Pandas

In [26]:
# Countries whose rank never got worse from one year to the next (pandas).
#
# Years are merged in one at a time on country name (inner merges, so a
# country missing from any year is dropped). After each merge the new rank
# column is renamed to Rank_<year> and rows where the rank number increased
# relative to the previous year are filtered out.
temp_res = (
    pd.merge(pandas_df_15[['Country', 'Happiness Rank']], pandas_df_16[['Country', 'Happiness Rank']], on="Country")
    .rename(columns={'Happiness Rank_x': 'Rank_2015', 'Happiness Rank_y': 'Rank_2016'})
    .query("Rank_2015 >= Rank_2016")
    .merge(pandas_df_17[['Country', 'Happiness.Rank']], on="Country")
    .rename(columns={'Happiness.Rank': 'Rank_2017'})
    .query("Rank_2016 >= Rank_2017")
    .merge(pandas_df_18[['Country or region', 'Overall rank']], left_on="Country", right_on="Country or region")
    .rename(columns={'Overall rank': 'Rank_2018'})
    .query("Rank_2017 >= Rank_2018")
    .drop(columns='Country or region')
    .merge(pandas_df_19[['Country or region', 'Overall rank']], left_on="Country", right_on="Country or region")
    .rename(columns={'Overall rank': 'Rank_2019'})
    .query("Rank_2018 >= Rank_2019")
    .drop(columns='Country or region')
)

# Increment_Factor = places gained between 2015 and 2019; largest gain first.
# `Styler.hide(axis="index")` replaces `Styler.hide_index()`, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
(
    temp_res.assign(Increment_Factor=temp_res["Rank_2015"] - temp_res["Rank_2019"])
    .sort_values('Increment_Factor', ascending=False)
    .style.hide(axis="index")
)

Country,Rank_2015,Rank_2016,Rank_2017,Rank_2018,Rank_2019,Increment_Factor
Benin,155,153,143,136,102,53
Ivory Coast,151,139,128,107,99,52
Honduras,105,104,91,72,59,46
Hungary,104,91,75,69,62,42
Romania,86,71,57,52,48,38
Cameroon,133,114,107,99,96,37
Burkina Faso,152,145,134,121,115,37
Bulgaria,134,129,105,100,97,37
Cambodia,145,140,129,120,109,36
Congo (Brazzaville),139,127,124,114,103,36


#### Using Spark Dataframe

In [27]:
# Countries whose rank never got worse from one year to the next (DataFrame API).
# Each year is projected to (Country, Rank_<year>), inner-joined on Country,
# and rows where the rank number rose versus the previous year are removed.
(
    spark_df_15.select('Country', F.col('Happiness Rank').alias('Rank_2015'))
    .join(spark_df_16.select('Country', F.col('Happiness Rank').alias('Rank_2016')), on='Country')
    .where(F.col('Rank_2016') <= F.col('Rank_2015'))
    .join(spark_df_17.select('Country', F.col('`Happiness.Rank`').alias('Rank_2017')), on='Country')
    .where(F.col('Rank_2017') <= F.col('Rank_2016'))
    .join(spark_df_18.select(F.col('Country or region').alias('Country'), F.col('`Overall rank`').alias('Rank_2018')), on='Country')
    .where(F.col('Rank_2018') <= F.col('Rank_2017'))
    .join(spark_df_19.select(F.col('Country or region').alias('Country'), F.col('`Overall rank`').alias('Rank_2019')), on='Country')
    .where(F.col('Rank_2019') <= F.col('Rank_2018'))
    # Places gained between 2015 and 2019; largest gain first.
    .withColumn('Increment_Factor', F.col('Rank_2015') - F.col('Rank_2019'))
    .orderBy(F.col('Increment_Factor').desc())
    .show(200)
)
    

+-------------------+---------+---------+---------+---------+---------+----------------+
|            Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Increment_Factor|
+-------------------+---------+---------+---------+---------+---------+----------------+
|              Benin|      155|      153|      143|      136|      102|              53|
|        Ivory Coast|      151|      139|      128|      107|       99|              52|
|           Honduras|      105|      104|       91|       72|       59|              46|
|            Hungary|      104|       91|       75|       69|       62|              42|
|            Romania|       86|       71|       57|       52|       48|              38|
|       Burkina Faso|      152|      145|      134|      121|      115|              37|
|           Cameroon|      133|      114|      107|       99|       96|              37|
|           Bulgaria|      134|      129|      105|      100|       97|              37|
|           Cambodia|

### Find the countries which continuously observed a drop in rank, along with year-wise rank and difference in rank from 2015 to 2019
* Sort the data by biggest to lowest change between 2015 to 2019

#### Using Spark Sql

In [28]:
# Countries whose rank got strictly worse every single year.
#
# Same inner-join chain on country name as the "never dropped" query, but
# each ON clause requires the earlier year's rank number to be strictly
# smaller than the later year's (a larger rank number is a worse placing).
# Decrement_Factor is the 2015 rank minus the 2019 rank, so it is negative
# here; ascending order puts the biggest fall first. show(200) prints up to
# 200 rows.
spark.sql("""Select t1.Country ,t1.`Happiness Rank` as Rank_2015,t2.`Happiness Rank` Rank_2016,t3.`Happiness.Rank`  Rank_2017, 
            t4.`Overall rank` Rank_2018 ,t5.`Overall rank` Rank_2019,
            (t1.`Happiness Rank`-t5.`Overall rank`) Decrement_Factor
            from  spark_tbl_15  t1
            inner join spark_tbl_16 t2
            on t1.Country = t2.Country and t1.`Happiness Rank` < t2.`Happiness Rank`
            inner join spark_tbl_17 t3
            on t1.Country = t3.Country and t2.`Happiness Rank` < t3.`Happiness.Rank`
            inner join spark_tbl_18 t4
            on t1.Country = t4.`Country or region` and t3.`Happiness.Rank` < t4.`Overall rank`
            inner join spark_tbl_19 t5
            on t1.Country = t5.`Country or region` and t4.`Overall rank` < t5.`Overall rank`
            order by Decrement_Factor asc
          
          """).show(200)

                                                                                

+-----------+---------+---------+---------+---------+---------+----------------+
|    Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Decrement_Factor|
+-----------+---------+---------+---------+---------+---------+----------------+
|  Venezuela|       23|       44|       82|      102|      108|             -85|
|     Zambia|       85|      106|      116|      125|      138|             -53|
|   Zimbabwe|      115|      131|      138|      144|      146|             -31|
|      India|      117|      118|      122|      133|      140|             -23|
|    Belarus|       59|       61|       67|       73|       81|             -22|
|   Botswana|      128|      137|      142|      146|      148|             -20|
|     Malawi|      131|      132|      136|      147|      150|             -19|
|    Moldova|       52|       55|       56|       67|       71|             -19|
|     Brazil|       16|       17|       22|       28|       32|             -16|
| Azerbaijan|       80|     

#### Using Pandas

In [29]:

# Countries whose rank got strictly worse every single year (pandas).
#
# Years are merged in one at a time on country name (inner merges, so a
# country missing from any year is dropped). After each merge the new rank
# column is renamed to Rank_<year> and only rows where the rank number
# strictly increased versus the previous year are kept.
temp_res = (
    pd.merge(pandas_df_15[['Country', 'Happiness Rank']], pandas_df_16[['Country', 'Happiness Rank']], on="Country")
    .rename(columns={'Happiness Rank_x': 'Rank_2015', 'Happiness Rank_y': 'Rank_2016'})
    .query("Rank_2015 < Rank_2016")
    .merge(pandas_df_17[['Country', 'Happiness.Rank']], on="Country")
    .rename(columns={'Happiness.Rank': 'Rank_2017'})
    .query("Rank_2016 < Rank_2017")
    .merge(pandas_df_18[['Country or region', 'Overall rank']], left_on="Country", right_on="Country or region")
    .rename(columns={'Overall rank': 'Rank_2018'})
    .query("Rank_2017 < Rank_2018")
    .drop(columns='Country or region')
    .merge(pandas_df_19[['Country or region', 'Overall rank']], left_on="Country", right_on="Country or region")
    .rename(columns={'Overall rank': 'Rank_2019'})
    .query("Rank_2018 < Rank_2019")
    .drop(columns='Country or region')
)

# Decrement_Factor = 2015 rank minus 2019 rank (negative here); ascending
# order puts the biggest fall first.
# `Styler.hide(axis="index")` replaces `Styler.hide_index()`, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
(
    temp_res.assign(Decrement_Factor=temp_res["Rank_2015"] - temp_res["Rank_2019"])
    .sort_values('Decrement_Factor')
    .style.hide(axis="index")
)

Country,Rank_2015,Rank_2016,Rank_2017,Rank_2018,Rank_2019,Decrement_Factor
Venezuela,23,44,82,102,108,-85
Zambia,85,106,116,125,138,-53
Zimbabwe,115,131,138,144,146,-31
India,117,118,122,133,140,-23
Belarus,59,61,67,73,81,-22
Botswana,128,137,142,146,148,-20
Moldova,52,55,56,67,71,-19
Malawi,131,132,136,147,150,-19
Brazil,16,17,22,28,32,-16
Azerbaijan,80,81,85,87,90,-10


#### Using Spark Dataframe

In [30]:
# Countries whose rank got strictly worse every single year (DataFrame API).
# Each year is projected to (Country, Rank_<year>), inner-joined on Country,
# and only rows whose rank number strictly rose versus the previous year stay.
(
    spark_df_15.select('Country', F.col('Happiness Rank').alias('Rank_2015'))
    .join(spark_df_16.select('Country', F.col('Happiness Rank').alias('Rank_2016')), on='Country')
    .where(F.col('Rank_2016') > F.col('Rank_2015'))
    .join(spark_df_17.select('Country', F.col('`Happiness.Rank`').alias('Rank_2017')), on='Country')
    .where(F.col('Rank_2017') > F.col('Rank_2016'))
    .join(spark_df_18.select(F.col('Country or region').alias('Country'), F.col('`Overall rank`').alias('Rank_2018')), on='Country')
    .where(F.col('Rank_2018') > F.col('Rank_2017'))
    .join(spark_df_19.select(F.col('Country or region').alias('Country'), F.col('`Overall rank`').alias('Rank_2019')), on='Country')
    .where(F.col('Rank_2019') > F.col('Rank_2018'))
    # 2015 rank minus 2019 rank (negative here); biggest fall first.
    .withColumn('Decrement_Factor', F.col('Rank_2015') - F.col('Rank_2019'))
    .orderBy(F.col('Decrement_Factor').asc())
    .show(200)
)
    

                                                                                

+-----------+---------+---------+---------+---------+---------+----------------+
|    Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Decrement_Factor|
+-----------+---------+---------+---------+---------+---------+----------------+
|  Venezuela|       23|       44|       82|      102|      108|             -85|
|     Zambia|       85|      106|      116|      125|      138|             -53|
|   Zimbabwe|      115|      131|      138|      144|      146|             -31|
|      India|      117|      118|      122|      133|      140|             -23|
|    Belarus|       59|       61|       67|       73|       81|             -22|
|   Botswana|      128|      137|      142|      146|      148|             -20|
|     Malawi|      131|      132|      136|      147|      150|             -19|
|    Moldova|       52|       55|       56|       67|       71|             -19|
|     Brazil|       16|       17|       22|       28|       32|             -16|
| Azerbaijan|       80|     

In [31]:
# Stop the SparkSession created at the top of the notebook; Spark cells must
# not be re-run after this without creating a new session first.
spark.stop()


### In progress