In [43]:
import findspark
findspark.init()

In [44]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('World Happiness Data').getOrCreate()
spark

In [45]:
df=spark.read.csv('../World Happiness Report 2021/world-happiness-report.csv',header=True)
# df.count()
df.show(5)



+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
|Country name|year|Life Ladder|Log GDP per capita|Social support|Healthy life expectancy at birth|Freedom to make life choices|Generosity|Perceptions of corruption|Positive affect|Negative affect|
+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
| Afghanistan|2008|      3.724|             7.370|         0.451|                          50.800|                       0.718|     0.168|                    0.882|          0.518|          0.258|
| Afghanistan|2009|      4.402|             7.540|         0.552|                          51.200|                       0.679|     0.190|                    0.850|          0.584|          0.237|
| Afghanistan|2

In [46]:
df.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- year: string (nullable = true)
 |-- Life Ladder: string (nullable = true)
 |-- Log GDP per capita: string (nullable = true)
 |-- Social support: string (nullable = true)
 |-- Healthy life expectancy at birth: string (nullable = true)
 |-- Freedom to make life choices: string (nullable = true)
 |-- Generosity: string (nullable = true)
 |-- Perceptions of corruption: string (nullable = true)
 |-- Positive affect: string (nullable = true)
 |-- Negative affect: string (nullable = true)



In [47]:
#Need to convert all columns to double except for Year and Country
from pyspark.sql.functions import col
for i in df.columns :
    if i not in ['year','Country name']:
        df=df.withColumn(i, col(i).cast('Double'))
        
df=df.withColumn('year',col('year').cast('bigint'))   
df.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- year: long (nullable = true)
 |-- Life Ladder: double (nullable = true)
 |-- Log GDP per capita: double (nullable = true)
 |-- Social support: double (nullable = true)
 |-- Healthy life expectancy at birth: double (nullable = true)
 |-- Freedom to make life choices: double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Perceptions of corruption: double (nullable = true)
 |-- Positive affect: double (nullable = true)
 |-- Negative affect: double (nullable = true)



**Which Country has the highest and lowest Healthy life expectancy at birth over the year?**

In [48]:
from pyspark.sql.functions import col
max_le=df.agg({"Healthy life expectancy at birth":'max'}).collect()[0][0]
min_le=df.agg({"Healthy life expectancy at birth":'min'}).collect()[0][0]
df.filter(col("Healthy life expectancy at birth").isin(max_le,min_le))\
    .select("Country name","Healthy life expectancy at birth").distinct()\
    .show(truncate=False)


+------------+--------------------------------+
|Country name|Healthy life expectancy at birth|
+------------+--------------------------------+
|Singapore   |77.1                            |
|Haiti       |32.3                            |
+------------+--------------------------------+



In [49]:
#Check for nulls 
# Find count for empty, None, Null, Nan with string literals.
from pyspark.sql.functions import col,isnan,when,count

def check_nulls(dataframe,columns): 
    return dataframe.select([count(when(col(c).contains('None') | \
                                col(c).contains('NULL') | \
                                (col(c) == '' ) | \
                                col(c).isNull() | \
                                isnan(c), c 
                               )).alias(c)
                        for c in columns])


null_check=check_nulls(df,df.columns)
null_check.show()

#Columns

+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
|Country name|year|Life Ladder|Log GDP per capita|Social support|Healthy life expectancy at birth|Freedom to make life choices|Generosity|Perceptions of corruption|Positive affect|Negative affect|
+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
|           0|   0|          0|                36|            13|                              55|                          32|        89|                      110|             22|             16|
+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+



In [50]:
# fill nulls  with 0
df=df.na.fill(value=0,subset=[c for c in df.columns if c not in ['Country name','year']])
null_check=check_nulls(df,df.columns)
null_check.show()

+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
|Country name|year|Life Ladder|Log GDP per capita|Social support|Healthy life expectancy at birth|Freedom to make life choices|Generosity|Perceptions of corruption|Positive affect|Negative affect|
+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
|           0|   0|          0|                 0|             0|                               0|                           0|         0|                        0|              0|              0|
+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+



**Find a highly corrupted Country By Year (order from high to low)**

In [51]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank,desc
w= Window().partitionBy("Year").orderBy(col("Perceptions of corruption").desc())
df.withColumn('rn',rank().over(w)).filter(col('rn')==1)\
        .select('Country name','year',"Perceptions of corruption").orderBy(desc('Perceptions of corruption')).show(truncate=False)



+----------------------+----+-------------------------+
|Country name          |year|Perceptions of corruption|
+----------------------+----+-------------------------+
|Poland                |2005|0.983                    |
|Hungary               |2010|0.983                    |
|Lithuania             |2009|0.979                    |
|Serbia                |2011|0.977                    |
|Croatia               |2011|0.977                    |
|Bosnia and Herzegovina|2014|0.976                    |
|Bulgaria              |2007|0.976                    |
|Indonesia             |2013|0.973                    |
|Moldova               |2016|0.969                    |
|Indonesia             |2008|0.968                    |
|Lithuania             |2006|0.967                    |
|Bosnia and Herzegovina|2019|0.963                    |
|Indonesia             |2012|0.962                    |
|Romania               |2015|0.962                    |
|Croatia               |2020|0.961              

**Which Country had a Positive and Negative effect in the Great Depression (2008)?**

In [52]:
df.select('Positive affect','Negative affect').filter('year==2008').describe().show()

+-------+------------------+-------------------+
|summary|   Positive affect|    Negative affect|
+-------+------------------+-------------------+
|  count|               110|                110|
|   mean|0.6933999999999999| 0.2435454545454546|
| stddev| 0.145500855951327|0.07080566975219957|
|    min|               0.0|               0.12|
|    max|              0.89|              0.448|
+-------+------------------+-------------------+



In [53]:
df.filter(((col('year')==2008) & (col('Positive affect')==0.89) | (col('Negative affect')==0.12)))\
        .select('Country name','year','Positive affect','Negative affect').show()

+------------+----+---------------+---------------+
|Country name|year|Positive affect|Negative affect|
+------------+----+---------------+---------------+
|      Canada|2008|           0.89|          0.202|
|    Djibouti|2008|          0.755|           0.12|
+------------+----+---------------+---------------+



**Find the country whose GDP remained almost constant .**

In [54]:
from pyspark.sql.functions import mean,abs,avg


w= Window().partitionBy("Country name")
df1=df.withColumn('mean',mean('Log GDP per capita').over(w))\
    .select('Country name','year','Log GDP per capita','mean',abs(col('mean')-col('Log GDP per capita')).alias('abs_diff'))

# df1.show()

df2=df1.withColumn('avg_diff',avg(col('abs_diff')).over(w))\
        .select('Country name','avg_diff').distinct().orderBy('avg_diff')\
        .show()



+-----------------+--------------------+
|     Country name|            avg_diff|
+-----------------+--------------------+
|             Cuba|                 0.0|
|          Somalia|                 0.0|
|           Guyana|                 0.0|
|Somaliland region|                 0.0|
|      South Sudan|                 0.0|
|     North Cyprus|                 0.0|
|         Suriname|                 0.0|
|             Oman|                 0.0|
|         Maldives|                 0.0|
|           Belize|0.004500000000000...|
|           Norway|0.011100000000000065|
|            Gabon|0.015950617283950308|
|          Austria| 0.01862721893491121|
|          Jamaica| 0.02066666666666676|
|           Gambia| 0.02133333333333365|
|           Angola|0.021999999999999353|
|          Finland|0.022071005917159672|
|           France|0.022488888888889016|
|       Mauritania|  0.0228165680473371|
|          Algeria|0.022906249999999462|
+-----------------+--------------------+
only showing top

In [55]:
df.select('Country name','year','Log GDP per capita').filter(col("Country name")=="Cuba").show()
#Cuba has only one data point(i.e year 2006 ). The more the data point more accurate would results be.

+------------+----+------------------+
|Country name|year|Log GDP per capita|
+------------+----+------------------+
|        Cuba|2006|               0.0|
+------------+----+------------------+



**Calculate Percentage change of Life Ladder w.r.t to previous year and find the year in which there was positive % change in Life Ladder by Country.**

In [56]:
from pyspark.sql.functions import lag,isnull,round
w = Window.partitionBy('Country name').orderBy("year")

df1=df.withColumn("prev_value", lag(col('Life Ladder')).over(w))\
    .withColumn("diff", when(isnull(col('Life Ladder') - col('prev_value')), 0)
                              .otherwise(col('Life Ladder') - col('prev_value')))\
    .withColumn('Perctage Change in LifeLadder',round((col('diff')/col('Life Ladder'))*100,2 ))\
    .select('Country name','year','Life Ladder','diff','Perctage Change in LifeLadder')
# df1.show()

w = Window.partitionBy('Country name').orderBy(desc("Perctage Change in LifeLadder"))
df1.withColumn('rn',rank().over(w)).filter('rn==1').drop('rn','diff').show()

#Life Ladder improved most in below years

+--------------------+----+-----------+-----------------------------+
|        Country name|year|Life Ladder|Perctage Change in LifeLadder|
+--------------------+----+-----------+-----------------------------+
|         Afghanistan|2015|      3.983|                        21.39|
|             Albania|2009|      5.485|                        15.52|
|             Algeria|2014|      6.355|                         11.8|
|              Angola|2011|      5.589|                          0.0|
|           Argentina|2009|      6.424|                         7.21|
|             Armenia|2018|      5.062|                        15.29|
|           Australia|2010|       7.45|                         2.63|
|             Austria|2017|      7.294|                         3.37|
|          Azerbaijan|2013|      5.481|                         10.4|
|             Bahrain|2013|       6.69|                        24.86|
|          Bangladesh|2019|      5.114|                        12.03|
|             Belaru