# Import Libraries

In [22]:
import pyspark
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Obtain Data

Create a local SparkSession

In [4]:
# Reuse the active SparkSession if one exists; otherwise start one against the
# 'local' master.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local')
    .getOrCreate()
)

Read the scrubbed dataset CSV into a Spark DataFrame

In [10]:
# Load the CSV, taking column names from the first row and letting Spark infer
# each column's type.
df = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('excel/scrubbed_dataset.csv')
)

# List the column names, then preview the first five rows in vertical
# (one field per line) layout.
print(df.columns)
df.show(n=5, truncate=True, vertical=True)

['Access to electricity (% of population)', 'Current health expenditure per capita, PPP (current international $)', 'GDP per capita (constant LCU)', 'Government expenditure on education, total (% of GDP)', 'Intentional homicides (per 100,000 people)', 'Life expectancy at birth, total (years)', 'Out-of-pocket expenditure (% of current health expenditure)', 'Suicide mortality rate (per 100,000 population)', 'Unemployment, total (% of total labor force) (modeled ILO estimate)', 'Urban population (% of total population)', 'Country']
-RECORD 0-----------------------------------------------------------------------------------
 Access to electricity (% of population)                              | 98.7132034301758    
 Current health expenditure per capita, PPP (current international $) | 186.4072876         
 GDP per capita (constant LCU)                                        | 34696.1279892983    
 Government expenditure on education, total (% of GDP)                | 4.0588698387146005  


# Explore Data

Calculate descriptive statistics of the life expectancy dataset

In [11]:
# Build the summary-statistics DataFrame for df, then print it with one field
# per line so the long column names stay readable.
summary_stats = df.describe()
summary_stats.show(vertical=True)

-RECORD 0----------------------------------------------------------------------------------
 summary                                                              | count              
 Access to electricity (% of population)                              | 174                
 Current health expenditure per capita, PPP (current international $) | 174                
 GDP per capita (constant LCU)                                        | 174                
 Government expenditure on education, total (% of GDP)                | 174                
 Intentional homicides (per 100,000 people)                           | 174                
 Life expectancy at birth, total (years)                              | 174                
 Out-of-pocket expenditure (% of current health expenditure)          | 174                
 Suicide mortality rate (per 100,000 population)                      | 174                
 Unemployment, total (% of total labor force) (modeled ILO estimate)  | 174     

Assemble features into a single column

In [28]:
# Start from every column in df, then drop the two that are not model inputs:
# the life-expectancy column and the country name.
features = list(df.columns)
for excluded_col in ('Life expectancy at birth, total (years)', 'Country'):
    # list.remove raises ValueError if the column is missing from df.
    features.remove(excluded_col)

# Pack the remaining columns into a single vector column named 'Features'.
assembler = VectorAssembler(outputCol='Features', inputCols=features)
assembled_df = assembler.transform(df)

# Preview the first five assembled rows, one field per line.
assembled_df.show(n=5, truncate=True, vertical=True)

-RECORD 0------------------------------------------------------------------------------------
 Access to electricity (% of population)                              | 98.7132034301758     
 Current health expenditure per capita, PPP (current international $) | 186.4072876          
 GDP per capita (constant LCU)                                        | 34696.1279892983     
 Government expenditure on education, total (% of GDP)                | 4.0588698387146005   
 Intentional homicides (per 100,000 people)                           | 6.6555611518         
 Life expectancy at birth, total (years)                              | 64.486               
 Out-of-pocket expenditure (% of current health expenditure)          | 78.38278198          
 Suicide mortality rate (per 100,000 population)                      | 4.7                  
 Unemployment, total (% of total labor force) (modeled ILO estimate)  | 11.163999557495101   
 Urban population (% of total population)                   

Calculate pairwise Pearson correlations between features

In [58]:
# Pearson correlation matrix of the assembled 'Features' vector column.
corr_matrix = Correlation.corr(assembled_df, 'Features').collect()[0]["pearson({})".format('Features')]

# Flattened matrix entries (numRows * numCols values), displayed at the end of
# the cell.
correlations = corr_matrix.values

# Exclude the diagonal (each feature's correlation with itself) by POSITION
# rather than by value. In a flattened n x n matrix the diagonal entries sit at
# indices 0, n + 1, 2 * (n + 1), ..., i.e. the multiples of n + 1.
# Filtering on `x != 1` instead would also discard any off-diagonal pair whose
# correlation is exactly 1 (duplicated / perfectly collinear features) and
# depends on the self-correlations comparing exactly equal to 1.
n_features = corr_matrix.numRows

# max() picks the largest signed value, so this reports the strongest positive
# correlation; strong negative correlations are not reflected here.
high_corr = max(
    value
    for position, value in enumerate(correlations)
    if position % (n_features + 1) != 0
)
print('Highest Correlation: ', high_corr)
correlations

Highest Correlation:  0.5258535033950555


array([ 1.        ,  0.30516997,  0.08915253,  0.18320509, -0.0640105 ,
       -0.11394804,  0.18294923,  0.07655328,  0.5258535 ,  0.30516997,
        1.        , -0.07220666,  0.22387894, -0.23180453, -0.35267823,
        0.30089646, -0.15024685,  0.50797783,  0.08915253, -0.07220666,
        1.        , -0.06007638, -0.01941469,  0.06826174, -0.04928279,
       -0.07814293,  0.00901303,  0.18320509,  0.22387894, -0.06007638,
        1.        ,  0.11107473, -0.33194147,  0.15054383,  0.13695932,
        0.19695017, -0.0640105 , -0.23180453, -0.01941469,  0.11107473,
        1.        ,  0.02263126, -0.0920256 ,  0.17096494, -0.08077984,
       -0.11394804, -0.35267823,  0.06826174, -0.33194147,  0.02263126,
        1.        , -0.18222998, -0.00574389, -0.2442824 ,  0.18294923,
        0.30089646, -0.04928279,  0.15054383, -0.0920256 , -0.18222998,
        1.        , -0.03023459,  0.17683999,  0.07655328, -0.15024685,
       -0.07814293,  0.13695932,  0.17096494, -0.00574389, -0.03