# Import Libraries

In [143]:
import pyspark
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Obtain Data

Create sparksession

In [131]:
spark = pyspark.sql.SparkSession.builder.master('local').getOrCreate()

Read df into pyspark ml

In [132]:
df = spark.read.csv('excel/scrubbed_dataset.csv', header='true', inferSchema='true')
print(df.columns)
df.show(5, truncate=True, vertical=True)

['Access to electricity (% of population)', 'Current health expenditure per capita, PPP (current international $)', 'GDP per capita (constant LCU)', 'Government expenditure on education, total (% of GDP)', 'Intentional homicides (per 100,000 people)', 'Life expectancy at birth, total (years)', 'Out-of-pocket expenditure (% of current health expenditure)', 'Suicide mortality rate (per 100,000 population)', 'Unemployment, total (% of total labor force) (modeled ILO estimate)', 'Urban population (% of total population)', 'Country']
-RECORD 0-----------------------------------------------------------------------------------
 Access to electricity (% of population)                              | 98.7132034301758    
 Current health expenditure per capita, PPP (current international $) | 186.4072876         
 GDP per capita (constant LCU)                                        | 34696.1279892983    
 Government expenditure on education, total (% of GDP)                | 4.0588698387146005  


# Explore Data

Calculate descriptive statistics of life expectancy dataset

In [133]:
df.describe().show(vertical=True)

-RECORD 0----------------------------------------------------------------------------------
 summary                                                              | count              
 Access to electricity (% of population)                              | 174                
 Current health expenditure per capita, PPP (current international $) | 174                
 GDP per capita (constant LCU)                                        | 174                
 Government expenditure on education, total (% of GDP)                | 174                
 Intentional homicides (per 100,000 people)                           | 174                
 Life expectancy at birth, total (years)                              | 174                
 Out-of-pocket expenditure (% of current health expenditure)          | 174                
 Suicide mortality rate (per 100,000 population)                      | 174                
 Unemployment, total (% of total labor force) (modeled ILO estimate)  | 174     

Assemble features into a single column

In [134]:
features = df.columns
features.remove('Life expectancy at birth, total (years)')
features.remove('Country')
assembler = VectorAssembler(inputCols=features, outputCol='Features')
assembled_df = assembler.transform(df)
assembled_df.show(5, truncate=True, vertical=True)

-RECORD 0------------------------------------------------------------------------------------
 Access to electricity (% of population)                              | 98.7132034301758     
 Current health expenditure per capita, PPP (current international $) | 186.4072876          
 GDP per capita (constant LCU)                                        | 34696.1279892983     
 Government expenditure on education, total (% of GDP)                | 4.0588698387146005   
 Intentional homicides (per 100,000 people)                           | 6.6555611518         
 Life expectancy at birth, total (years)                              | 64.486               
 Out-of-pocket expenditure (% of current health expenditure)          | 78.38278198          
 Suicide mortality rate (per 100,000 population)                      | 4.7                  
 Unemployment, total (% of total labor force) (modeled ILO estimate)  | 11.163999557495101   
 Urban population (% of total population)                   

Calculate correlations between features

In [135]:
correlations = Correlation.corr(assembled_df, 'Features').collect()[0]["pearson({})".format('Features')].values
high_corr = max([x for x in list(correlations) if x != 1])
print('Highest Correlation: ', high_corr)
correlations

Highest Correlation:  0.5258535033950555


array([ 1.        ,  0.30516997,  0.08915253,  0.18320509, -0.0640105 ,
       -0.11394804,  0.18294923,  0.07655328,  0.5258535 ,  0.30516997,
        1.        , -0.07220666,  0.22387894, -0.23180453, -0.35267823,
        0.30089646, -0.15024685,  0.50797783,  0.08915253, -0.07220666,
        1.        , -0.06007638, -0.01941469,  0.06826174, -0.04928279,
       -0.07814293,  0.00901303,  0.18320509,  0.22387894, -0.06007638,
        1.        ,  0.11107473, -0.33194147,  0.15054383,  0.13695932,
        0.19695017, -0.0640105 , -0.23180453, -0.01941469,  0.11107473,
        1.        ,  0.02263126, -0.0920256 ,  0.17096494, -0.08077984,
       -0.11394804, -0.35267823,  0.06826174, -0.33194147,  0.02263126,
        1.        , -0.18222998, -0.00574389, -0.2442824 ,  0.18294923,
        0.30089646, -0.04928279,  0.15054383, -0.0920256 , -0.18222998,
        1.        , -0.03023459,  0.17683999,  0.07655328, -0.15024685,
       -0.07814293,  0.13695932,  0.17096494, -0.00574389, -0.03

# Model Data

Create df with columns needed for regression model only

In [136]:
regression_df = assembled_df.select('Features', 'Life expectancy at birth, total (years)')
regression_df = regression_df.withColumnRenamed('Features', 'features')
regression_df = regression_df.withColumnRenamed('Life expectancy at birth, total (years)', 'label')
regression_df.show(5, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------
 features | [98.7132034301758,186.4072876,34696.1279892983,4.0588698387146005,6.6555611518,78.38278198,4.7,11.163999557495101,25.754]            
 label    | 64.486                                                                                                                               
-RECORD 1----------------------------------------------------------------------------------------------------------------------------------------
 features | [100.0,697.30487061,541581.745631168,3.61172008514404,2.2894924438,44.58411789,6.3,12.812999725341802,61.229]                        
 label    | 78.458                                                                                                                               
-RECORD 2-------------------------------------------------------------------------------------------------------------------

Train test split data

In [137]:
train, test = regression_df.randomSplit([0.8, 0.2], seed=97148)
print((train.count(), len(train.columns)))
print((test.count(), len(test.columns)))
train.show(5, truncate=False, vertical=True)

(125, 2)
(49, 2)
-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------
 features | [11.7593555450439,79.01350403,338971.14757147396,2.45481991767883,18.14757598704,61.90134811,8.8,1.942999958992,23.279]           
 label    | 53.977                                                                                                                            
-RECORD 1-------------------------------------------------------------------------------------------------------------------------------------
 features | [14.4,111.71577454,437009.138356661,5.38379001617432,1.2504405198000001,35.83074951,7.7,6.41400003433228,29.98]                   
 label    | 61.174                                                                                                                            
-RECORD 2--------------------------------------------------------------------------------------------------------------------

Build linear regression model

In [138]:
lr = LinearRegression(maxIter=10)
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [.5, 0.1, 0.01, 0.0]).addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]).build()
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.8)
model = tvs.fit(train)
predictions = model.transform(test).select("features", "label", "prediction")
predictions.show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[11.0242519378661...|            61.247| 56.56954844784067|
|[18.02,119.512825...|            63.798| 58.54127752346385|
|[25.9078121185303...|            66.681|60.148265508165395|
|[25.9182662963867...|             63.73| 72.24751127604908|
|[26.1,257.3265380...|            54.309| 59.97796985238802|
|[31.1,117.7957382...|60.163000000000004| 60.99931129600803|
|[32.4203987121581...|            52.805|59.943280621947615|
|[41.529239654541,...|             61.47|62.914721203369645|
|[43.2592582702637...|            60.782| 63.39966901577246|
|[44.9799575805664...|             66.24| 62.53422216848385|
|[49.6155433654784...|            65.941|63.679545524682446|
|[62.6600723266602...| 58.92100000000001| 66.17426883956355|
|[85.5918350219727...|             69.26| 71.47952360244952|
|[91.88720703125,3...| 7

Evaluate linear regresion model performance

In [139]:
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
print('rmse: ', evaluator.evaluate(predictions,{evaluator.metricName: 'rmse'}))
print('mae: ', evaluator.evaluate(predictions,{evaluator.metricName: 'mae'}))
print('r2: ', evaluator.evaluate(predictions,{evaluator.metricName: 'r2'}))

rmse:  3.8937656483610272
mae:  3.1618810851372987
r2:  0.7399775322095343


View regression model coefficients

In [140]:
model.bestModel.coefficients

DenseVector([0.1827, 0.001, 0.0, 0.0, -0.0611, -0.0272, 0.0, -0.038, 0.0288])

Build random forest regression model

In [147]:
rf = RandomForestRegressor()
rfParamGrid = ParamGridBuilder().addGrid(rf.maxDepth, range(0,11)).addGrid(rf.numTrees, [100,200]).build()
rfTvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=rfParamGrid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.8,
                           parallelism = 4)
rfModel = rfTvs.fit(train)
rfPredictions = rfModel.transform(test).select("features", "label", "prediction")
rfPredictions.show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[11.0242519378661...|            61.247|  61.7123134848485|
|[18.02,119.512825...|            63.798| 63.62545499999997|
|[25.9078121185303...|            66.681| 61.99865681818184|
|[25.9182662963867...|             63.73| 69.31822178861789|
|[26.1,257.3265380...|            54.309|61.504581818181805|
|[31.1,117.7957382...|60.163000000000004|63.431161818181835|
|[32.4203987121581...|            52.805| 60.34407181818184|
|[41.529239654541,...|             61.47| 62.76333862950059|
|[43.2592582702637...|            60.782| 63.98898033333336|
|[44.9799575805664...|             66.24|62.474203897546914|
|[49.6155433654784...|            65.941| 61.52056174603175|
|[62.6600723266602...| 58.92100000000001| 63.72685926829265|
|[85.5918350219727...|             69.26|  68.8248061382114|
|[91.88720703125,3...| 7

Evaluate random forest regression model

In [150]:
rfEvaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
print('rmse: ', rfEvaluator.evaluate(rfPredictions,{rfEvaluator.metricName: 'rmse'}))
print('mae: ', rfEvaluator.evaluate(rfPredictions,{rfEvaluator.metricName: 'mae'}))
print('r2: ', rfEvaluator.evaluate(rfPredictions,{rfEvaluator.metricName: 'r2'}))

rmse:  3.456513103262068
mae:  2.682687631601607
r2:  0.7950973042267448


View random forest regression coefficients

In [152]:
rfModel.bestModel.featureImportances

SparseVector(9, {0: 0.3713, 1: 0.267, 2: 0.0226, 3: 0.0238, 4: 0.0689, 5: 0.0618, 6: 0.0398, 7: 0.0386, 8: 0.1062})