In [1]:
# Locate the local Spark installation and make PySpark importable;
# this has to run before any `pyspark` import below.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session for this notebook.
# "local[4]" runs Spark inside this process with four worker threads.
spark = (
    SparkSession.builder
    .appName("MultiLinearReg")
    .master("local[4]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [4]:
import os

# Location of the WHO life-expectancy CSV. The original machine-specific path is
# kept as the default so existing runs behave the same; set the
# LIFE_EXPECTANCY_CSV environment variable to point at the file elsewhere.
DATA_PATH = os.environ.get(
    "LIFE_EXPECTANCY_CSV",
    "C:/Users/Rulokat/Desktop/GitHub/apache-spark/Mllib-ApacheSpark/Life Expectancy Data.csv",
)

# Read the CSV with a header row and let Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load(DATA_PATH)
)

In [5]:
# Preview the first five rows. Apply limit() before toPandas() so only those
# rows are collected to the driver instead of the entire DataFrame.
df.limit(5).toPandas()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [6]:
# Show the inferred schema. Several raw column names carry stray leading or
# trailing spaces (e.g. "Life expectancy ", " BMI "), which is why they are renamed next.
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life expectancy : double (nullable = true)
 |-- Adult Mortality: integer (nullable = true)
 |-- infant deaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage expenditure: double (nullable = true)
 |-- Hepatitis B: integer (nullable = true)
 |-- Measles : integer (nullable = true)
 |--  BMI : double (nullable = true)
 |-- under-five deaths : integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total expenditure: double (nullable = true)
 |-- Diphtheria : integer (nullable = true)
 |--  HIV/AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |--  thinness  1-19 years: double (nullable = true)
 |--  thinness 5-9 years: double (nullable = true)
 |-- Income composition of resources: double (nullable = true)
 |-- Schooling: double (nullable = true)



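Fixing column names: the raw headers contain spaces and stray padding, so every column is renamed to a clean identifier (the target column `Life expectancy ` becomes `label`).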

In [7]:
# Clean replacement names, listed in the same order as the raw CSV columns.
# The trailing comment on each entry is the raw header it replaces.
new_cols = [
    "Country",                       # Country
    "Year",                          # Year
    "Status",                        # Status
    "label",                         # "Life expectancy " (regression target)
    "AdultMortality",                # Adult Mortality
    "InfantDeaths",                  # infant deaths
    "Alcohol",                       # Alcohol
    "PercentageExpenditure",         # percentage expenditure
    "HepatitisB",                    # Hepatitis B
    "Measles",                       # "Measles "
    "BMI",                           # " BMI "
    "UnderFiveDeaths",               # "under-five deaths "
    "Polio",                         # Polio
    "TotalExpenditure",              # Total expenditure
    "Diphtheria",                    # "Diphtheria "
    "HIV_AIDS",                      # " HIV/AIDS"
    "GDP",                           # GDP
    "Population",                    # Population
    "Thinness119",                   # " thinness  1-19 years"
    "Thinness59",                    # " thinness 5-9 years"
    "IncomeCompositionOfResources",  # Income composition of resources
    "Schooling",                     # Schooling
]

In [8]:
# toDF assigns the new names by position, so new_cols must list exactly one
# name per existing column, in the original column order.
df2 = df.toDF(*new_cols)

In [9]:
# Confirm the rename: same column types as before, now with clean names.
df2.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- label: double (nullable = true)
 |-- AdultMortality: integer (nullable = true)
 |-- InfantDeaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- PercentageExpenditure: double (nullable = true)
 |-- HepatitisB: integer (nullable = true)
 |-- Measles: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- UnderFiveDeaths: integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- TotalExpenditure: double (nullable = true)
 |-- Diphtheria: integer (nullable = true)
 |-- HIV_AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- Thinness119: double (nullable = true)
 |-- Thinness59: double (nullable = true)
 |-- IncomeCompositionOfResources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [10]:
# String-typed columns. Only "Status" is encoded later on; "Country" is left out
# of the model.
categorical_cols = ["Country", "Status"]

# Numeric feature columns fed to the model.
# Numeric columns deliberately left out of this list:
#   PercentageExpenditure, HepatitisB, Measles, Polio,
#   Population, Thinness119, Thinness59
# NOTE(review): the reason for excluding them is not recorded in this notebook.
numerical_cols = [
    "Year",
    "AdultMortality",
    "InfantDeaths",
    "Alcohol",
    "BMI",
    "UnderFiveDeaths",
    "TotalExpenditure",
    "Diphtheria",
    "HIV_AIDS",
    "GDP",
    "IncomeCompositionOfResources",
    "Schooling",
]

# Regression target column (life expectancy, renamed to "label").
label = ["label"]

##### Dropping rows with missing values

In [11]:
# Drop every row that has a null in any column.
# NOTE(review): this also discards rows whose only nulls are in columns that are
# not used as features (e.g. Population, HepatitisB); consider restricting the
# drop with `subset=` to the modelled columns to keep more data.
df3 = df2.na.drop()

In [12]:
# Number of rows left after dropping incomplete records.
df3.count()

1649

#### Data Preprocessing

In [13]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

# Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder and removed the old
# name. Fall back to the new class under the old alias so the rest of the
# notebook runs unchanged on both Spark 2.x and 3.x.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

#### StringIndexer

In [14]:
# "Country" has too many distinct categories, so it is left out of the analysis;
# only "Status" is indexed.
status_string_indexer = StringIndexer(inputCol="Status", outputCol="StatusIndexed")

#### OneHotEncoder

In [15]:
# One-hot encode the indexed Status column produced by the StringIndexer stage.
encoder = OneHotEncoderEstimator(
    inputCols=["StatusIndexed"],
    outputCols=["StatusEncoded"],
)

#### VectorAssembler

In [16]:
# Combine the numeric columns and the encoded Status column into a single
# "features" vector (numeric columns first, encoded columns last).
vector_assembler = VectorAssembler(
    inputCols=numerical_cols + encoder.getOutputCols(),
    outputCol="features",
)

#### Linear Model

In [17]:
# Linear regression estimator: predicts "label" from the assembled "features" vector.
linear_regression_object = LinearRegression(featuresCol="features", labelCol="label")

#### Pipeline 

In [18]:
# Chain the preprocessing stages and the estimator; stages run in list order.
pipeline_object = Pipeline(
    stages=[
        status_string_indexer,
        encoder,
        vector_assembler,
        linear_regression_object,
    ]
)

#### Split Data Set

In [19]:
# Split roughly 80/20 into train and test sets; the fixed seed makes the split reproducible.
train_df, test_df = df3.randomSplit([0.8, 0.2], seed=142)
# Mark both splits for caching so repeated use does not re-read and re-split the CSV.
train_df.cache()
test_df.cache()

DataFrame[Country: string, Year: int, Status: string, label: double, AdultMortality: int, InfantDeaths: int, Alcohol: double, PercentageExpenditure: double, HepatitisB: int, Measles: int, BMI: double, UnderFiveDeaths: int, Polio: int, TotalExpenditure: double, Diphtheria: int, HIV_AIDS: double, GDP: double, Population: double, Thinness119: double, Thinness59: double, IncomeCompositionOfResources: double, Schooling: double]

In [20]:
# Fit all pipeline stages (indexer, encoder, assembler, regression) on the training split only.
pipeline_model = pipeline_object.fit(train_df)

In [21]:
# Compare actual and predicted life expectancy for ten rows.
# These are TRAINING rows, so this is a sanity check rather than an evaluation.
# limit() is applied before toPandas() so only ten rows are collected to the
# driver instead of the whole transformed DataFrame.
(
    pipeline_model.transform(train_df)
    .select("label", "prediction")
    .limit(10)
    .toPandas()
)

Unnamed: 0,label,prediction
0,54.8,57.700099
1,55.3,58.148592
2,56.7,59.278697
3,57.0,59.166807
4,57.3,60.978716
5,57.5,61.274751
6,58.8,62.605374
7,59.2,62.829517
8,59.5,63.313159
9,59.9,63.345771


#### Inspecting the fitted linear regression model from the pipeline

In [22]:
# The linear regression estimator was the last pipeline stage, so the last
# fitted stage is the trained regression model.
lr_model = pipeline_model.stages[-1]

In [23]:
# One coefficient per slot of the "features" vector: the 12 numeric columns in
# numerical_cols order, followed by the encoded Status slot.
lr_model.coefficients

DenseVector([-0.1306, -0.0167, 0.0884, -0.0933, 0.0335, -0.0673, 0.1014, 0.0144, -0.4472, 0.0001, 9.6162, 0.9347, -0.7855])

In [24]:
# Fitted intercept term of the regression.
lr_model.intercept

315.44452546889534

In [25]:
# R² from the training summary, i.e. measured on the data the model was fit on,
# not on the held-out test split.
lr_model.summary.r2

0.8393963848802052

In [26]:
# Root mean squared error from the training summary (same units as the label).
lr_model.summary.rootMeanSquaredError

3.534708903926837

In [27]:
# Two-sided p-values, in coefficient order. The list has one more entry than
# `coefficients`; with an intercept fitted, the last entry belongs to the intercept.
lr_model.summary.pValues

[3.7210678383026163e-07,
 0.0,
 1.554312234475219e-14,
 0.011812369488869878,
 6.888301440355349e-08,
 4.218847493575595e-15,
 0.02289647691655916,
 0.0037352078727632687,
 0.0,
 7.28083149326153e-10,
 0.0,
 0.0,
 0.036279656005715255,
 9.313896320861659e-10]

In [28]:
# t-statistics, in the same order as pValues (coefficients first, intercept last).
lr_model.summary.tValues

[-5.109107918926713,
 -15.866071883283482,
 7.772887926975958,
 -2.5212668081293517,
 5.425509356666603,
 -7.94491898902853,
 2.2778692939988194,
 2.9049666429516243,
 -23.329980113514587,
 6.2064005864501715,
 10.722280046275595,
 14.333708382376306,
 -2.0959542172853327,
 6.166413827359158]