In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.regression import LinearRegression

# Obtain the Spark session for this application, creating one if none exists yet.
spark = (
    SparkSession.builder
    .appName("LinearRegressionWithElasticNet")
    .getOrCreate()
)
	
# Load the CSV into a DataFrame: the first row provides the column names and
# Spark infers each column's type from the data.
ad_data = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("adult_lr_5.csv")
)

# Register the DataFrame as the temporary view "adult" and read it back as a table.
ad_data.createOrReplaceTempView("adult")
dataset = spark.table("adult")

# Keep the original column names; they are selected again after the feature pipeline runs.
cols = dataset.columns
# print(cols)

####### Uncomment the lines below if you would like to check what the dataframe and its columns look like ######

#ad_data.createOrReplaceTempView("adult")
#dataset = spark.table("adult")
#cols = dataset.columns
#print cols

############# Columns ##################

# Build the indexing and one-hot encoding stages for every categorical column.
categoricalColumns = ["workclass"]
stages = []
for col_name in categoricalColumns:
    indexed_name = col_name + "Index"
    # StringIndexer maps each distinct string of the column to a numeric index,
    # written to "<column>Index" (e.g. "workclass" -> "workclassIndex").
    indexer = StringIndexer(inputCol=col_name, outputCol=indexed_name)
    # OneHotEncoder converts that index into a vector stored in "<column>classVec";
    # with its default settings n distinct values occupy n-1 vector slots.
    one_hot = OneHotEncoder(inputCol=indexed_name, outputCol=col_name + "classVec")
    stages.extend([indexer, one_hot])
print(stages)


# Turn the "income" strings into numeric label indices with a StringIndexer.
# By default the indexer orders values by frequency, so the most frequent income
# class becomes 0.0 and the other class becomes 1.0.
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages.append(label_stringIdx)

# Combine the encoded categorical vectors and the numeric columns into a single
# "features" vector column using a VectorAssembler.
numericCols = ["age", "hours_per_week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)
print(stages)
print(assembler)

# Chain all feature stages into one Pipeline.
pipeline = Pipeline(stages=stages)

# fit() learns whatever statistics the stages need (e.g. the string-to-index
# mappings); transform() then appends the derived columns to the data.
pipelineModel = pipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)

# Retain only the label, the assembled features and the original input columns.
selectedcols = ["label", "features"]
selectedcols.extend(cols)
dataset = dataset.select(*selectedcols)

# The dataset can be printed here for inspection.
# Randomly split the rows into roughly 70% training and 30% test data
# (e.g. about 70 and 30 rows out of 100); the fixed seed makes the split reproducible.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

# Linear regression with elastic-net regularisation: at most 10 iterations,
# regularisation strength 0.3, and an elastic-net mixing parameter of 0.8.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train on the training split.
lrModel = lr.fit(trainingData)

# Report the learned coefficients and intercept.
print("Coefficients: %s" % (lrModel.coefficients,))
print("Intercept: %s" % (lrModel.intercept,))

# Report training-set diagnostics: iteration count, objective history,
# residuals, RMSE and r2.
trainingSummary = lrModel.summary
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (trainingSummary.objectiveHistory,))
trainingSummary.residuals.show()
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

# Apply the trained model to the held-out test split and display the result.
dataset = lrModel.transform(testData)
dataset.show()

# Shut down the Spark session now that all work is done.
spark.stop()

[StringIndexer_41bea0f9812ceca3fca9, OneHotEncoder_46349e0e749145854b84]
[StringIndexer_41bea0f9812ceca3fca9, OneHotEncoder_46349e0e749145854b84, StringIndexer_4e2facce7b6750e5e678, VectorAssembler_461c8714299b1b29326f]
VectorAssembler_461c8714299b1b29326f
Coefficients: [0.0,0.0,0.010425881948,0.0]
Intercept: -0.1305446911035797
numIterations: 11
objectiveHistory: [0.5, 0.49248840179791875, 0.46543968467416663, 0.46337810593100537, 0.46303177080326474, 0.4629769269117229, 0.4629682421084831, 0.46296686682665666, 0.46296664904393336, 0.4629666145569551, 0.46296660909577025]
+--------------------+
|           residuals|
+--------------------+
| -0.2760647048701082|
|-0.10925059370141574|
|-0.20308353123380524|
|  0.5883988298053292|
+--------------------+

RMSE: 0.344820
r2: 0.365862
+-----+-------------------+---+---------+------+---------+-------------+---------------+-------+--------------+------+-------------------+
|label|           features|age|workclass|fnlwgt|education|education_