In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from operator import add
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.regression import LinearRegression

# Start (or reuse) the Spark session used by the rest of the script.
session_builder = SparkSession.builder.appName("LinearRegressionWithElasticNet")
spark = session_builder.getOrCreate()

# Read the CSV with its first row as the header and with schema inference on.
csv_reader = spark.read.option("inferSchema", "true").option("header", "true")
ad_data = csv_reader.csv("adult6.csv")

# Expose the data as the temporary view "adult" and read it back as a table.
ad_data.createOrReplaceTempView("adult")
dataset = spark.table("adult")

# Keep the original column names so they can be re-selected after the
# feature transformations add their own columns.
cols = dataset.columns
#print cols


# Categorical feature columns. Each one is indexed into a "<column>Index"
# column and then one-hot encoded into a "<column>classVec" vector column.
categoricalColumns = ["workclass"]
stages = []
for categoricalCol in categoricalColumns:
    # The output column name is the input name with "Index" appended,
    # e.g. "workclass" -> "workclassIndex".
    stringIndexer = StringIndexer(
        inputCol=categoricalCol, outputCol=categoricalCol + "Index"
    )
    # The encoder consumes the index column produced just above.
    encoder = OneHotEncoder(
        inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec"
    )
    stages += [stringIndexer, encoder]

# Convert the "income" string column into numeric label indices.
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages += [label_stringIdx]

# Assemble the encoded categorical vectors and the numeric columns into a
# single "features" vector column.
numericCols = ["age", "hours_per_week"]
# Build a real list here: on Python 3 map() returns an iterator, so
# `map(...) + numericCols` raises TypeError. A list comprehension behaves
# the same on Python 2 and Python 3.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
#print stages
#print assembler

# Create a Pipeline.
pipeline = Pipeline(stages=stages)
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
pipelineModel = pipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
dataset = dataset.select(selectedcols)

# we can use print dataset
### Randomly split data into training and test sets. set seed for reproducibility
# 70/30 random split into training and test sets; the fixed seed keeps the
# split reproducible between runs.
splits = dataset.randomSplit([0.7, 0.3], seed=100)
trainingData, testData = splits

# Linear regression configured with regularisation and elastic-net mixing
# parameters, fitted on the training split only.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(trainingData)

# Report the fitted coefficients and intercept.
coefficients_text = str(lrModel.coefficients)
intercept_text = str(lrModel.intercept)
print("Coefficients: %s" % coefficients_text)
print("Intercept: %s" % intercept_text)

# Training-set summary: iteration count, objective history, residuals,
# and the RMSE / r2 metrics.
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

# Apply the fitted model to the held-out rows and display the result.
dataset = lrModel.transform(testData)
dataset.show()

# Release the Spark session.
spark.stop()

Coefficients: [0.0,-0.36603121543,0.0225539737063,0.0120457731673]
Intercept: -0.43834917889
numIterations: 11
objectiveHistory: [0.4999999999999999, 0.4850034222802328, 0.45148643215923123, 0.45096736920348357, 0.45069319592989965, 0.4506884629300494, 0.450688302072468, 0.4506882541560428, 0.45068825023326803, 0.45068824911749356, 0.45068824892020054]
+-------------------+
|          residuals|
+-------------------+
|-0.5570555069166132|
|-0.4417654113732035|
|-0.8856666380752383|
|0.02411746709945639|
|  0.723482753634509|
| 1.1368873356310882|
+-------------------+

RMSE: 0.719536
r2: 0.357298
+-----+-------------------+---+---------+--------------+------+------------------+
|label|           features|age|workclass|hours_per_week|income|        prediction|
+-----+-------------------+---+---------+--------------+------+------------------+
|  1.0|[1.0,0.0,37.0,50.0]| 37|  Private|            50|  =50K|0.9984365066068699|
+-----+-------------------+---+---------+--------------+------+-