In [None]:
from pyspark.ml import Pipeline

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:

# SparkSession is not among the notebook's imports; import it here so this
# cell does not depend on a pre-configured PySpark shell having defined it.
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one under
# this application name.
spark = (
    SparkSession.builder
    .appName("Python Spark gbr ML example")
    .getOrCreate()
)

In [None]:
# Load and parse the data file, converting it to a DataFrame.
# Uses Spark's built-in CSV reader (available wherever SparkSession is)
# instead of the legacy external "com.databricks.spark.csv" package name.
#  - header=True: the first row supplies the column names.
#  - inferSchema=True: column types are inferred from the data.
data = spark.read.csv("adult5.csv", header=True, inferSchema=True)

In [None]:
# Preview the first rows of the loaded DataFrame.
data.show()

In [None]:

# Input columns that are string-indexed and one-hot encoded before assembly.
categoricalColumns = [
    "workclass",
    "occupation",
]

In [None]:
# Build the per-column preprocessing stages.  Each categorical column gets a
# StringIndexer that writes "<col>Index", followed by a OneHotEncoder that
# turns that index column into the vector column "<col>classVec".
stages = []
for column_name in categoricalColumns:
    index_col = column_name + "Index"
    stages.append(StringIndexer(inputCol=column_name, outputCol=index_col))
    stages.append(
        OneHotEncoder(inputCol=index_col, outputCol=column_name + "classVec")
    )

In [None]:
# Index the "income" column into the "label" column used as the target.
label_stringIdx = StringIndexer(
    inputCol="income",
    outputCol="label",
)

In [None]:
# The label indexer runs after the categorical indexers/encoders.
stages.append(label_stringIdx)

In [None]:
# Columns passed to the feature assembler as-is, without indexing/encoding.
numericCols = [
    "age",
    "hours_per_week",
]

In [None]:
# Names of the one-hot vector columns produced for the categorical inputs
# ("<col>classVec").  Built eagerly as a list: in Python 3 `map` returns a
# one-shot iterator, which displays as an opaque `<map object>` and yields
# nothing after its first traversal.
assemblerInputs = [col_name + "classVec" for col_name in categoricalColumns]

In [None]:
# Bare expression: the notebook displays the current value of assemblerInputs.
assemblerInputs

In [None]:
# Full assembler input list: encoded categorical columns first, then the
# numeric columns.
assemblerInputs = [*assemblerInputs, *numericCols]

In [None]:
# Combine all input columns into the single vector column "features".
assembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

In [None]:
# The assembler is the final preprocessing stage.
stages.append(assembler)

In [None]:
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.

In [None]:
# Chain the indexers, encoders, label indexer and assembler into one pipeline.
pipeline = Pipeline(stages=stages)

In [None]:
# Fit every preprocessing stage on the loaded data, then apply the fitted
# stages to the same data; the result carries the stages' output columns
# (the "...Index" / "...classVec" columns, "label" and "features").
pipelineModel = pipeline.fit(data)
dataset = pipelineModel.transform(data)

In [None]:
# Inspect the schema after the preprocessing pipeline has been applied.
dataset.printSchema()

In [None]:
# Column names of the originally loaded data, kept so they can be selected
# alongside "label" and "features".
cols = data.columns

In [None]:
# Keep only the target, the assembled feature vector and the original input
# columns; the intermediate index/one-hot columns are dropped.
selectedcols = ["label", "features", *cols]
dataset = dataset.select(*selectedcols)

In [None]:
from pyspark.ml.feature import VectorIndexer

In [None]:
# Show the schema of the trimmed dataset.
dataset.printSchema()

# Automatically identify categorical features, and index them.
# maxCategories=4: features with more than 4 distinct values are treated as
# continuous.  The indexer is fitted on the whole dataset.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(dataset)
print(featureIndexer)



In [None]:
# Apply the fitted indexer, adding the "indexedFeatures" column.
featuredf = featureIndexer.transform(dataset)

In [None]:
# Preview the rows after feature indexing.
featuredf.show()

In [None]:
# Randomly split the rows into training (70%) and held-out test (30%) sets,
# using a fixed seed for the split.
trainingData, testData = featuredf.randomSplit([0.7, 0.3], seed=100)

In [None]:
# Preview the training split.
trainingData.show()

In [None]:
# Preview the held-out test split.
testData.show()

In [None]:

from pyspark.ml.regression import GBTRegressor

In [None]:
# Configure a gradient-boosted-trees regressor that reads the
# "indexedFeatures" column, with maxIter set to 10.  Nothing is trained yet.
gbt = GBTRegressor(
    featuresCol="indexedFeatures",
    maxIter=10,
)

In [None]:
# Chain indexer and GBT in a Pipeline.
# This rebinds `pipeline`, replacing the preprocessing pipeline built earlier.
# NOTE(review): featureIndexer was already applied to produce featuredf, from
# which trainingData/testData are split, so those frames already contain
# "indexedFeatures"; running the indexer again as a stage here looks
# redundant -- confirm which of the two applications is intended.
pipeline = Pipeline(stages=[featureIndexer, gbt])


In [None]:
# Train model.  Fitting the pipeline runs the indexer stage and then fits the
# GBT regressor, using the training split only.
model = pipeline.fit(trainingData)

In [None]:
# Make predictions by applying the fitted pipeline to the held-out test split.
predictions = model.transform(testData)

In [None]:
# Select example rows to display: the first 5 predictions next to the true
# label and the feature vector.
predictions.select("prediction", "label", "features").show(5)

In [None]:
# Compare the "prediction" column against the true "label" column and compute
# the root-mean-squared error on the test predictions.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

In [None]:
# Report the test-set RMSE; "%g" prints the number in compact general format.
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
# Stage 0 of the fitted pipeline is the feature indexer; stage 1 is the
# fitted GBT regressor.
gbtModel = model.stages[1]

In [None]:
# Print the fitted GBT stage (its short summary representation).
print(gbtModel)  # summary only