In [None]:
!pip install pyspark

In [None]:
# Importing SparkContext/SparkConf and the SparkSession/DataFrame classes
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, DataFrame
# importing the SQL context
from pyspark.sql import SQLContext
from pyspark.sql.types import *
#importing the VectorAssembler 
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
# Importing the StringIndexer to convert categorical string values to numeric indices
from pyspark.ml.feature import StringIndexer
# Importing DecisionTreeRegressor to fit the regression model and produce predictions
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline



# Spark configuration: run locally (master 'local') under the app name '5433'.
conf = SparkConf().setMaster('local').setAppName('5433')

# SparkSession is the unified entry point since Spark 2.0; constructing a
# standalone SQLContext is deprecated as of Spark 3.0, so build (or reuse) a
# session from the same configuration instead.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
# Backward-compatible alias: a SparkSession exposes the commonly used
# SQLContext entry points (.read, .sql, .createDataFrame), so later code that
# still refers to `sqlContext` keeps working without the deprecated class.
sqlContext = spark

# Load the CSV file from HDFS; the first row holds the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "hdfs:///user/amita/spark/dataset.csv", header=True, inferSchema=True)

# Show the first rows of the loaded data.
df.show()
# Categorical (string) columns to encode, listed in the order in which their
# indices enter the feature vector.
categorical_cols = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]
# Each indexed column gets an explicit "_index" suffix.  Spark SQL resolves
# column names case-insensitively by default, so an output name that differs
# from its input only by letter case (e.g. "gender" -> "Gender") collides with
# the raw column; depending on the Spark version that can shadow the raw
# column or make later references ambiguous.
indexed_cols = [name + "_index" for name in categorical_cols]

# One StringIndexer per categorical column: each maps the distinct string
# labels of its input column to numeric label indices.
indexers = [
    StringIndexer(inputCol=raw, outputCol=indexed)
    for raw, indexed in zip(categorical_cols, indexed_cols)
]
# Keep the individual indexer names bound for any code that refers to them.
(gender_indexer, race_indexer, parental_indexer,
 lunch_indexer, test_indexer) = indexers

# Fit and apply all indexers in sequence as a single Pipeline; this is
# equivalent to calling fit(df).transform(df) for each indexer in turn.
df = Pipeline(stages=indexers).fit(df).transform(df)

# Display the data including the newly added index columns.
df.show()

# Name of the assembled feature-vector column.
vector_features_col = 'features'
# Combine the two numeric score columns and the five indexed categorical
# columns into one vector column for the regressor.
vectorAssembler = VectorAssembler(
    inputCols=['reading score', 'math score'] + indexed_cols,
    outputCol=vector_features_col)
# Keep only the feature vector and the label ('writing score').
df_vector1 = vectorAssembler.transform(df).select(
    [vector_features_col, 'writing score'])
# Evaluator used to score the regression predictions below.
from pyspark.ml.evaluation import RegressionEvaluator

# Hold out 20% of the rows so the reported error is measured on data the tree
# has not been fitted on (the model was previously evaluated on its own
# training rows, which makes the "test data" figure over-optimistic).  The
# fixed seed keeps the split repeatable for the same input.
train_df, test_df = df_vector1.randomSplit([0.8, 0.2], seed=42)

# Decision-tree regressor predicting 'writing score' from the feature vector.
dt = DecisionTreeRegressor(
    featuresCol=vector_features_col, labelCol='writing score')
# Fit on the training split only.
dt_model = dt.fit(train_df)
# Predict on the held-out split; adds a 'prediction' column.
dt_predictions = dt_model.transform(test_df)
# Show the prediction output.
dt_predictions.show()

# Root-mean-squared error between 'writing score' and 'prediction'.
# Note: RMSE is an error measure (lower is better), not an accuracy.
dt_evaluator = RegressionEvaluator(
    labelCol="writing score", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
# Print the held-out RMSE.
print("accuracy (RMSE) on test data = %g" % rmse)
# Summary statistics of the label and prediction columns on the test split.
dt_predictions.describe().show()





