<a href="https://colab.research.google.com/github/amitgaur95/pyspark/blob/main/googleplaystore_randomforestreg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
import pickle

# End-to-end script: load the Play Store CSV, build a feature vector from
# review count, price and two indexed categorical columns, train a random
# forest regressor on the app rating and report RMSE on a held-out split.
spark = SparkSession.builder.appName("PlaystoreRatingPrediction").getOrCreate()

# Load the dataset; column types are inferred from the CSV contents.
data = spark.read.csv('/googleplaystore.csv', header=True, inferSchema=True)

# Target variable and the columns that make up the feature vector.
prediction_value = "Rating"
numeric_columns = ["Reviews", "Price"]
feature_columns = numeric_columns + ["CategoryIndex", "ContentRatingIndex"]

# Convert categorical variables to numerical indices using StringIndexer.
category_indexer = StringIndexer(inputCol="Category", outputCol="CategoryIndex")
content_rating_indexer = StringIndexer(inputCol="Content Rating", outputCol="ContentRatingIndex")

data = category_indexer.fit(data).transform(data)
data = content_rating_indexer.fit(data).transform(data)

# Keep the label as a double: casting it to an integer would truncate the
# fractional part of every rating before the regressor ever sees it.
data = data.withColumn(prediction_value, F.col(prediction_value).cast("double"))

# NOTE(review): a cast to float yields null for values that are not plain
# numbers; those rows are removed by the filter below. Confirm that dropping
# them (rather than parsing them) is the intended behaviour.
for column_name in numeric_columns:
    data = data.withColumn(column_name, F.col(column_name).cast("float"))

# Remove rows with a missing (null or NaN) label or numeric feature BEFORE
# assembling and splitting, so the training and the testing set are cleaned
# identically and the evaluator never sees an undefined label.
for column_name in [prediction_value] + numeric_columns:
    data = data.filter(F.col(column_name).isNotNull() & ~F.isnan(F.col(column_name)))

# Assemble the feature vector.
vector_assembler = VectorAssembler(inputCols=feature_columns,
                                   outputCol="features", handleInvalid="keep")
data = vector_assembler.transform(data)

# Split into training and testing sets (fixed seed for reproducibility).
(training_data, testing_data) = data.randomSplit([0.8, 0.2], seed=42)

# Create the RandomForestRegressor estimator.
# maxBins=35 is the original setting; presumably chosen so that it is not
# smaller than the number of distinct indexed categories - TODO confirm.
rf_estimator = RandomForestRegressor(featuresCol="features", labelCol=prediction_value,
                                     numTrees=10, maxBins=35)

# Train the model; dt_model holds the fitted model from here on.
dt_model = rf_estimator.fit(training_data)

# Make predictions on the testing set.
predictions = dt_model.transform(testing_data)

# Present the index columns as integers; this does not affect the evaluation.
predictions = predictions.withColumn("CategoryIndex", predictions["CategoryIndex"].cast("integer"))
predictions = predictions.withColumn("ContentRatingIndex", predictions["ContentRatingIndex"].cast("integer"))

# Evaluate the model with root-mean-squared error on the held-out data.
evaluator = RegressionEvaluator(labelCol=prediction_value, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Result of test data =", rmse)


Result of test data = 0.5307265472745697
