### ML with PySpark
+ Classify/Predict 

In [1]:
# Load our Pkgs
from pyspark import SparkContext

In [2]:
sc = SparkContext(master='local')

In [3]:
# Spark UI
sc

In [4]:
# Load Pkgs 
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType,BooleanType,DateType
from pyspark.sql.functions import col

In [5]:
# Spark
spark = SparkSession.builder.appName("MLwithSpark").getOrCreate()

#### WorkFlow
+ Data Prep
+ Feature Engineering
+ Build Model
+ Evaluate

# Task
+ Predict if a patient is Hep or not based parameter
+ The data set contains laboratory values of blood donors and Hepatitis C patients and demographic values like age.



In [6]:
# Load our dataset
df = spark.read.csv("Google-Playstore.csv",header=True,inferSchema=True)

In [7]:
# Preview Dataset
df.show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+-----

In [8]:
# check for columns
print(df.columns)

['App Name', 'App Id', 'Category', 'Rating', 'Rating Count', 'Installs', 'Minimum Installs', 'Maximum Installs', 'Free', 'Price', 'Currency', 'Size', 'Minimum Android', 'Developer Id', 'Developer Website', 'Developer Email', 'Released', 'Last Updated', 'Content Rating', 'Privacy Policy', 'Ad Supported', 'In App Purchases', 'Editors Choice', 'Scraped Time']


In [9]:
# Rearrange
df = df.select('App Id', 'App Name', 'Category', 'Rating', 'Rating Count', 'Installs', 'Minimum Installs', 'Maximum Installs', 'Free', 'Price', 'Currency', 'Size', 'Minimum Android', 'Developer Id', 'Developer Website', 'Developer Email', 'Released', 'Last Updated', 'Content Rating', 'Privacy Policy', 'Ad Supported', 'In App Purchases', 'Editors Choice', 'Scraped Time')
df.show(5)

+--------------------+--------------------+-------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|              App Id|            App Name|     Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+--------------------+--------------------+-------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+----------

In [10]:
# Check for the Schema
df.printSchema()


#cast some columns to numeric values not strings
df = df.withColumn("Rating",df['Rating'].cast('float'))
df = df.withColumn("Rating Count",df['Rating Count'].cast('int'))
df = df.withColumn("Minimum Installs",df['Minimum Installs'].cast('int'))
df = df.withColumn("Maximum Installs",df['Maximum Installs'].cast('int'))
df = df.withColumn("Price",df['Price'].cast('float'))


df = df.na.drop(how='any', subset=['Rating', 'Rating Count', 'Minimum Installs', 'Maximum Installs', 'Free', 'Price', 'Content Rating', 'Category'])



root
 |-- App Id: string (nullable = true)
 |-- App Name: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Rating Count: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Minimum Installs: string (nullable = true)
 |-- Maximum Installs: string (nullable = true)
 |-- Free: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Developer Id: string (nullable = true)
 |-- Developer Website: string (nullable = true)
 |-- Developer Email: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Privacy Policy: string (nullable = true)
 |-- Ad Supported: string (nullable = true)
 |-- In App Purchases: string (nullable = true)
 |-- Editors Choice: string (nullable = true)
 |-- Scra

In [11]:

df.printSchema()



root
 |-- App Id: string (nullable = true)
 |-- App Name: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Rating Count: integer (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Minimum Installs: integer (nullable = true)
 |-- Maximum Installs: integer (nullable = true)
 |-- Free: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Developer Id: string (nullable = true)
 |-- Developer Website: string (nullable = true)
 |-- Developer Email: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Privacy Policy: string (nullable = true)
 |-- Ad Supported: string (nullable = true)
 |-- In App Purchases: string (nullable = true)
 |-- Editors Choice: string (nullable = true)
 |-- Scr

In [12]:
# Descriptive summary
df.describe().show()

+-------+--------------------+--------------------+--------+------------------+------------------+------------+------------------+------------------+-------+-------------------+--------+------------------+------------------+--------------------+-----------------+--------------------+---------------+------------+---------------+-----------------------+--------------------+----------------+--------------+-------------------+
|summary|              App Id|            App Name|Category|            Rating|      Rating Count|    Installs|  Minimum Installs|  Maximum Installs|   Free|              Price|Currency|              Size|   Minimum Android|        Developer Id|Developer Website|     Developer Email|       Released|Last Updated| Content Rating|         Privacy Policy|        Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+-------+--------------------+--------------------+--------+------------------+------------------+------------+------------------+----------------

In [13]:
#count of apps for each category
df.groupBy('Category').count().show()

#mean rating for each category
df.groupBy('Category').mean('Rating').show()

+----------------+------+
|        Category| count|
+----------------+------+
|   Music & Audio|154588|
|       Education|239782|
|          Trivia| 11689|
| Auto & Vehicles| 17951|
|   Entertainment|137572|
|       Adventure| 22787|
|          Arcade| 52707|
|          Sports| 47052|
|  Travel & Local| 66666|
|    Food & Drink| 73443|
|    Role Playing|  9657|
|         Finance| 64729|
| Personalization| 89011|
|          Racing| 10177|
|           Tools|141965|
|     Educational| 21058|
|          Comics|  2821|
|          Social| 44040|
|Libraries & Demo|  5068|
|        Shopping| 74750|
+----------------+------+
only showing top 20 rows

+----------------+------------------+
|        Category|       avg(Rating)|
+----------------+------------------+
|   Music & Audio| 2.226824850719996|
|       Education|2.2468225311926124|
|          Trivia|2.5261784584102687|
| Auto & Vehicles| 2.049612834137347|
|   Entertainment|2.4353916492931953|
|       Adventure| 2.814991004868682|
|       

#### Feature Engineering
+ Numberical Values
+ Vectorization
+ Scaling

In [12]:
df.show(5)

+--------------------+--------------------+-------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|              App Id|            App Name|     Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+--------------------+--------------------+-------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+----------

In [13]:
import pyspark.ml
dir(pyspark.ml)

['Estimator',
 'Model',
 'Pipeline',
 'PipelineModel',
 'PredictionModel',
 'Predictor',
 'TorchDistributor',
 'Transformer',
 'UnaryTransformer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'base',
 'classification',
 'clustering',
 'common',
 'evaluation',
 'feature',
 'fpm',
 'image',
 'linalg',
 'param',
 'pipeline',
 'recommendation',
 'regression',
 'stat',
 'torch',
 'tree',
 'tuning',
 'util',
 'wrapper']

In [14]:
# Load ML Pkgs
from pyspark.ml.feature import VectorAssembler,StringIndexer

In [17]:
# Unique Values for Category
df.select('Category').distinct().show()

+----------------+
|        Category|
+----------------+
|   Music & Audio|
|       Education|
|          Trivia|
| Auto & Vehicles|
|   Entertainment|
|       Adventure|
|          Arcade|
|          Sports|
|  Travel & Local|
|    Food & Drink|
|    Role Playing|
|         Finance|
| Personalization|
|          Racing|
|           Tools|
|     Educational|
|          Comics|
|          Social|
|Libraries & Demo|
|        Shopping|
+----------------+
only showing top 20 rows



In [15]:
# Convert the string into numerical code
# label encoding
categoryEncoder = StringIndexer(inputCol='Category',outputCol='Category_enc', handleInvalid='skip').fit(df)
df = categoryEncoder.transform(df)

categoryEncoder = StringIndexer(inputCol='Free',outputCol='Free_enc', handleInvalid='skip').fit(df)
df = categoryEncoder.transform(df)

categoryEncoder = StringIndexer(inputCol='Content Rating',outputCol='Content Rating_enc',  handleInvalid='skip').fit(df)
df = categoryEncoder.transform(df)

categoryEncoder = StringIndexer(inputCol='Ad Supported',outputCol='Ad Supported_enc', handleInvalid='skip').fit(df)
df = categoryEncoder.transform(df)

categoryEncoder = StringIndexer(inputCol='In App Purchases',outputCol='In App Purchases_enc', handleInvalid='skip').fit(df)
df = categoryEncoder.transform(df)

categoryEncoder = StringIndexer(inputCol='Editors Choice',outputCol='Editors Choice_enc', handleInvalid='skip').fit(df)
df = categoryEncoder.transform(df)

df.na.fill(value=0.0)


df.show(5)

+--------------------+--------------------+-------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+------------+--------+------------------+----------------+--------------------+------------------+
|              App Id|            App Name|     Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|Category_enc|Free_enc|Content Rating_enc|Ad Supported_enc|In App Purchases_enc|Editors Choice_enc|
+--------------------+--------------------+-------------+------+------------+--------+--------------

In [16]:
# encoded column back to String
from pyspark.ml.feature import IndexToString
converter = IndexToString(inputCol='Category_enc',outputCol='orig_category')
converted_df = converter.transform(df)
converted_df.show()

+--------------------+----------------------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+------------+--------+------------------+----------------+--------------------+------------------+-----------------+
|              App Id|                          App Name|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|Category_enc|Free_enc|Content Rating_enc|Ad Supported_enc|In App Purchases_enc|Editors Choice_enc|    orig_category|
+--------------------+------

### Features


In [17]:
'''

#fix nullable columns
from pyspark.sql.functions import col, lit, coalesce

df = df.withColumn("Rating", coalesce(col("Rating"), lit(0.0)))
df = df.withColumn("Rating Count", coalesce(col("Rating Count"), lit(0)))
df = df.withColumn("Minimum Installs", coalesce(col("Minimum Installs"), lit(0)))
df = df.withColumn("Maximum Installs", coalesce(col("Maximum Installs"), lit(0)))
df = df.withColumn("Price", coalesce(col("Price"), lit(0.0)))

df.printSchema()
'''


'\n\n#fix nullable columns\nfrom pyspark.sql.functions import col, lit, coalesce\n\ndf = df.withColumn("Rating", coalesce(col("Rating"), lit(0.0)))\ndf = df.withColumn("Rating Count", coalesce(col("Rating Count"), lit(0)))\ndf = df.withColumn("Minimum Installs", coalesce(col("Minimum Installs"), lit(0)))\ndf = df.withColumn("Maximum Installs", coalesce(col("Maximum Installs"), lit(0)))\ndf = df.withColumn("Price", coalesce(col("Price"), lit(0.0)))\n\ndf.printSchema()\n'

In [18]:
#we took the only features we are concerned about
#plus the target variable
df2 = df.select('Rating Count','Minimum Installs', 'Maximum Installs', 'Free_enc', 'Price', 'Content Rating_enc', 'Ad Supported_enc', 'In App Purchases_enc', 'Editors Choice_enc', 'Category_enc', 'Rating')

In [19]:
df2.printSchema()

root
 |-- Rating Count: integer (nullable = true)
 |-- Minimum Installs: integer (nullable = true)
 |-- Maximum Installs: integer (nullable = true)
 |-- Free_enc: double (nullable = false)
 |-- Price: float (nullable = true)
 |-- Content Rating_enc: double (nullable = false)
 |-- Ad Supported_enc: double (nullable = false)
 |-- In App Purchases_enc: double (nullable = false)
 |-- Editors Choice_enc: double (nullable = false)
 |-- Category_enc: double (nullable = false)
 |-- Rating: float (nullable = true)



In [20]:
df2.groupBy('Free_enc').count().show()

+--------+-------+
|Free_enc|  count|
+--------+-------+
|     0.0|2245163|
|     1.0|  44840|
+--------+-------+



In [21]:
'''
#convert pyspark df to pandas df
pd_df = df2.toPandas()

# Convert To PySpark Dataframe
new_df = spark.createDataFrame(pd_df)

new_df.show()
'''

'\n#convert pyspark df to pandas df\npd_df = df2.toPandas()\n\n# Convert To PySpark Dataframe\nnew_df = spark.createDataFrame(pd_df)\n\nnew_df.show()\n'

In [22]:
required_features = df2.columns[:-1]      #all except the target variable 'Rating' which is at the last index
print(required_features)

['Rating Count', 'Minimum Installs', 'Maximum Installs', 'Free_enc', 'Price', 'Content Rating_enc', 'Ad Supported_enc', 'In App Purchases_enc', 'Editors Choice_enc', 'Category_enc']


In [23]:
# VectorAsm
vec_assembler = VectorAssembler(inputCols=required_features,outputCol='Features')

In [24]:
vec_df = vec_assembler.setHandleInvalid("skip").transform(df2)

In [25]:
vec_df.na.drop(how='any',subset='Features')

DataFrame[Rating Count: int, Minimum Installs: int, Maximum Installs: int, Free_enc: double, Price: float, Content Rating_enc: double, Ad Supported_enc: double, In App Purchases_enc: double, Editors Choice_enc: double, Category_enc: double, Rating: float, Features: vector]

In [26]:
vec_df.show(30, False)

+------------+----------------+----------------+--------+-----+------------------+----------------+--------------------+------------------+------------+------+----------------------------------------------------+
|Rating Count|Minimum Installs|Maximum Installs|Free_enc|Price|Content Rating_enc|Ad Supported_enc|In App Purchases_enc|Editors Choice_enc|Category_enc|Rating|Features                                            |
+------------+----------------+----------------+--------+-----+------------------+----------------+--------------------+------------------+------------+------+----------------------------------------------------+
|0           |10              |15              |0.0     |0.0  |0.0               |0.0             |0.0                 |0.0               |25.0        |0.0   |(10,[1,2,9],[10.0,15.0,25.0])                       |
|64          |5000            |7662            |0.0     |0.0  |0.0               |1.0             |0.0                 |0.0               |3.0      

### Train,Test Split

In [30]:
train_df,test_df = vec_df.select(['Features', 'Rating']).randomSplit([0.7,0.3])

In [31]:
train_df.count()

1602048

In [32]:
train_df.show(4)

+----------+------+
|  Features|Rating|
+----------+------+
|(10,[],[])|   0.0|
|(10,[],[])|   0.0|
|(10,[],[])|   0.0|
|(10,[],[])|   0.0|
+----------+------+
only showing top 4 rows



#### Model Building
+ Pyspark.ml: DataFrame
+ Pyspark.mllib: RDD /Legacy

In [33]:
from pyspark.ml.regression import LinearRegression

In [34]:
# Regr Model
lr = LinearRegression(featuresCol='Features',labelCol='Rating')

In [35]:
lr_model = lr.fit(train_df)

#just some training metrics
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [37]:
y_pred = lr_model.transform(test_df)

#notice another column called 'prediction' is automatically made
y_pred.show(5)

#### Model Evaluation

In [39]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Rating",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(y_pred))

R Squared (R2) on test data = 0.0451513


In [42]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 2.05874


#### Decision Tree Regression

In [45]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol ='Features', labelCol = 'Rating', maxBins=48)   #setting a larger maxbins(default=32) is an ad-hoc step just to solve an error I dont understand :)
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="Rating", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.465668


In [46]:
#Feature Importance
dt_model.featureImportances

#notice that Feature at index 0 has higher importance, this feature is the 'Rating Count'

SparseVector(10, {0: 0.9917, 2: 0.0062, 3: 0.0, 4: 0.0001, 6: 0.0003, 9: 0.0017})

### Gradient-Boosted Decision Tree Regressor

In [48]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'Features', labelCol = 'Rating', maxIter=10, maxBins=48) #same as above, maxbins is set just to avoid an error I dont understand
# maxIter is just a hyperparameter we put by hand
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'Rating', 'Features').show(5)

+--------------------+------+----------+
|          prediction|Rating|  Features|
+--------------------+------+----------+
|-6.46655770197093E-4|   0.0|(10,[],[])|
|-6.46655770197093E-4|   0.0|(10,[],[])|
|-6.46655770197093E-4|   0.0|(10,[],[])|
|-6.46655770197093E-4|   0.0|(10,[],[])|
|-6.46655770197093E-4|   0.0|(10,[],[])|
+--------------------+------+----------+
only showing top 5 rows



In [49]:
gbt_evaluator = RegressionEvaluator(
    labelCol="Rating", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.452679


### Saving Model


In [62]:
#this is giving me errors I still cannot resolve

from pyspark.ml.pipeline import PipelineModel
gbt_model.save("D:/CLASSSWORKKK/BD/BD_PROJECT/GBT_Model")
#to overwrite an existing model
gbt_model.write().overwrite().save("D:/CLASSSWORKKK/BD/BD_PROJECT/GBT_Model")

Py4JJavaError: An error occurred while calling o1689.save.
: java.io.IOException: Path D:/CLASSSWORKKK/BD/BD_PROJECT/GBT_Model already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)


### Loading Model

In [63]:
from pyspark.ml.pipeline import PipelineModel
LoadedModel = PipelineModel.load("./GBT_Model")

Py4JJavaError: An error occurred while calling o1696.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/d:/CLASSSWORKKK/BD/BD_PROJECT/GBT_Model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:208)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:291)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:291)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:287)
	at org.apache.spark.api.java.JavaRDDLike.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.JavaRDDLike.partitions$(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.io.IOException: Input path does not exist: file:/d:/CLASSSWORKKK/BD/BD_PROJECT/GBT_Model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 25 more
