In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [2]:
#just copy paste those 2 lines
sc = SparkContext(master='local')
spark=SparkSession.builder.appName('Ayman').getOrCreate()

In [3]:
#click on the Spark UI, it shows some good metrics
# PS: we can actually cheat while performing map-reduce from this UI
# probably you won't understand anything from this UI, but bear with me, no need to fully understand everything
spark

In [4]:
# Reading a csv, same as pandas
# but specify to read the header, and inferSchema so as not to read every column as String
df_pyspark=spark.read.csv('Google-Playstore.csv', header=True, inferSchema= True)
df_pyspark.show(10)
#dont use print() in pyspark, always use .show()

+--------------------+--------------------+----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|            App Name|              App Id|        Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+--------------------+--------------------+----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+-

In [None]:
#schema is just all columns with their data types
df_pyspark.printSchema()
print(type(df_pyspark))

In [5]:
df_pyspark.head(3)
df_pyspark.show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+-----

In [6]:
# if you wanna select a specific column(s)
df_pyspark.select('App Id').show()
df_pyspark.select(['App Id', 'Rating']).show()

+--------------------+
|              App Id|
+--------------------+
| com.ishakwe.gakondo|
|com.webserveis.ba...|
|com.doantiepvien.crm|
|cst.stJoseph.ug17...|
|com.horodyski.grower|
|          com.imocci|
|getfreedata.super...|
|com.mozaix.simone...|
|   com.whatsopen.app|
|com.ikeyboard.the...|
|com.MrScratchEnte...|
|com.eqra.android....|
|com.jieapp.taoyua...|
|com.hastobe.bemob...|
|com.caliwayconduc...|
|com.ionicfirebase...|
|com.camineo.otent...|
|com.tkapplication...|
|    com.grit.redmond|
|com.felingdev.low...|
+--------------------+
only showing top 20 rows

+--------------------+------+
|              App Id|Rating|
+--------------------+------+
| com.ishakwe.gakondo|   0.0|
|com.webserveis.ba...|   4.4|
|com.doantiepvien.crm|   0.0|
|cst.stJoseph.ug17...|   5.0|
|com.horodyski.grower|   0.0|
|          com.imocci|   0.0|
|getfreedata.super...|   4.5|
|com.mozaix.simone...|   2.0|
|   com.whatsopen.app|   0.0|
|com.ikeyboard.the...|   4.7|
|com.MrScratchEnte...|   4.9|
|c

In [7]:
#try to not use this, ever, if it happens and you wanna use this line, there might be something wrong
df_pyspark['App Id']

Column<'App Id'>

In [8]:
#quick summary of the data
df_pyspark.describe().show()

+-------+--------------------+--------------------+--------------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+------------------+------------------+------------------+------------------+--------------------+-----------------+--------------------+---------------+------------------+---------------+-----------------------+--------------------+--------------------+--------------------+-------------------+
|summary|            App Name|              App Id|      Category|              Rating|        Rating Count|         Installs|    Minimum Installs|    Maximum Installs|              Free|             Price|          Currency|              Size|   Minimum Android|        Developer Id|Developer Website|     Developer Email|       Released|      Last Updated| Content Rating|         Privacy Policy|        Ad Supported|    In App Purchases|      Editors Choice|       Scraped Time|
+-------+--------------------+------

In [9]:
### Adding new Columns in data frame
df_pyspark=df_pyspark.withColumn('App Id Duplicated', df_pyspark['App Id'])
df_pyspark.show()

### Rename this new column
df_pyspark = df_pyspark.withColumnRenamed('App Id Duplicated','App ID 2')

#now drop this new column
df_pyspark = df_pyspark.drop('App ID 2')
df_pyspark.show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+--------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|   App Id Duplicated|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+------------------

In [10]:
#Dropping nan values for all columns, if any nan value in any column is present, drop this entire row
df_pyspark.na.drop().show()

#same behaviour as above
df_pyspark.na.drop(how='any').show()

#only drop the rows where ALL values are Nan
df_pyspark.na.drop(how='all').show()

#drop rows with 3 or more Nan Values, otherwise keep the row
df_pyspark.na.drop(how='any', thresh= 3).show()

#drop rows with any Nan values for columns "App Id" & "Rating" ONLY
df_pyspark.na.drop(how='any', subset=['App Id', 'Rating']).show()



+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+-----

In [11]:
### Filling the Missing Values with a constant value
df_pyspark.na.fill('MISSING',['App Id','Rating']).show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+-----

In [13]:
### to cast columns to a specific data type (for example string --> float)

#cast some columns to numeric values not strings
df_pyspark = df_pyspark.withColumn("Rating",df_pyspark['Rating'].cast('float'))
df_pyspark = df_pyspark.withColumn("Rating Count",df_pyspark['Rating Count'].cast('int'))
df_pyspark = df_pyspark.withColumn("Minimum Installs",df_pyspark['Minimum Installs'].cast('int'))
df_pyspark = df_pyspark.withColumn("Maximum Installs",df_pyspark['Maximum Installs'].cast('int'))
df_pyspark = df_pyspark.withColumn("Price",df_pyspark['Price'].cast('float'))

df_pyspark.printSchema()


root
 |-- App Name: string (nullable = true)
 |-- App Id: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Rating Count: integer (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Minimum Installs: integer (nullable = true)
 |-- Maximum Installs: integer (nullable = true)
 |-- Free: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Developer Id: string (nullable = true)
 |-- Developer Website: string (nullable = true)
 |-- Developer Email: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Privacy Policy: string (nullable = true)
 |-- Ad Supported: string (nullable = true)
 |-- In App Purchases: string (nullable = true)
 |-- Editors Choice: string (nullable = true)
 |-- Scr

In [15]:
### median imputation, must only perform on numeric columns, if you wanna use it for string columns, consider encoding the column
# to numeric values

from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['Price', 'Rating', 'Rating Count'], 
    outputCols=["{}_imputed".format(c) for c in ['Price', 'Rating', 'Rating Count']]
    ).setStrategy("median")

# Add imputation cols to df
imputer.fit(df_pyspark).transform(df_pyspark).show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+-------------+--------------+--------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|Price_imputed|Rating_imputed|Rating Count_imputed|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+--

In [16]:
### mean imputation

from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['Price', 'Rating', 'Rating Count'], 
    outputCols=["{}_imputed".format(c) for c in ['Price', 'Rating', 'Rating Count']]
    ).setStrategy("mean")

# Add imputation cols to df
imputer.fit(df_pyspark).transform(df_pyspark).show()

#there is also a mode imputer

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+-------------+--------------+--------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|Price_imputed|Rating_imputed|Rating Count_imputed|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+--

In [17]:
### Filter our dataframe according to a criteria
df_pyspark.filter("Rating<=2.4").show()

#to choose specific columns
df_pyspark.filter("Rating<=2.4").select(['App Id','Rating Count']).show()

#you can also write it like this
df_pyspark.filter(df_pyspark['Rating']<=2.4).show()

+--------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|            App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+--------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+-------------------

In [18]:
# Filter by more than one condition
df_pyspark.filter((df_pyspark['Rating']<=4) & (df_pyspark['Rating']>=2.4)).show()

df_pyspark.filter((df_pyspark['Rating']<=4) | (df_pyspark['Rating']>=2.4)).show()

#NOTICE the '~' operator to negate the condition, not to negate the expression '!='
df_pyspark.filter(~(df_pyspark['Rating']<=2.4)).show()

#the difference is shown here, here we are filtering to recieve NO rows whose category is not "adventure" , meaning, we are looking for
#rows with category "Adventure" :)
df_pyspark.filter(~(df_pyspark['Category']!="Adventure")).show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+-----

In [19]:
## Groupby
# getting a quick sum report for all columns
df_pyspark.groupBy('Category').sum().show()

+--------------------+------------------+-----------------+---------------------+---------------------+------------------+
|            Category|       sum(Rating)|sum(Rating Count)|sum(Minimum Installs)|sum(Maximum Installs)|        sum(Price)|
+--------------------+------------------+-----------------+---------------------+---------------------+------------------+
|       Music & Audio|344240.40002310276|        195760915|          14239411974|          23021925013| 6552.157372951508|
|           Education|  538747.600174427|        104811664|           5983961831|          10814023504|39355.844651803374|
|              Trivia|29528.500000357628|         37403129|           1188962983|           2207718907|375.92618203163147|
|     Auto & Vehicles| 36792.59998559952|         12673750|           1594745568|           3056222948|2133.8702086806297|
|       Entertainment| 335045.9999767542|        224991976|          17108498394|          27939346236|7030.6443387418985|
|           Adve

In [20]:
#grouping by Category, then finding the mean Rating for each category
#this is used to find the most mean rated Category
df_pyspark.groupBy('Category').mean('Rating').show()

'''
other available aggregate functions...
count()
mean()
max()
min()
sum()
avg()
pivot()                         Not quite sure, seems like a median to me
approx_count_distinct()         function returns the count of distinct items in a group.
collect_list()                  function returns all values from an input column with duplicates.
collect_set()                   function returns all values from an input column with duplicate values eliminated.
countDistinct()                 function returns the number of distinct elements in a columns
first()
last()
kurtosis()                      Dont know
skewness()                      Dont know, but might come in handy later
stddev_samp()
stddev_pop()
sumDistinct() function returns the sum of all distinct values in a column.
var_samp()
var_pop()
'''

+--------------------+------------------+
|            Category|       avg(Rating)|
+--------------------+------------------+
|       Music & Audio| 2.226824850719996|
|           Education|2.2468225311926124|
|              Trivia|2.5261784584102687|
|     Auto & Vehicles| 2.049612834137347|
|       Entertainment|2.4354052028868614|
|           Adventure| 2.814991004868682|
|com.free074a81ba9...|              null|
|              Arcade|2.4504809646643357|
|net.cleverbit.Mic...|              null|
|              Sports|2.3155445043837792|
|      Travel & Local|1.9211963940963501|
|        Food & Drink|1.2945808316654184|
|        Role Playing|3.3844982885828023|
|             Finance|2.4030697207277085|
|     Personalization| 2.758753873719551|
|              Racing| 2.960233858728779|
|               Tools|2.3460960326089735|
|         Educational|2.4376389013191315|
|              Comics|2.8649060628127647|
|              Social|2.3989215095932646|
+--------------------+------------

'\nother available aggregate functions...\ncount()\nmean()\nmax()\nmin()\nsum()\navg()\npivot()                         Not quite sure, seems like a median to me\napprox_count_distinct()         function returns the count of distinct items in a group.\ncollect_list()                  function returns all values from an input column with duplicates.\ncollect_set()                   function returns all values from an input column with duplicate values eliminated.\ncountDistinct()                 function returns the number of distinct elements in a columns\nfirst()\nlast()\nkurtosis()                      Dont know\nskewness()                      Dont know, but might come in handy later\nstddev_samp()\nstddev_pop()\nsumDistinct() function returns the sum of all distinct values in a column.\nvar_samp()\nvar_pop()\n'

In [22]:
import pandas as pd
### switch pyspark df to pandas df
pd_df = df_pyspark.toPandas()

### switch pandas df back to pyspark df
new_df = spark.createDataFrame(pd_df)

Py4JJavaError: An error occurred while calling o123.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 41.0 failed 1 times, most recent failure: Lost task 0.0 in stage 41.0 (TID 70) (LAPTOP-66BOPJMV executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$2725/0x00000001012be040.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1859)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:712)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1(Utils.scala:271)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1$adapted(Utils.scala:271)
	at org.apache.spark.util.Utils$$$Lambda$2728/0x00000001012c8440.apply(Unknown Source)
	at org.apache.spark.util.Utils$.writeByteBufferImpl(Utils.scala:249)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:271)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2$adapted(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer$$Lambda$2727/0x00000001012c8840.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.writeExternal(ChunkedByteBuffer.scala:103)
	at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:60)
	at org.apache.spark.scheduler.DirectTaskResult$$Lambda$2732/0x00000001012cac40.apply$mcV$sp(Unknown Source)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1495)
	at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:59)
	at java.base/java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1460)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1179)
	at java.base/java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:349)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.SerializerHelper$.serializeToChunkedBuffer(SerializerHelper.scala:42)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3997)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3994)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$2725/0x00000001012be040.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1859)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:712)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1(Utils.scala:271)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1$adapted(Utils.scala:271)
	at org.apache.spark.util.Utils$$$Lambda$2728/0x00000001012c8440.apply(Unknown Source)
	at org.apache.spark.util.Utils$.writeByteBufferImpl(Utils.scala:249)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:271)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2$adapted(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer$$Lambda$2727/0x00000001012c8840.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.writeExternal(ChunkedByteBuffer.scala:103)
	at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:60)
	at org.apache.spark.scheduler.DirectTaskResult$$Lambda$2732/0x00000001012cac40.apply$mcV$sp(Unknown Source)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1495)
	at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:59)
	at java.base/java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1460)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1179)
	at java.base/java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:349)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.SerializerHelper$.serializeToChunkedBuffer(SerializerHelper.scala:42)


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\Users\swak\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\swak\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\swak\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\swak\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\c

In [25]:
### encode categorical columns to numeric values
from pyspark.ml.feature import VectorAssembler,StringIndexer, IndexToString
categoryEncoder = StringIndexer(inputCol='Category',outputCol='Category_enc', handleInvalid='keep').fit(df_pyspark)
df_pyspark = categoryEncoder.transform(df_pyspark)
df_pyspark.show()



ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:

#to decode back the encoded column
converter = IndexToString(inputCol='Category_enc',outputCol='orig_category')
converted_df = converter.transform(df_pyspark)
converted_df.show()