In [96]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *

In [97]:
spark = SparkSession.builder.appName('abc').getOrCreate()

In [98]:
df = spark.read.format("csv")\
.option("header", "True")\
.option("inferSchema", "True")\
.load("/googlestore.csv")

In [99]:
df.show(5)

+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|   Installs|Type|Price|Content Rating|              Genres|    Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+------------------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|    10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|             1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|   500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|             2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M| 5,000,000+|Free|    0|      Everyone|        Art & Design|  August 1, 2018|             1.2.4|4.0.3 and up|
|Ske

In [100]:
df.count()

10841

In [101]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [102]:
df = df.drop("Size", "Content Rating", "Last Updated", "Android Ver", "Current Ver")

In [103]:
df.show(2)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|500,000+|Free|    0|Art & Design;Pret...|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
only showing top 2 rows



In [104]:
from pyspark.sql.types import DoubleType

In [105]:
df = df.withColumn("Rating", col("Rating").cast(DoubleType()))\
.withColumn("Reviews", col("Reviews").cast(IntegerType()))

In [106]:
df = df.fillna(0, 'Rating')

In [107]:
df.select('Rating').distinct().show()

+------+
|Rating|
+------+
|   2.4|
|   0.0|
|   3.5|
|   2.9|
|   3.7|
|   1.4|
|   2.3|
|   4.9|
|   3.1|
|   4.2|
|   4.5|
|   1.7|
|   3.4|
|   2.5|
|   1.0|
|   2.7|
|   4.1|
|   2.2|
|   2.8|
|   4.0|
+------+
only showing top 20 rows



In [108]:
df.show()

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    50,000+|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|    50,000+|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_

In [109]:
df = df.withColumn("Installs", regexp_replace(col("Installs"), "[^0-9]", ""))\
.withColumn("Price", regexp_replace(col("Price"),"[$]", ""))

In [110]:
df.show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815| 1000000

In [111]:
df = df.withColumn("Installs", col("Installs").cast(IntegerType()))\
.withColumn("Price", col("Price").cast(IntegerType()))

In [112]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = false)
 |-- Reviews: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Genres: string (nullable = true)



In [113]:
top_reviews = df.select("App", "Reviews")\
                .orderBy(col("Reviews").desc()).limit(10)
top_reviews.show()

+--------------------+--------+
|                 App| Reviews|
+--------------------+--------+
|            Facebook|78158306|
|            Facebook|78128208|
|  WhatsApp Messenger|69119316|
|  WhatsApp Messenger|69119316|
|  WhatsApp Messenger|69109672|
|           Instagram|66577446|
|           Instagram|66577313|
|           Instagram|66577313|
|           Instagram|66509917|
|Messenger – Text ...|56646578|
+--------------------+--------+



In [114]:
from typing import OrderedDict
top_installs = df.select("App", "Installs")\
                 .orderBy(col("Installs").desc()).limit(10)
top_installs.show()

+--------------------+----------+
|                 App|  Installs|
+--------------------+----------+
|            Hangouts|1000000000|
|Google Chrome: Fa...|1000000000|
|   Google Play Books|1000000000|
|  WhatsApp Messenger|1000000000|
|Messenger – Text ...|1000000000|
|Messenger – Text ...|1000000000|
|               Gmail|1000000000|
|            Hangouts|1000000000|
|Google Chrome: Fa...|1000000000|
|Skype - free IM &...|1000000000|
+--------------------+----------+



In [115]:
category_distribution = df.groupBy("Category")\
                          .agg(sum(col("Installs")).alias("Total_Installs"))\
                          .orderBy(col("Total_Installs").desc())
category_distribution.show(10)

+------------------+--------------+
|          Category|Total_Installs|
+------------------+--------------+
|              GAME|   35086024415|
|     COMMUNICATION|   32647276251|
|      PRODUCTIVITY|   14176091369|
|            SOCIAL|   14069867902|
|             TOOLS|   11452771915|
|            FAMILY|   10258263505|
|       PHOTOGRAPHY|   10088247655|
|NEWS_AND_MAGAZINES|    7496317760|
|  TRAVEL_AND_LOCAL|    6868887146|
|     VIDEO_PLAYERS|    6222002720|
+------------------+--------------+
only showing top 10 rows



In [116]:
top_paid_apps = df.filter(col("Type") == "Paid")\
                  .orderBy(col("Installs").desc())\
                  .limit(10)
top_paid_apps.show(10)

+--------------------+---------------+------+-------+--------+----+-----+--------------------+
|                 App|       Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+---------------+------+-------+--------+----+-----+--------------------+
|           Minecraft|         FAMILY|   4.5|2376564|10000000|Paid|    6|Arcade;Action & A...|
|       Hitman Sniper|           GAME|   4.6| 408292|10000000|Paid|    0|              Action|
|           Minecraft|         FAMILY|   4.5|2375336|10000000|Paid|    6|Arcade;Action & A...|
| Facetune - For Free|    PHOTOGRAPHY|   4.4|  49553| 1000000|Paid|    5|         Photography|
|Zombie Avengers:(...|           GAME|   4.3|  13604| 1000000|Paid|    0|              Action|
|   Cut the Rope GOLD|         FAMILY|   4.6|  61264| 1000000|Paid|    0|              Puzzle|
|          True Skate|         SPORTS|   4.4| 129409| 1000000|Paid|    1|              Sports|
|Beautiful Widgets...|PERSONALIZATION|   4.2|  978

In [117]:
top_paid_rating_apps = df.filter(col("Type") == "Paid")\
                         .orderBy(col("Rating").desc())\
                         .limit(10)
top_paid_rating_apps.show()

+--------------------+---------------+------+-------+--------+----+-----+---------------+
|                 App|       Category|Rating|Reviews|Installs|Type|Price|         Genres|
+--------------------+---------------+------+-------+--------+----+-----+---------------+
|     P-Home for KLWP|PERSONALIZATION|   5.0|      4|     100|Paid|    0|Personalization|
|            Ra Ga Ba|           GAME|   5.0|      2|       1|Paid|    1|         Arcade|
|Android P Style I...|PERSONALIZATION|   5.0|      1|     100|Paid|    0|Personalization|
|Super Hearing Sec...|        MEDICAL|   5.0|      3|     100|Paid|    2|        Medical|
|Easy Hotspot Ad Free|          TOOLS|   5.0|      2|      10|Paid|    0|          Tools|
|AJ Gray Dark Icon...|PERSONALIZATION|   5.0|      2|      10|Paid|    0|Personalization|
|AP Art History Fl...|         FAMILY|   5.0|      1|      10|Paid|   29|      Education|
|   AJ Blue Icon Pack|PERSONALIZATION|   5.0|      4|      50|Paid|    0|Personalization|
|        A