
## Overview

This is an end-to-end real-time PySpark project, where we analyze the Google Playstore dataset and derive business insights from the data.

In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *

# File location and type
file_location = "/FileStore/tables/apps.csv"
file_type = "csv"

df = spark.read.format(file_type) \
  .option('header','true') \
  .option('sep',',') \
  .option('escape','"') \
  .option('inferSchema', 'true') \
  .load(file_location)

display(df)

_c0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1,2.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19.0,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,29.0,"1,000,000+",Free,0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33.0,"1,000,000+",Free,0,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3.1,"10,000+",Free,0,Everyone,Art & Design;Creativity,"July 3, 2018",2.8,4.0.3 and up


In [0]:
df.count()

9659

In [0]:
df.show(1)

+---+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|_c0|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|      Genres|   Last Updated|Current Ver| Android Ver|
+---+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|  0|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|19.0| 10,000+|Free|    0|      Everyone|Art & Design|January 7, 2018|      1.0.0|4.0.3 and up|
+---+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
only showing top 1 row



In [0]:
# CheckSchema

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Size: double (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



### inferSchema isn't sufficient. We need additional Data cleaning steps

In [0]:
df = df.drop("_c0","Size","Content Rating", "Last Updated", "Android Ver", "Current Ver")

In [0]:
df.show(2)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|500,000+|Free|    0|Art & Design;Pret...|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
only showing top 2 rows



In [0]:
df.printSchema(())

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)



In [0]:
from pyspark.sql.functions import regexp_replace, col

df = df.withColumn("Reviews",col("Reviews").cast(IntegerType()))\
    .withColumn("Installs",regexp_replace(col("Installs"),"[^0-9]",""))\
    .withColumn("Installs",col("Installs").cast(IntegerType()))\
        .withColumn("Price",regexp_replace(col("Price"),"[$]",""))\
            .withColumn("Price",col("Price").cast(IntegerType()))

df.printSchema()

df.show(5)

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Genres: string (nullable = true)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_

In [0]:
df.createOrReplaceTempView("apps")

In [0]:
%sql

select * from apps

App,Category,Rating,Reviews,Installs,Type,Price,Genres
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Art & Design
Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Art & Design;Pretend Play
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Art & Design
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Art & Design
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Art & Design;Creativity
Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0,Art & Design
Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0,Art & Design
Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0,Art & Design
Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0,Art & Design
Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0,Art & Design;Creativity


### Most reviewed paid apps

In [0]:
%sql

select App, Type, sum(Reviews) from apps
where Type == 'Paid'
group by 1, 2
order by 3 desc

App,Type,sum(Reviews)
Minecraft,Paid,2376564
Hitman Sniper,Paid,408292
Grand Theft Auto: San Andreas,Paid,348962
Bloons TD 5,Paid,190086
Where's My Water?,Paid,188740
Card Wars - Adventure Time,Paid,129603
True Skate,Paid,129409
Five Nights at Freddy's,Paid,100805
Beautiful Widgets Pro,Paid,97890
DraStic DS Emulator,Paid,87766


### Top 20 installed paid apps

In [0]:
%sql
select App, Type, sum(Installs) from apps
where Type = 'Paid'
group by 1, 2
order by 3 Desc Limit 20

App,Type,sum(Installs)
Hitman Sniper,Paid,10000000
Minecraft,Paid,10000000
Infinity Dungeon VIP,Paid,1000000
Tasker,Paid,1000000
Threema,Paid,1000000
Beautiful Widgets Pro,Paid,1000000
Facetune - For Free,Paid,1000000
Zombie Avengers:(Dreamsky)Stickman War Z,Paid,1000000
True Skate,Paid,1000000
Card Wars - Adventure Time,Paid,1000000


### Category-wise distribution

In [0]:
%sql

select Category, sum(Installs) from apps
group by 1
order by 2 desc

Category,sum(Installs)
GAME,13878924415
COMMUNICATION,11038276251
TOOLS,8001771915
PRODUCTIVITY,5793091369
SOCIAL,5487867902
PHOTOGRAPHY,4649147655
FAMILY,4427941505
VIDEO_PLAYERS,3926902720
TRAVEL_AND_LOCAL,2894887146
NEWS_AND_MAGAZINES,2369217760


Databricks visualization. Run in Databricks to view.