In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *

In [0]:
df = spark.read.load('/FileStore/tables/googleplaystore.csv', format = 'csv', sep = ',' , header = True, escape = '"', inferschema = 'true')

In [0]:
df.count()

Out[32]: 10841

In [0]:
df.show(1)

+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|      Genres|   Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M| 10,000+|Free|    0|      Everyone|Art & Design|January 7, 2018|      1.0.0|4.0.3 and up|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
only showing top 1 row



In [0]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [0]:
df = df.drop("Size", "Content Rating", "Current Ver", "Android Ver")

In [0]:
df.show(2)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+----------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|    Last Updated|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+----------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 10,000+|Free|    0|        Art & Design| January 7, 2018|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|500,000+|Free|    0|Art & Design;Pret...|January 15, 2018|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+----------------+
only showing top 2 rows



In [0]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)



In [0]:
from pyspark.sql.functions import regexp_replace, col

df = df.withColumn("Reviews", col("Reviews").cast(IntegerType()))\
    .withColumn("Installs", regexp_replace(col("Installs"), "[^0-9]", ""))\
    .withColumn("Installs", col("Installs").cast(IntegerType()))\
        .withColumn("Price", regexp_replace(col("Price"), "[$]", ""))\
            .withColumn("Price", col("Price").cast(IntegerType()))


In [0]:
df.show(5)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+----------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|    Last Updated|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+----------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design| January 7, 2018|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|January 15, 2018|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|  August 1, 2018|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|    June 8, 2018|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|   June 20, 2018|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+----------------+
o

In [0]:
df.createOrReplaceTempView("apps")

In [0]:
%sql select * from apps
limit 10

App,Category,Rating,Reviews,Installs,Type,Price,Genres,Last Updated
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Art & Design,"January 7, 2018"
Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Art & Design;Pretend Play,"January 15, 2018"
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Art & Design,"August 1, 2018"
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Art & Design,"June 8, 2018"
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Art & Design;Creativity,"June 20, 2018"
Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0,Art & Design,"March 26, 2017"
Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0,Art & Design,"April 26, 2018"
Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0,Art & Design,"June 14, 2018"
Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0,Art & Design,"September 20, 2017"
Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0,Art & Design;Creativity,"July 3, 2018"


In [0]:
%sql select App, sum(Reviews) from apps
group by 1
order by 2 desc
limit 10

App,sum(Reviews)
Instagram,266241989
WhatsApp Messenger,207348304
Clash of Clans,179558781
Messenger – Text and Video Chat for Free,169932272
Subway Surfers,166331958
Candy Crush Saga,156993136
Facebook,156286514
8 Ball Pool,99386198
Clash Royale,92530298
Snapchat,68045010


In [0]:
%sql select App, Type, sum(Installs) as num_installed from apps
group by 1, 2
order by 3 desc
limit 10

App,Type,num_installed
Subway Surfers,Free,6000000000
Instagram,Free,4000000000
Google Drive,Free,4000000000
Hangouts,Free,4000000000
Google Photos,Free,4000000000
Google News,Free,4000000000
Candy Crush Saga,Free,3500000000
WhatsApp Messenger,Free,3000000000
Gmail,Free,3000000000
Temple Run 2,Free,3000000000


Databricks visualization. Run in Databricks to view.

In [0]:
%sql select Category, sum(Installs) as num_installed from apps
group by 1
order by 2 desc
limit 10

Category,num_installed
GAME,35086024415
COMMUNICATION,32647276251
PRODUCTIVITY,14176091369
SOCIAL,14069867902
TOOLS,11452771915
FAMILY,10258263505
PHOTOGRAPHY,10088247655
NEWS_AND_MAGAZINES,7496317760
TRAVEL_AND_LOCAL,6868887146
VIDEO_PLAYERS,6222002720


Databricks visualization. Run in Databricks to view.

In [0]:
%sql select * from apps
limit 10

App,Category,Rating,Reviews,Installs,Type,Price,Genres,Last Updated
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Art & Design,"January 7, 2018"
Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Art & Design;Pretend Play,"January 15, 2018"
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Art & Design,"August 1, 2018"
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Art & Design,"June 8, 2018"
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Art & Design;Creativity,"June 20, 2018"
Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0,Art & Design,"March 26, 2017"
Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0,Art & Design,"April 26, 2018"
Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0,Art & Design,"June 14, 2018"
Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0,Art & Design,"September 20, 2017"
Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0,Art & Design;Creativity,"July 3, 2018"


In [0]:
%sql select distinct(Type) from apps
limit 10


Type
0
""
Free
Paid


In [0]:
%sql select App, sum(price) as Price from apps
where Type = "Paid"
group by 1
order by 2 desc
limit 10

App,Price
I'm Rich - Trump Edition,400
most expensive app (H),399
I am rich(premium),399
I Am Rich Premium,399
I am Rich Plus,399
I'm Rich/Eu sou Rico/أنا غني/我很有錢,399
I Am Rich Pro,399
I AM RICH PRO PLUS,399
I am Rich,399
I am Rich!,399
