### Install Necessary Package

In [None]:
pip install pyspark

### Import dependencies

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *

### Create Spark Session

In [None]:
# Build the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Google Play Store Analysis")
    .getOrCreate()
)

### Load the CSV file

In [None]:
# Read the Play Store CSV: comma-separated, first line is the header, `"` is
# the escape character, and column types are inferred from the data.
df = (
    spark.read.format('csv')
    .options(sep=',', header='true', escape='"', inferschema='true')
    .load('/kaggle/input/googleplaystore/googleplaystore.csv')
)

In [None]:
# Number of rows in the loaded DataFrame.
df.count()

In [None]:
# Preview a single row of the raw data.
df.show(n=1)

### Check Table Schema

In [None]:
# Print every column's name and data type.
df.printSchema()

### Data Cleaning

In [None]:
# Remove columns that the queries below do not reference.
unused_columns = ("Size", "Content Rating", "Last Updated", "Android Ver")
df = df.drop(*unused_columns)

In [None]:
# Preview two rows to check the remaining columns.
df.show(n=2)

In [None]:
# Drop the "Current Ver" column as well.
df = df.drop("Current Ver")

In [None]:
# Preview two rows to check the remaining columns.
df.show(n=2)

In [None]:
# Print the schema again to see the columns that are left and their types.
df.printSchema()

In [None]:
from pyspark.sql.functions import regexp_replace, col

# Convert the columns that hold numbers into numeric types:
#   Reviews  -> integer
#   Installs -> remove every non-digit character, then integer
#   Price    -> remove the "$" sign, then double
# Price is cast to double rather than IntegerType because an integer column
# cannot represent fractional prices (e.g. 4.99): the cents would be lost, or
# the cast would fail outright when Spark runs in ANSI mode.
df = (
    df.withColumn("Reviews", col("Reviews").cast(IntegerType()))
    .withColumn("Installs", regexp_replace(col("Installs"), "[^0-9]", ""))
    .withColumn("Installs", col("Installs").cast(IntegerType()))
    .withColumn("Price", regexp_replace(col("Price"), "[$]", ""))
    .withColumn("Price", col("Price").cast("double"))
)

In [None]:
# Preview three rows after the type conversions.
df.show(n=3)

In [None]:
# Register the DataFrame as the temporary view "apps" so that it can be
# queried with spark.sql().
df.createOrReplaceTempView("apps")

In [None]:
# Query the "apps" view for all columns and rows.
result_df = spark.sql("SELECT * FROM apps")

# show() with no arguments prints at most the first 20 rows.
result_df.show()

### Top 10 apps by total number of reviews

In [None]:
# Sum Reviews per App and keep the 10 apps with the largest totals.
query1 = """
SELECT
    App,
    SUM(Reviews)
FROM
    apps
GROUP BY
    App
ORDER BY 
    SUM(Reviews) DESC
LIMIT 10;
    """

# Run the query against the "apps" view and print the result.
result_df = spark.sql(query1)
result_df.show()

### Top 10 most-installed apps

In [None]:
# Sum Installs per (App, Type) pair and keep the 10 pairs with the largest
# totals.
query2 = """
SELECT
    App,
    Type,
    SUM(Installs)
FROM
    apps
GROUP BY
    App,
    Type
ORDER BY 
    SUM(Installs) DESC
LIMIT 10;
    """

# Run the query against the "apps" view and print the result.
result_df = spark.sql(query2)
result_df.show()

### Category-wise distribution of app installs (top 10 categories)

In [None]:
# Sum Installs per Category and keep the 10 categories with the largest
# totals. "GROUP BY 1" and "ORDER BY 2" refer to the select-list positions
# (Category and SUM(Installs) respectively).
query3 = """
SELECT
    Category,
    SUM(Installs)
FROM
    apps
GROUP BY
    1
ORDER BY 
    2 DESC
LIMIT 10;
    """

# Run the query against the "apps" view and print the result.
result_df = spark.sql(query3)
result_df.show()

### Top paid apps

In [None]:
# Sum Price per App over the rows whose Type is 'Paid' and keep the 10 apps
# with the largest totals. The filter is applied directly to "apps"; wrapping
# it in a "SELECT * ... WHERE" derived table is unnecessary.
# "GROUP BY 1" and "ORDER BY 2" refer to the select-list positions
# (App and SUM(Price) respectively).
query4 = """
SELECT
    App,
    SUM(Price)
FROM
    apps
WHERE
    Type = 'Paid'
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 10;
    """

# Run the query against the "apps" view and print the result.
result_df = spark.sql(query4)
result_df.show()