In [1]:
#Domain: Analytics
#The ever-changing mobile landscape is a challenging space to navigate. Android holds about 53.2% of the smartphone market, while iOS holds 43%. To get more people to download your app, you need to make sure they can easily find it. Mobile app analytics is a great way to understand the existing strategy and to drive growth and retention of future users.

#The data set contains more than 7000 Apple iOS mobile application details. The data was extracted from the iTunes Search API.

#Tasks:

#With millions of apps available today, data has become key to identifying the top trending apps in the iOS App Store. As a data scientist, you are required to explore the datasets, including cleaning and transforming them.


#The data set comprises information on 7200 apps on the App Store, with the following important details:

#"id" : App ID
#"track_name": App Name
#"size_bytes": Size (in Bytes)
#"price": Price amount
#"rating_count_tot": User Rating counts (for all version)
#"rating_count_ver": User Rating counts (for current version)
#"prime_genre": Primary Genre
#"sup_devices.num": Number of supporting devices
#"ipadSc_urls.num": Number of screenshots shown for display
#"lang.num": Number of supported languages
#"vpp_lic": VPP device-based licensing enabled





#appleStore.csv

#"id" : App ID
#"track_name": App Name
#"size_bytes": Size (in Bytes)
#"currency": Currency Type
#"price": Price amount
#"rating_count_tot": User Rating counts (for all version)
#"rating_count_ver": User Rating counts (for current version)
#"user_rating" : Average User Rating value (for all version)
#"user_rating_ver": Average User Rating value (for current version)
#"ver" : Latest version code
#"cont_rating": Content Rating
#"prime_genre": Primary Genre
#"sup_devices.num": Number of supporting devices
#"ipadSc_urls.num": Number of screenshots showed for display
#"lang.num": Number of supported languages
#"vpp_lic": Vpp Device Based Licensing Enabled


#appleStore_description.csv

#id : App ID
#track_name: Application name
#size_bytes: Memory size (in Bytes)
#app_desc: Application description

In [2]:
#1. Load csv into spark as a text file

 #File uploaded to /FileStore/tables/AppleStore.csv
 #File uploaded to /FileStore/tables/appleStore_description.csv

import sys
import os
from operator import add,mul
from pyspark import SparkContext,SparkConf
from pyspark import SparkFiles


In [3]:
# Derive a display id for this run from the current working directory: the
# title-cased path component that starts with 'driver' (the last such
# component wins if there are several).
cwd = os.getcwd()

# Bind a default first so `user_id` always exists; without it, a working
# directory with no 'driver*' component left the name undefined and both the
# echo below and the following cell failed with NameError.
user_id = 'Unknown'
for part in cwd.split('/'):
  if part.startswith('driver'):
    user_id = part.title()
user_id

In [4]:
# Build the Spark application name by prefixing the fixed module label with
# the `user_id` computed in the previous cell.
app_name='{0} : Module5 Hands-On'.format(user_id)

# Echo the resulting name as the cell output.
app_name

In [5]:
# Spark configuration carrying the application name built above.
conf=SparkConf().setAppName(app_name)

# Bind `sc` explicitly instead of relying on a context injected by the hosting
# environment. getOrCreate() returns the already-running SparkContext when one
# exists (in which case `conf` is not re-applied) and only creates a new one,
# using `conf`, when none is active.
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
# Display the application id of the active SparkContext.
sc.applicationId

In [7]:
# Dataset locations: the main app table and the companion table that holds
# each app's description text.
AppleStore, AppleStore_DESC = (
    '/FileStore/tables/AppleStore.csv',
    '/FileStore/tables/appleStore_description.csv',
)

In [8]:
# 1. Load both CSV files into Spark as plain text (no CSV parsing yet):
#    RDD1 reads the app table, RDD2 reads the description table.
RDD1, RDD2 = (
    sc.textFile(AppleStore),
    sc.textFile(AppleStore_DESC),
)

In [9]:
# Peek at the first few raw lines only. collect() would pull every line of the
# file back to the driver and dump all of them into the cell output; take(5)
# matches the preview used for RDD2 in the next cell.
RDD1.take(5)

In [10]:
# Preview the first 5 raw text lines of the description file.
RDD2.take(5)

In [11]:
import os
from pyspark.sql import SparkSession,SQLContext
from pyspark import SparkContext,SparkConf

In [12]:
#2 Parse the data as csv.

# Bind `spark` explicitly: nothing earlier in the notebook defines it, so the
# reads below only worked when the hosting environment pre-injected a session.
# getOrCreate() returns that existing session when there is one.
spark = SparkSession.builder.getOrCreate()

# header=True takes column names from the first row; inferSchema=True makes
# Spark infer the column types from the data.
AppleStore_Df=spark.read.csv(AppleStore,inferSchema=True,header=True)
AppleStore_desc_Df=spark.read.csv(AppleStore_DESC,inferSchema=True,header=True)
# NOTE(review): app_desc is free text. If descriptions contain quoted line
# breaks, this reader will split them into extra rows unless multiLine=True
# (and a matching escape character) is passed - verify by comparing row counts.

In [13]:
# Preview the parsed app table.
AppleStore_Df.show()

In [14]:
# Preview the parsed description table.
AppleStore_desc_Df.show()

In [15]:
# Print the column names and the types inferred for the app table.
AppleStore_Df.printSchema()

In [16]:
# Join the 2 dataframes.
#
# Left join on `id` so every app row is kept even when it has no description,
# then keep all columns of the app table plus only `app_desc` from the
# description table (its other columns duplicate ones in the app table).
AppleStore_Df_Final = (
    AppleStore_Df
    .join(AppleStore_desc_Df, on=['id'], how='left')
    .select(AppleStore_Df["*"], AppleStore_desc_Df["app_desc"])
)

In [17]:
# Row count after the join; compare with AppleStore_Df.count() to check that
# the left join did not add or duplicate rows.
AppleStore_Df_Final.count()

In [18]:
# Print the schema of the joined table (app columns plus app_desc).
AppleStore_Df_Final.printSchema()

In [20]:
# Row count of the app table before the join, for comparison with the joined
# table's count.
AppleStore_Df.count()

In [21]:
# Show each app's name next to its description from the joined table.
AppleStore_Df_Final.select("track_name", "app_desc").show()

In [22]:
#3 Convert bytes to MB and GB in new columns

# Decimal (SI) units: 1 MB = 1,000,000 bytes and 1 GB = 1,000,000,000 bytes.
#
# Divide by exact powers of ten instead of multiplying by 0.000001 / 0.001:
# those factors cannot be represented exactly as binary floats, so the
# multiplication adds rounding noise, and deriving GB from the already-rounded
# MB value compounds it. Both columns are therefore computed straight from
# size_bytes. Column names are unchanged for downstream cells.
AppleStore_Df = (
    AppleStore_Df
    .withColumn('size_in_MB', AppleStore_Df['size_bytes'] / 1e6)
    .withColumn('size_in_GB', AppleStore_Df['size_bytes'] / 1e9)
)

# Confirm the two new columns are present.
AppleStore_Df.printSchema()


In [23]:
# Compare the raw byte size with the derived MB and GB columns.
AppleStore_Df.select('size_bytes', 'size_in_MB', 'size_in_GB').show()

In [24]:
# Register the app DataFrame (including the size_in_MB / size_in_GB columns
# added earlier) as a temporary view so it can be queried with Spark SQL.
AppleStore_Df.createOrReplaceTempView("AppleStore")

# Read every column back through SQL to confirm the view is usable.
sql_Df=spark.sql('Select * From AppleStore')

# Preview the query result.
sql_Df.show()

In [25]:
#4 List top 10 trending apps

# "Trending" is measured here by the total number of user ratings; the query
# sorts all apps by that count, highest first.
sql_Df=spark.sql('Select id,track_name,rating_count_tot From AppleStore order by rating_count_tot desc')

# show() prints 20 rows by default, which is not the requested top 10, so ask
# for exactly 10. sql_Df itself still holds the full sorted result for the
# cells that follow.
sql_Df.show(10)

#https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/




In [26]:
# Same top-10 listing through the DataFrame API: sort by total rating count in
# descending order and print the first 10 rows.
sql_Df.orderBy('rating_count_tot', ascending=False).show(10)

In [27]:
# Print the schema of the query result (id, track_name, rating_count_tot).
sql_Df.printSchema()

#test.drop('Comb').columns

In [28]:
#5 The difference in the average number of screenshots displayed of highest and lowest rating apps.

In [29]:
#6 What percentage of high rated apps support multiple languages.

In [30]:
#7 How does app details contribute to user ratings?

In [31]:
#8 Compare the statistics of different app groups/genres.

# describe('prime_genre') summarises the genre *labels* themselves (count and
# alphabetical min/max; mean/stddev are null for a string column), which does
# not compare genres with each other. Aggregate per genre instead: number of
# apps, average price, average user rating and average total rating count.
# NOTE(review): assumes the columns `price`, `user_rating` and
# `rating_count_tot` exist as listed in the notebook header - confirm against
# the printed schema.
(
    AppleStore_Df
    .groupBy('prime_genre')
    .agg({
        '*': 'count',
        'price': 'avg',
        'user_rating': 'avg',
        'rating_count_tot': 'avg',
    })
    .orderBy('prime_genre')
    .show(50, truncate=False)  # more rows than show()'s default of 20, so a longer genre list is not cut off
)

In [32]:
#9 Does the length of the app description contribute to the ratings?

In [33]:
#10 Create a spark-submit application for the same analysis and print the findings in the log.