In [1]:
from pyspark import SparkContext, SparkConf

In [2]:
app_name = 'apple_store'

In [3]:
conf = SparkConf().setAppName(app_name)
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
sc.applicationId

'local-1576431340481'

In [5]:
# load dataset
app_store = sc.textFile('..\pyspark-training\data\AppleStore.csv')
app_store.take(2)

['"","id","track_name","size_bytes","currency","price","rating_count_tot","rating_count_ver","user_rating","user_rating_ver","ver","cont_rating","prime_genre","sup_devices.num","ipadSc_urls.num","lang.num","vpp_lic"',
 '"1","281656475","PAC-MAN Premium",100788224,"USD",3.99,21292,26,4,4.5,"6.3.5","4+","Games",38,5,10,1']

In [6]:
# Parse data
apps = app_store.map(lambda x: x.replace(", ", " ")
                    ).map(lambda x: x.split(',')
                         ).filter(lambda x: x[0] != '""'
                                 ).filter(lambda x: len(x)==17)
apps.take(1)

[['"1"',
  '"281656475"',
  '"PAC-MAN Premium"',
  '100788224',
  '"USD"',
  '3.99',
  '21292',
  '26',
  '4',
  '4.5',
  '"6.3.5"',
  '"4+"',
  '"Games"',
  '38',
  '5',
  '10',
  '1']]

In [15]:
# Convert bytes to MB and GB
size_bytes_mb = apps.map(lambda x: float(x[3])/1000000)
size_bytes_gb = apps.map(lambda x: float(x[3])/1000000000)
print(size_bytes_mb.take(5))
print(size_bytes_gb.take(5))

[100.788224, 158.578688, 100.524032, 128.512, 92.7744]
[0.100788224, 0.158578688, 0.100524032, 0.128512, 0.0927744]


In [8]:
# List Top 10 treding apps -> the apps with most rating_count_tot
trending_apps = apps.sortBy(lambda x: int(x[6]), False).map(lambda cols: cols[2]+": "+cols[6])
trending_apps.take(10)

['"Facebook": 2974676',
 '"Instagram": 2161558',
 '"Clash of Clans": 2130805',
 '"Temple Run": 1724546',
 '"Pandora - Music & Radio": 1126879',
 '"Pinterest": 1061624',
 '"Bible": 985920',
 '"Candy Crush Saga": 961794',
 '"Spotify Music": 878563',
 '"Angry Birds": 824451']

In [9]:
# The difference in the average number of screenshots displayed of highest and lowest rating apps
# Use ipadSc_urls.num
screenshots_highest_rating = apps.filter(lambda x: float(x[8])==5).map(lambda x: int(x[14]))
screenshots_lowest_rating = apps.filter(lambda x: float(x[8])==0).map(lambda x: int(x[14]))
screenshots_highest_rating.mean()- screenshots_lowest_rating.mean()

1.136256756547119

In [10]:
# What percentage of high rated apps support multiple languages
# Use lang.num
high_rating_apps = apps.filter(lambda x: float(x[8])==5)
high_rating_apps_multiple_lang = high_rating_apps.filter(lambda x: int(x[15]) >= 2)
high_rating_apps_multiple_lang.count() / high_rating_apps.count()

0.4745417515274949

In [11]:
from pyspark.mllib.stat import Statistics
import re

In [12]:
# How does app details contribute to user ratings
user_ratings = apps.map(lambda x: float(x[8]))
# Use version as a proxiy for details
version = apps.map(lambda x: int(re.sub('([a-zA-Z])','', x[10]).replace('"','')[:2].replace(".",'')))
#version.collect()
print("Correlation is: " + str(Statistics.corr(version, user_ratings, method="pearson")))

Correlation is: 0.043467893153070036


Number of versions has almost no impact on user rating.

In [13]:
# Compare the statistics of different app groups/genres
# Use prime_genre
genres = apps.map(lambda x: (x[12],1)).keys().distinct()
for genre in genres.collect():
    print(genre, ":")
    print(apps.filter(lambda x: x[12]==genre).map(lambda x: float(x[8])).stats())
#apps.map(lambda x: float(x[8])).stats()

"Food & Drink" :
(count: 62, mean: 3.2338709677419355, stdev: 1.62817367984, max: 5.0, min: 0.0)
"Games" :
(count: 3848, mean: 3.691787941787938, stdev: 1.46076195648, max: 5.0, min: 0.0)
"Shopping" :
(count: 121, mean: 3.5454545454545454, stdev: 1.5130288161, max: 5.0, min: 0.0)
"Business" :
(count: 56, mean: 3.7410714285714297, stdev: 1.25353454362, max: 5.0, min: 0.0)
"Health & Fitness" :
(count: 178, mean: 3.6882022471910108, stdev: 1.53598269375, max: 5.0, min: 0.0)
"Utilities" :
(count: 247, mean: 3.2732793522267207, stdev: 1.47146081774, max: 5.0, min: 0.0)
"Lifestyle" :
(count: 141, mean: 2.7943262411347516, stdev: 1.75651868176, max: 5.0, min: 0.0)
"Productivity" :
(count: 178, mean: 4.005617977528091, stdev: 0.967140991146, max: 5.0, min: 0.0)
"Entertainment" :
(count: 531, mean: 3.240112994350284, stdev: 1.43938127089, max: 5.0, min: 0.0)
"Education" :
(count: 453, mean: 3.3763796909492263, stdev: 1.54160638581, max: 5.0, min: 0.0)
"Music" :
(count: 138, mean: 3.978260869565

In [14]:
# Does length of app Description contribute to the ratings?
# Use track_name
len_name = apps.map(lambda x: len(x[2]))
print("Correlation is: " + str(Statistics.corr(len_name, user_ratings, method="pearson")))

Correlation is: 0.08101484439450309


There's almost no correlation between discription and rating.