In [1]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

### Spark session

In [28]:
# Start (or reuse) a Spark session using the local master with the
# "local[*]" setting, registered under the app name "DescriptiveAnalysis".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("DescriptiveAnalysis")
    .getOrCreate()
)

# SparkContext of the session; the cells below use it for the RDD API.
sc = spark.sparkContext

### Read the data 

In [29]:
# Load the CSV file as an RDD of raw text lines.
rdd = sc.textFile('../../Dataset/val-RDD.csv')

# The first line is treated as the header row; keep only the lines that
# differ from it.
header = rdd.first()
rdd = rdd.filter(lambda line: line != header)

# Preview the first five data rows.
rdd.take(5)

['Q ParentConnection,com.AequitasSolutions.ParentPortal,Education,3.1,71.0,10000+,10000.0,36999,True,0.0,USD,24M,4.1 and up,Aequitas Solutions  Inc.,https://www.myaequitas.com,info@myaequitas.com,Jul 3  2019,Jun 11  2020,Everyone,https://www.myaequitas.com/privacy.aspx,True,True,True,2021-06-15 23:58:57',
 'Chef Monkey Pet - Adopt me,chef.monkeypet.co,Adventure,2.4,14.0,500+,500.0,614,True,0.0,USD,40M,4.1 and up,ALPHAEGO,,anasseb63@gmail.com,Apr 6  2021,Apr 06  2021,Everyone,,True,True,True,2021-06-16 06:39:57',
 'SmartConnect Apps,com.logictree.smartconnectapps,Productivity,0.0,0.0,100+,100.0,107,True,0.0,USD,10M,4.1 and up,LogicTree IT Solutions Inc,http://logictreeit.com,logictreeitsolutions@gmail.com,Sep 15  2017,Aug 19  2019,Everyone,https://www.smartconnectapps.com/OP/smartconnectappscom/terms.html,True,True,True,2021-06-16 12:18:18',
 'Team Formula Pro (2021),uk.co.teambobk.f1calendar,Sports,4.7,86.0,1000+,1000.0,1004,True,3.49,USD,9.5M,5.0 and up,TeamBobK,http://www.teambobk.co

# Average Installs: Free vs. Paid Apps

In [4]:
# Tally the value of field 8 (the Free flag) over every non-blank row in a
# single pass instead of scanning the file three times.
# Blank lines are skipped, as in the other cells of this notebook, because
# ''.split(',')[8] would raise IndexError.
free_flag_counts = (
    rdd.filter(lambda line: line != '')
       .map(lambda line: line.split(',')[8])
       .countByValue()          # dict-like: flag value -> number of rows
)

# total_apps counts all non-blank rows, including any whose flag is neither
# 'True' nor 'False'.
total_apps = sum(free_flag_counts.values())
free_apps = free_flag_counts.get('True', 0)
paid_apps = free_flag_counts.get('False', 0)

print(f'Number of Free apps: {free_apps}')
print(f'Number of Paid apps: {paid_apps}')
print(f'Total Number of apps: {total_apps}')

In [5]:
def avg_installs(x):
    """Tag one CSV row as Free/Paid together with its install count.

    Args:
        x: One raw comma-separated line of the dataset.

    Returns:
        A ``(label, installs)`` tuple. ``label`` is ``'Free'`` when field 8
        equals the string ``'True'`` and ``'Paid'`` otherwise; ``installs`` is
        field 7 converted to ``int``.

    Raises:
        IndexError: if the row has fewer than 9 comma-separated fields.
        ValueError: if field 7 is not an integer literal.
    """
    fields = x.split(',')  # split once and reuse
    # Convert to int here: if the value stays a string, a downstream
    # `reduceByKey(lambda a, b: a + b)` concatenates text instead of summing,
    # and dividing the result by a count raises TypeError.
    installations = int(fields[7])
    label = 'Free' if fields[8] == 'True' else 'Paid'
    return (label, installations)

# Average installs for Free vs. Paid apps.
# Every row becomes (label, (installs, 1)) so the install sum and the row
# count are reduced together over exactly the same rows; the average is then
# sum / count. This needs no counters from other cells and cannot divide by
# zero, because a label only exists if at least one row carries it.
# int(...) guards against the install count still being CSV text: adding
# text with `+` would concatenate it instead of summing.
installs_rdd = (
    rdd.filter(lambda line: line != '')                      # skip blank lines
       .map(avg_installs)                                    # (label, installs)
       .mapValues(lambda installs: (int(installs), 1))       # (label, (sum, count))
       .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
       .mapValues(lambda sum_count: sum_count[0] / sum_count[1])
)

# At most two keys exist ('Free' and 'Paid'); show both rather than only the
# first one.
installs_rdd.collect()

# Total Installs for each category (sum of the max-installs column)

In [11]:
# Field 2 of a row is the category, field 7 the max-installs value.
# The values are added up per category, and the categories are then ordered
# from the largest total to the smallest.
max_installs_categ = (
    rdd.filter(lambda line: line != '')
       .map(lambda line: line.split(','))
       .map(lambda fields: (fields[2], int(fields[7])))
       .reduceByKey(lambda left, right: left + right)
       .sortBy(lambda pair: pair[1], ascending=False)
)

# Five categories with the largest totals.
max_installs_categ.take(5)

[('Tools', 18049967388),
 ('Communication', 12411624675),
 ('Social', 8187322038),
 ('Action', 6370229376),
 ('Productivity', 6290110338)]

# Min No. of Installs for each category

In [None]:
# TODO: the per-category minimum is not implemented yet.
# The commented-out draft below does not compute a minimum: it groups
# (row[0], (row[1], row[2])) pairs by key and sums each tuple position.
# It reads `input_data`, which is not defined in the visible cells.
# input_data \
# .map(lambda row: (row[0], (row[1],row[2]))) \
# .groupByKey() \
# .map(lambda row: (
#         row[0],
#         sum([tup[0] for tup in list(row[1])]),
#         sum([tup[1] for tup in list(row[1])])
#     )
# ).collect()

# Developed apps per developer

In [9]:
# Count the rows (apps) per developer; field 13 of a row is used as the
# developer key. The result is ordered from most apps to fewest.
developer_apps = (
    rdd.filter(lambda line: line != '')
       .map(lambda line: (line.split(',')[13], 1))
       .reduceByKey(lambda running, one: running + one)
       .sortBy(lambda entry: entry[1], ascending=False)
)

# Five developers with the most apps.
developer_apps.take(5)

[('Subsplash Inc', 1122),
 ('ChowNow', 1010),
 ('TRAINERIZE', 999),
 ('OrderYOYO', 604),
 ('Phorest', 550)]