<h1>Profitable App Profiles for the App Store and Google Play Markets</h1>

Our goal is to help our developers understand what type of apps are likely to attract more users on Google Play and the App Store. To do this, we'll need to collect and analyze data about mobile apps available on Google Play and the App Store.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [2]:
from csv import reader
opened_ios = open("AppleStore.csv", encoding="utf8")
read_ios = reader(opened_ios)
ios_data = list(read_ios)
ios_header = ios_data[0]
ios_data = ios_data[1:]

opened_android = open("googleplaystore.csv", encoding="utf8")
read_android = reader(opened_android)
android_data = list(read_android)
android_header = android_data[0]
android_data = android_data[1:]

In [3]:
print(ios_header)
print("\n")
explore_data(ios_data, 0, 3, True)
print(android_header)
print("\n")
explore_data(android_data, 0, 3, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'J

In [4]:
print(android_data[10472]) # this row is incorrect, category column is missing
print("\n")
print(android_header) # how should it be
print("\n")
print(android_data[0])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


In [5]:
print(len(android_data))

del android_data[10472]

print(len(android_data))

10841
10840


In [6]:
dublicate_android = []
unique_android = []

for app in android_data:
    name = app[0]
    
    if name in unique_android:
        dublicate_android.append(name)
    else:
        unique_android.append(name)

In [7]:
print(dublicate_android[:5])
print("\n")
print(len(dublicate_android))

['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']


1181


We don't want to remove the dublicate datas randomly. 

We will keep he with highest number of reviews because it is the most recent one.

In [8]:
reviews_max = {}

for app in android_data:
    name = app[0]
    
    n_reviews = float(app[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
        
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

In [9]:
print('Expected length:', len(android_data) - 1181)
print('Actual length:', len(reviews_max))

Expected length: 9659
Actual length: 9659


In [10]:
android_clean = []

already_added = []

for app in android_data:
    name = app[0]
    n_reviews = float(app[3])
    
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(app)
        already_added.append(name)
        


In [11]:
explore_data(android_clean, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


We have correct number of rows.

In [12]:
def isEnglish(string):
    non_ascii = 0
    for char in string:
        if ord(char) > 127:
            non_ascii += 1
    
    if non_ascii > 3:
        return False
    else:
        return True

In [13]:
print(isEnglish('Docs To Go™ Free Office Suite'))
print(isEnglish('Instachat 😜'))
print(isEnglish('爱奇艺PPS -《欢乐颂2》电视剧热播'))

print(ord('™'))
print(ord('😜'))

True
True
False
8482
128540


In [14]:
android_english = []
ios_english = []

for app in android_clean:
    name = app[0]
    
    if isEnglish(name):
        android_english.append(app)
        
for app in ios_data:
    name = app[1]
    
    if isEnglish(name):
        ios_english.append(app)
        
explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 

We only build free apps as we make the money only from in-app ads. Therefore, we need isolate the free apps from non-free apps.

In [15]:
android_free = []
ios_free = []

for app in android_english:
    price = app[7]
        
    if price == "0":
        android_free.append(app)
        
for app in ios_english:
    price = app[4]
     
    if price == "0.0":
        ios_free.append(app)
     
    
print("\n")
print(len(android_free))
print(len(ios_free))




8864
3222


1. Our goal is to build a simple android app, and upload it to Google Play.
2. If the app gets a good response from users, we will develop it further.
3  If the app becomes becomes profitable after several months, we build an IOS version of it and upload it to App Store.

In [16]:
def freq_table(dataset, index):
    table = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
    
    table_percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages


def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
        
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [17]:
display_table(ios_english, -5)

Games : 54.860100274947435
Entertainment : 7.261846999838266
Education : 6.6310852337053205
Photo & Video : 5.515122109008572
Utilities : 3.4449296458030085
Productivity : 2.7171276079573023
Health & Fitness : 2.6686074721009216
Music : 2.215752870774705
Social Networking : 2.037845705967977
Sports : 1.6820313763545207
Lifestyle : 1.6011644832605532
Shopping : 1.3747371825974446
Weather : 1.1159631246967492
Travel : 0.9704027171276078
News : 0.9218825812712276
Book : 0.8895358240336406
Reference : 0.8571890667960537
Business : 0.8571890667960537
Finance : 0.7924955523208799
Food & Drink : 0.7116286592269124
Navigation : 0.452854601326217
Medical : 0.3396409509946628
Catalogs : 0.08086689309396733


We can see that among the free English apps, almost 60% are games. Therefore, we can say that App Store is mostly populated by apps for fun.

In [18]:
display_table(android_english, -4) # Category

Tools : 8.602038693571874
Entertainment : 5.793634283336801
Education : 5.231953401289786
Business : 4.358227584772207
Medical : 4.108591637195756
Personalization : 3.900561680882047
Productivity : 3.879758685250676
Lifestyle : 3.775743707093822
Finance : 3.588516746411483
Sports : 3.442895776991887
Communication : 3.2660703141252343
Action : 3.110047846889952
Health & Fitness : 2.995631370917412
Photography : 2.9124193883919283
News & Magazines : 2.600374453921365
Social : 2.485957977948825
Travel & Local : 2.26752652381943
Books & Reference : 2.26752652381943
Shopping : 2.090701060952777
Simulation : 1.9762845849802373
Arcade : 1.9138755980861244
Dating : 1.768254628666528
Casual : 1.7162471395881007
Video Players & Editors : 1.674641148325359
Maps & Navigation : 1.3417932182234242
Puzzle : 1.2377782400665696
Food & Drink : 1.1649677553567712
Role Playing : 1.0817557728312877
Strategy : 0.9777407946744331
Racing : 0.9465363012273768
Libraries & Demo : 0.8737258165175785
Auto & Vehicl

Unlike App Store apps, the situation seems different on Google Play, which is mostly populated by apps for general purpose(family, tools, business)

In [19]:
display_table(android_english, -4)

Tools : 8.602038693571874
Entertainment : 5.793634283336801
Education : 5.231953401289786
Business : 4.358227584772207
Medical : 4.108591637195756
Personalization : 3.900561680882047
Productivity : 3.879758685250676
Lifestyle : 3.775743707093822
Finance : 3.588516746411483
Sports : 3.442895776991887
Communication : 3.2660703141252343
Action : 3.110047846889952
Health & Fitness : 2.995631370917412
Photography : 2.9124193883919283
News & Magazines : 2.600374453921365
Social : 2.485957977948825
Travel & Local : 2.26752652381943
Books & Reference : 2.26752652381943
Shopping : 2.090701060952777
Simulation : 1.9762845849802373
Arcade : 1.9138755980861244
Dating : 1.768254628666528
Casual : 1.7162471395881007
Video Players & Editors : 1.674641148325359
Maps & Navigation : 1.3417932182234242
Puzzle : 1.2377782400665696
Food & Drink : 1.1649677553567712
Role Playing : 1.0817557728312877
Strategy : 0.9777407946744331
Racing : 0.9465363012273768
Libraries & Demo : 0.8737258165175785
Auto & Vehicl

In [20]:
genres_ios = freq_table(ios_free, -5)

for genre in genres_ios:
    total = 0
    len_genre = 0
    for app in ios_free:
        genre_app = app[-5]
        if genre_app == genre:            
            n_ratings = float(app[5])
            total += n_ratings
            len_genre += 1
    avg_n_ratings = total / len_genre
    print(genre, ':', avg_n_ratings)
        

Health & Fitness : 23298.015384615384
Entertainment : 14029.830708661417
Travel : 28243.8
Productivity : 21028.410714285714
News : 21248.023255813954
Photo & Video : 28441.54375
Weather : 52279.892857142855
Navigation : 86090.33333333333
Music : 57326.530303030304
Book : 39758.5
Reference : 74942.11111111111
Sports : 23008.898550724636
Food & Drink : 33333.92307692308
Education : 7003.983050847458
Medical : 612.0
Utilities : 18684.456790123455
Business : 7491.117647058823
Lifestyle : 16485.764705882353
Games : 22788.6696905016
Catalogs : 4004.0
Shopping : 26919.690476190477
Finance : 31467.944444444445
Social Networking : 71548.34905660378


When we check the data above, navigation apps have the highest number of ratings that is followed by reference apps. 

In [27]:
categories_android = freq_table(android_free, 1)

for category in categories_android:
    total = 0
    len_category = 0
    for app in android_free:
        category_app = app[1]
        if category_app == category:
            n_installs = app[5]
            n_installs = n_installs.replace("+", "")
            n_installs = n_installs.replace(",", "")
            total += float(n_installs)
            len_category += 1
            
    avg_n_installs = total / len_category
    print(category, ':', avg_n_installs)

ENTERTAINMENT : 11640705.88235294
MAPS_AND_NAVIGATION : 4056941.7741935486
TRAVEL_AND_LOCAL : 13984077.710144928
EDUCATION : 1833495.145631068
BEAUTY : 513151.88679245283
GAME : 15588015.603248259
COMMUNICATION : 38456119.167247385
BOOKS_AND_REFERENCE : 8767811.894736841
SPORTS : 3638640.1428571427
WEATHER : 5074486.197183099
ART_AND_DESIGN : 1986335.0877192982
HEALTH_AND_FITNESS : 4188821.9853479853
BUSINESS : 1712290.1474201474
SHOPPING : 7036877.311557789
VIDEO_PLAYERS : 24727872.452830188
AUTO_AND_VEHICLES : 647317.8170731707
LIBRARIES_AND_DEMO : 638503.734939759
NEWS_AND_MAGAZINES : 9549178.467741935
PRODUCTIVITY : 16787331.344927534
LIFESTYLE : 1437816.2687861272
PHOTOGRAPHY : 17840110.40229885
DATING : 854028.8303030303
SOCIAL : 23253652.127118643
FINANCE : 1387692.475609756
EVENTS : 253542.22222222222
MEDICAL : 120550.61980830671
TOOLS : 10801391.298666667
FAMILY : 3695641.8198090694
PARENTING : 542603.6206896552
PERSONALIZATION : 5201482.6122448975
FOOD_AND_DRINK : 1924897.736

Gaming