# Guide Project: Profitable App Profiles for the App Store and Google Play Markets

This project is built using the Dataquest introduction to data science module. It analyzes data from an App store.

In [24]:
apple_opened_file = open('AppleStore.csv', encoding='utf8')
google_opened_file = open('googleplaystore.csv', encoding='utf8')
from csv import reader
apple_read_file = reader(apple_opened_file)
apple_app_data = list(apple_read_file)
google_read_file = reader(google_opened_file)
google_app_data = list(google_read_file)

def explore_data(dataset, start, end, rows_and_columns = False):
    dataset_slice = dataset[start:end]
    
    for row in dataset_slice:
        print(row)
        print('\n') # Adds an empty line after each row
        
    if rows_and_columns:
        print('Number of rows: ', len(dataset))
        print('Number of columns: ', len(dataset[0]))

# The first thing we do to clean the data is remove duplicates. Below, we see how many duplicates there are in the data
duplicate_apps = []
unique_apps = []

for app in google_app_data:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)
        
del google_app_data[10473]
print('Number of Android apps:', len(google_app_data))
print('Number of Android duplicate apps:', len(duplicate_apps))

duplicate_apple_apps = []
unique_apple_apps = []

for app in apple_app_data:
    name = app[2]
    if name in unique_apple_apps:
        duplicate_apple_apps.append(name)
    else:
        unique_apple_apps.append(name)
        
print('Number of Apple apps:', len(apple_app_data))        
print('Number of Apple duplicate apps:', len(duplicate_apple_apps))


Number of Android apps: 10841
Number of Android duplicate apps: 1181
Number of Apple apps: 7198
Number of Apple duplicate apps: 2


We will now strategically pick which of the duplicates we would like to keep in the dataset. The fourth column shows the number of ratings. It is safe to say that the entry for a given app with the most ratings is the most recent, 
so we will just keep that one.

Below, we use a dictionary to store the key (app name) and its maximum number of reviews.

In [25]:
reviews_max_android = {}

# Android data set
for row in google_app_data[1:]:
    name = row[0]
    n_reviews = float(row[3])
    
    if (name in reviews_max_android) and (reviews_max_android[name] < n_reviews):
        reviews_max_android[name] = n_reviews
        
    if name not in reviews_max_android:
        reviews_max_android[name] = n_reviews
        
print('The number of items in the dictionary:', len(reviews_max_android))

# IOS data set

reviews_max_apple = {}

for row in apple_app_data[1:]:
    name = row[2]
    n_reviews = float(row[6])
    
    if (name in reviews_max_apple) and (reviews_max_apple[name] < n_reviews):
        reviews_max_apple[name] = n_reviews
        
    if name not in reviews_max_apple:
        reviews_max_apple[name] = n_reviews
        
print('The number of items in the Apple dictionary:', len(reviews_max_apple))


The number of items in the dictionary: 9659
The number of items in the Apple dictionary: 7195


In the block below, we will sweep through the whole dataset and keep only the items that meet the maximum review criterion we specified above.

In [27]:
android_clean = []
already_added = []

for row in google_app_data[1:]:
    name = row[0]
    n_reviews = float(row[3])
    
    if (reviews_max_android[name] == n_reviews) and (name not in already_added):
        android_clean.append(row)
        already_added.append(name)
        
print('The length of our new list Anroid is:', len(android_clean))

apple_clean = []
already_added_apple = []

for row in apple_app_data[1:]:
    name = row[2]
    n_reviews = float(row[6])
    
    if (reviews_max_apple[name] == n_reviews) and (name not in already_added_apple):
        apple_clean.append(row)
        already_added_apple.append(name)

print('The length of our new Apple list is:', len(apple_clean))

The length of our new list Anroid is: 9659
The length of our new Apple list is: 7195


Removing the data of non-English Apps:

In [61]:
def english_detector(app_name):
    counter = 0
    for character in app_name:
        if ord(character) > 127:
            counter +=1
        
    if counter <= 3:
        return True
    else:
        return False      
    
android_english = []
ios_english = []
    
for row in android_clean:
    app_name = row[0]
    if english_detector(app_name):
        android_english.append(row)

for row in apple_app_data:
    app_name = row[2]
    if english_detector(app_name):
        ios_english.append(row)    

        

print('Number of English Android apps: ', len(android_english))
print('Number of iOS Android apps: ', len(ios_english))

print(explore_data(google_app_data,0,1))
print(explore_data(apple_app_data,0,1))

print(google_app_data[0][5])


Number of English Android apps:  9614
Number of iOS Android apps:  6184
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


None
['', 'id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


None
Installs


Extract the free apps

In [39]:
android_free = []

for row in android_english:
    app_price = row[6]
    if app_price == 'Free':
        android_free.append(row)    

apple_free = []

for row in ios_english:
    app_price = row[5]
    if app_price == '0':
        apple_free.append(row)
        
print('Number of free/English Android apps: ', len(android_free))
print('Number of free/English iOS apps: ', len(apple_free))
    

Number of free/English Android apps:  8863
Number of free/English iOS apps:  3222


We now want to operationalize the dataset to see how we can determine which apps will be bring us the most profit.

With that in mind, we should use 'Rating', 'Reviews', and 'Installs' to see which will be successful in the future.


In [6]:
# Create a frequency table in the form of a dictionary. Return items as percentages.
def freq_table(dataset, index):
    frequency_table = {}
    total = 0
    
    for row in dataset:
        total += 1
        data_item = row[index]
        if data_item in frequency_table:
            frequency_table[data_item] += 1
        else:
            frequency_table[data_item] = 1
    
    percentage_table = {}
    for key in frequency_table:
        percentage = (frequency_table[key]/total) * 100
        percentage_table[key] = percentage
    
    return percentage_table



# Create a display table in the form of a tuple from the frequency dictionary above
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])


# Android App Analysis

In [7]:
print("Genre frequency table:\n")
display_table(android_free, 1)

Genre frequency table:

FAMILY : 18.898792733837304
GAME : 9.725826469592688
TOOLS : 8.462146000225657
BUSINESS : 4.592124562789123
LIFESTYLE : 3.9038700214374367
PRODUCTIVITY : 3.8925871601038025
FINANCE : 3.7007785174320205
MEDICAL : 3.5315355974275078
SPORTS : 3.396141261423897
PERSONALIZATION : 3.317161232088458
COMMUNICATION : 3.2381812027530184
HEALTH_AND_FITNESS : 3.0802211440821394
PHOTOGRAPHY : 2.944826808078529
NEWS_AND_MAGAZINES : 2.798149610741284
SOCIAL : 2.6627552747376737
TRAVEL_AND_LOCAL : 2.335552296062281
SHOPPING : 2.245289405393208
BOOKS_AND_REFERENCE : 2.1437436533904997
DATING : 1.8616721200496444
VIDEO_PLAYERS : 1.7939749520478394
MAPS_AND_NAVIGATION : 1.399074805370642
FOOD_AND_DRINK : 1.241114746699763
EDUCATION : 1.1621347173643235
ENTERTAINMENT : 0.9590432133589079
LIBRARIES_AND_DEMO : 0.9364774906916393
AUTO_AND_VEHICLES : 0.9251946293580051
HOUSE_AND_HOME : 0.8236488773552973
WEATHER : 0.8010831546880289
EVENTS : 0.7108202640189552
PARENTING : 0.65440595735

In [8]:
print("Category frequency table:\n")
display_table(android_free, 9)

Category frequency table:

Tools : 8.450863138892023
Entertainment : 6.070179397495204
Education : 5.348076272142616
Business : 4.592124562789123
Productivity : 3.8925871601038025
Lifestyle : 3.8925871601038025
Finance : 3.7007785174320205
Medical : 3.5315355974275078
Sports : 3.463838429425702
Personalization : 3.317161232088458
Communication : 3.2381812027530184
Action : 3.102786866749408
Health & Fitness : 3.0802211440821394
Photography : 2.944826808078529
News & Magazines : 2.798149610741284
Social : 2.6627552747376737
Travel & Local : 2.324269434728647
Shopping : 2.245289405393208
Books & Reference : 2.1437436533904997
Simulation : 2.042197901387792
Dating : 1.8616721200496444
Arcade : 1.8503892587160102
Video Players & Editors : 1.771409229380571
Casual : 1.7601263680469368
Maps & Navigation : 1.399074805370642
Food & Drink : 1.241114746699763
Puzzle : 1.128286133363421
Racing : 0.9928917973598104
Role Playing : 0.9364774906916393
Libraries & Demo : 0.9364774906916393
Auto & Vehi

What are the most common genres?
* Family
* Game
* Tools

What are the most common categories?
* Tools
* Entertainment
* Education

# Apple App Analysis

In [43]:
print("Genre frequency table:\n")

display_table(apple_free, 12)

Genre frequency table:

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


The most common genre in the Apple store is Games by a long-shot. 58% of the Apps are games, compared to about 10% in the Google Play Store

Now, we will isolate each genre of the data sets, and see which apps are used the most. We will use "Installs" in the Play Store, and "rating_count_tot" in the Apple Store

### Apple Store:

In [60]:
apple_genre = freq_table(apple_free, 12)

for genre in apple_genre:
    total = 0
    len_genre = 0
    
    for row in apple_free:     
        app_genre = row[12]
        
        if app_genre == genre:
            ratings = float(row[6])
            total += ratings
            len_genre += 1
            
    avg_rating_num = total / len_genre
    print(genre,':',avg_rating_num)
    

Productivity : 21028.410714285714
Weather : 52279.892857142855
Shopping : 26919.690476190477
Reference : 74942.11111111111
Finance : 31467.944444444445
Music : 57326.530303030304
Utilities : 18684.456790123455
Travel : 28243.8
Social Networking : 71548.34905660378
Sports : 23008.898550724636
Health & Fitness : 23298.015384615384
Games : 22788.6696905016
Food & Drink : 33333.92307692308
News : 21248.023255813954
Book : 39758.5
Photo & Video : 28441.54375
Entertainment : 14029.830708661417
Business : 7491.117647058823
Lifestyle : 16485.764705882353
Education : 7003.983050847458
Navigation : 86090.33333333333
Medical : 612.0
Catalogs : 4004.0


So, it looks like Navigation has the most users, followed by Social Networking. These values are skewed by hugely popular apps like Google Maps and Facebook.

## Anroid:

Install numbers are less precise in the Google data set:


In [64]:
display_table(android_free, 5)

1,000,000+ : 15.728308699086089
100,000+ : 11.55365000564143
10,000,000+ : 10.549475346947986
10,000+ : 10.199706645605326
1,000+ : 8.394448832223853
100+ : 6.916393997517771
5,000,000+ : 6.826131106848697
500,000+ : 5.562450637481666
50,000+ : 4.772650344127271
5,000+ : 4.513144533453684
10+ : 3.542818458761142
500+ : 3.2494640640866526
50,000,000+ : 2.3017037120613786
100,000,000+ : 2.1324607920568655
50+ : 1.9180864267178157
5+ : 0.7898002933543946
1+ : 0.5077287600135394
500,000,000+ : 0.270788672007221
1,000,000,000+ : 0.2256572266726842
0+ : 0.045131445334536835


In [70]:
# Fix the install number data above by removing commas and plus signs

android_category = freq_table(android_free, 1)

for category in android_category:
    total = 0
    len_category = 0
    
    for row in android_free:
        category_app = row[1]
        
        if category_app == category:
            installs = row[5]
            installs = installs.replace('+','')
            installs = installs.replace(',','')

            total += float(installs)
            len_category += 1
            
    avg_installs_num = total / len_category
    print(category,':',avg_installs_num)    

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
COMICS : 817657.2727272727
COMMUNICATION : 38456119.167247385
DATING : 854028.8303030303
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11640705.88235294
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4188821.9853479853
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 15588015.603248259
FAMILY : 3697848.1731343283
MEDICAL : 120550.61980830671
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
TRAVEL_AND_LOCAL : 13984077.710144928
TOOLS : 10801391.298666667
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16787331.344927534
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
VIDEO_PLAYERS : 24727872.452830188
NEWS_AND_

Again, the numbers are skewed my monstor apps like WhatsApp, which leak to communications apps having more than 35 million installs.

Making a small change to test Git.