# Analysis of Google Play and Apple App store apps

A study to find what kind of apps will bring in more users for the Google Play and Apple App store, specifically in the category of free English apps.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print("\n")
        
    if rows_and_columns:
        print("number of rows:", len(dataset))
        print("Number of columns:", len(dataset[0]))
        print("\n")

In [2]:
def open_dataset(file_name, header=True):
    opened_file = open(file_name)
    from csv import reader
    read_file = reader(opened_file)
    data = list(read_file)
    
    if header:
        return data[0], data[1:]
    else:
        return data
    
apple_app_header,apple_app_data = open_dataset("AppleStore.csv")
google_app_header, google_app_data = open_dataset("googleplaystore.csv")

Checking lists

In [3]:
print(apple_app_header, "\n")
explore_data(apple_app_data, 0, 4, True)

print(google_app_header, "\n")
explore_data(google_app_data, 0, 4, True)


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] 

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


number of rows: 7197
Number of columns: 16


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] 

['Photo 

One row in the Google Play is missing a column of data and should be deleted to clean the data

In [4]:
print("Ex. of a non-erroneous row: \n", google_app_data[10470], "\nLength:", len(google_app_data[10470]), "\n")
print("Erroneous row: \n", google_app_data[10472], "\nLength:", len(google_app_data[10472]))

del google_app_data[10472]

Ex. of a non-erroneous row: 
 ['Jazz Wi-Fi', 'COMMUNICATION', '3.4', '49', '4.0M', '10,000+', 'Free', '0', 'Everyone', 'Communication', 'February 10, 2017', '0.1', '2.3 and up'] 
Length: 13 

Erroneous row: 
 ['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up'] 
Length: 12


Some apps have duplicate entries in the Google Play data. The only difference between the duplicates are the number of reviews, indicating that multiple entries were added at different points in time to the Google Play data. See example below for duplicate `"Facebook"` entries.

In [5]:
for app in google_app_data:
    name = app[0]
    if name == "Facebook":
        print(app, "\n", "Reviews:", app[3], "\n")

['Facebook', 'SOCIAL', '4.1', '78158306', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'August 3, 2018', 'Varies with device', 'Varies with device'] 
 Reviews: 78158306 

['Facebook', 'SOCIAL', '4.1', '78128208', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'August 3, 2018', 'Varies with device', 'Varies with device'] 
 Reviews: 78128208 



Creating a list of apps to see what apps appeared more than once in the Google Play app data. 

In [6]:
unique_apps = []
duplicate_apps = []

for app in google_app_data:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)

print("Number of duplicate apps:", len(duplicate_apps), "\n")
print("Ex. of duplicate apps:", duplicate_apps[:10])

Number of duplicate apps: 1181 

Ex. of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


A dictionary is created to store the highest review count for each app. This can be used to remove duplicate entries that have lower review counts as a lower review count indicates an older duplicate entry. Examples of apps' highest review counts are printed out below.

In [7]:
reviews_max = {}

for app in google_app_data:
    name = app[0]
    n_reviews = float(app[3])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    if name not in reviews_max:
        reviews_max[name] = n_reviews
        
print("Number of apps:", len(reviews_max))
print("Highest number of reviews for Instagram:", reviews_max["Instagram"])
print("Highest number of reviews for Facebook:", reviews_max["Facebook"])

Number of apps: 9659
Highest number of reviews for Instagram: 66577446.0
Highest number of reviews for Facebook: 78158306.0


The Google Play apps data is gone through, row by row. For each row, if an app's review count in the original `google_app_data` list is the same as in the list for highest review count found in `reviews_max` but isn't in the `already_added` list, then add that app's entry (row) to the `google_data_clean` list. We only want to keep entries with the highest review count as that shows that entry is the most recent data. The name of the app for that entry is also added to the `already_added` list to keep track of what apps were already added to the cleaned data in the `google_data_clean` list.

In [8]:
google_data_clean = []
already_added = []

for app in google_app_data:
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in already_added:
        google_data_clean.append(app)
        already_added.append(name)

print(google_data_clean[:5])

[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']]


Only apps in English are wanted for the analysis. To filter out non-English apps, the apps' names are first checked for non-English characters. If there are less than three non-English characters present in an app's name, the app and its info is added to a new list.

In [9]:
def string_check(a_string):
    count = 0
    for character in a_string:
        if ord(character) > 127:
            count += 1
        if count > 3:
            return False
    return True

def non_english_filter(dataset, name_index):
    filtered_dataset = []
    for app in dataset:
        name = app[name_index]
        if string_check(name) == True:
            filtered_dataset.append(app)
    return filtered_dataset

english_google_data = non_english_filter(google_data_clean, 0)
english_apple_data = non_english_filter(apple_app_data, 1)

Only free apps are wanted for the analysis. An app's price is checked for each dataset. If the price is 0 then the app and its info is added to a new list.

In [10]:
def non_free_filter(dataset, price_index):
    filtered_dataset = []
    for app in dataset:
        price = app[price_index]
        if price == "0" or price == "0.0":
            filtered_dataset.append(app)
    return filtered_dataset

free_english_apple_data = non_free_filter(english_apple_data, 4)
free_english_google_data = non_free_filter(english_google_data, 7)

In [11]:
print(apple_app_header)
print(free_english_apple_data[:2])
print("\n")
print(google_app_header)
print(free_english_google_data[:2])

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
[['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'], ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']]


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & 

The apps' genres can be analyzed to determine which are most common, and therefore most popular. A function is written to first count how often a category is present in a dataset of apps. The counts can then be converted to percentage based on the total number of apps in the dataset. Another function is then written to display the percentages in descending order.

In [12]:
def freq_table(dataset, index):
    f_table = {}
    for app in dataset:
        grouping = app[index]
        if grouping in f_table:
            f_table[grouping] += 1
        else:
            f_table[grouping] = 1
            
    for key in f_table:
        f_table[key] /= len(dataset)
        f_table[key] *= 100
        f_table[key] = round(f_table[key], 2)
        
    return f_table
            
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
        
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ":", entry[0])
        
print("Apple's 'prime_genre' grouping")
display_table(free_english_apple_data, 11)
print("\n")
print("Google's 'Category' grouping")
display_table(free_english_google_data, 1)
print("\n")
print("Google's 'Genre' grouping")
display_table(free_english_google_data, 9)

Apple's 'prime_genre' grouping
Games : 58.16
Entertainment : 7.88
Photo & Video : 4.97
Education : 3.66
Social Networking : 3.29
Shopping : 2.61
Utilities : 2.51
Sports : 2.14
Music : 2.05
Health & Fitness : 2.02
Productivity : 1.74
Lifestyle : 1.58
News : 1.33
Travel : 1.24
Finance : 1.12
Weather : 0.87
Food & Drink : 0.81
Reference : 0.56
Business : 0.53
Book : 0.43
Navigation : 0.19
Medical : 0.19
Catalogs : 0.12


Google's 'Category' grouping
FAMILY : 18.91
GAME : 9.72
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.9
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.4
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.34
SHOPPING : 2.25
BOOKS_AND_REFERENCE : 2.14
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.93
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.7

In [13]:
apple_prime_genre = freq_table(free_english_apple_data, 11)

for genre in apple_prime_genre:
    total = 0
    len_genre = 0
    for app in free_english_apple_data:
        genre_app = app[11]
        if genre_app == genre:
            user_ratings_count = float(app[5])
            total += user_ratings_count
            len_genre += 1
    avg_user_ratings = round(total / len_genre, 2)
    print(genre + ":", avg_user_ratings)
    

Social Networking: 71548.35
Photo & Video: 28441.54
Games: 22788.67
Music: 57326.53
Reference: 74942.11
Health & Fitness: 23298.02
Weather: 52279.89
Utilities: 18684.46
Travel: 28243.8
Shopping: 26919.69
News: 21248.02
Navigation: 86090.33
Lifestyle: 16485.76
Entertainment: 14029.83
Food & Drink: 33333.92
Sports: 23008.9
Book: 39758.5
Finance: 31467.94
Education: 7003.98
Productivity: 21028.41
Business: 7491.12
Catalogs: 4004.0
Medical: 612.0


In [20]:
google_category = freq_table(free_english_google_data, 1)

for category in google_category:
    total = 0
    len_category = 0
    for app in free_english_google_data:
        category_app = app[1]
        if category_app == category:
            installs_count = app[5]
            installs_count = installs_count.replace("+", "")
            category = category.replace(",", "")
            total += float(installs_count)
            len_category += 1
    #avg_installs = total / len_category
    #print(category + ":", avg_installs)

ValueError: could not convert string to float: '10,000+'