#App Ad Analysis
The goal of this project is to identify which free apps/ types of apps will get most user interaction with ads from users. Our aim is to help our developers understand what type of apps are likely to attract more users on Google Play and the App Store

In [10]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))
        
def freq_table(list_of_list, column_index):
    column_iso = []
    category_freq = {}
    category_freq_percentages = {}
    for row in list_of_list[1:]:
        column_iso.append(row[column_index])
    for row in column_iso:
        if row in category_freq:
            category_freq[row] += 1
        else:
            category_freq[row] = 1
    for row in category_freq:
        category_freq_percentages[row] = (category_freq[row] / len(column_iso)) * 100 
    return category_freq_percentages
        
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

def descending_freq_tab(freq_table):
    table_display = []
    for key in freq_table:
        key_val_as_tuple = (freq_table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
        
def inspect_genre(data_set, category, IOS = True):
    if IOS == True:
        column = 11
    else:
        column = 1
    count = 0
    column_list = []
    for row in data_set:
        genre = row[column]
        if genre == str(category):
            column_list.append(row)
            count += 1
    print(f"There are {count} rows of {category} in the dataset")
    for row in column_list:
        print(row)
        print('\n')

In [None]:
open_file = open(r'C:\Users\bbeckenb\OneDrive\Documents\Local Datasets\AppleStore.csv', encoding="utf8")
import csv
from csv import reader
csv_obj = reader(open_file)
apple_apps_data = list(csv_obj)


open_file = open(r'C:\Users\bbeckenb\OneDrive\Documents\Local Datasets\googleplaystore.csv', encoding="utf8")
csv_obj = reader(open_file)
google_apps_data = list(csv_obj)

#explore_data(apple_apps_data, 0, 5)
explore_data(google_apps_data, 0, 2)

erroneous_row = google_apps_data[10473]
erroneous_row.insert(1, 'LIFESTYLE')
#print(erroneous_row)
google_apps_data[10473] = erroneous_row
#explore_data(google_apps_data, 10472, 10478)


I altered the erroneous row in the google data based on user commments on the data seen here: https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015

Now I will further clean the data by eliminating duplicate data as to not skew the results. I have established a unique name list for the apps and a duplicate name list. As the program iterates through the rows, it compares app names from the full data list against the unique names list to see if the element is already listed. If it is, it adds the name to the duplcates list.

Using the length of the original duplicate names list, the initial length of the google apps data, the duplicate count in the delete function, and the final length of the google apps data, we have confirmed we have only deleted duplicate rows in the list.

In [None]:
reviews_max = {}
reviews_max_IOS = {}
for row in google_apps_data[1:]:
    name = row[0]
    reviews = float(row[3])
    if name in reviews_max and reviews_max[name] < reviews:
        reviews_max[name] = reviews
    elif name not in reviews_max:
        reviews_max[name] = reviews

for row in apple_apps_data[1:]:
    name = row[0]
    reviews = float(row[5])
    if name in reviews_max_IOS and reviews_max_IOS[name] < reviews:
        reviews_max_IOS[name] = reviews
    elif name not in reviews_max_IOS:
        reviews_max_IOS[name] = reviews        
print('length of unique Android apps dictionary with the largest volume of reviews per unique app:', len(reviews_max))
print('length of unique IOS apps dictionary with the largest volume of reviews per unique app:', len(reviews_max_IOS))

Now we have isolated a unique apps list with the max volume of reviews for each particular title. We will need to create a 'clean apps google data list to eliminate duplicates. Below I will create that list, then filter by comparing the name plus review number in the overall google apps data to the dicftionary table to ensure we are taking out duplicate titles with less reviews.

In [None]:
google_apps_clean = []
google_apps_duplicate = []
for row in google_apps_data[1:]:
    name = row[0]
    reviews_n = float(row[3])
    if reviews_max[name] == reviews_n and row not in google_apps_clean:
        google_apps_clean.append(row)
    else:
        google_apps_duplicate.append(row)
        
print('Overall google apps data length:', len(google_apps_data))       
print('Clean google apps data length:', len(google_apps_clean))
print('Duplicate google apps data length:', len(google_apps_duplicate))
print('Ensuring the Clean and Duplicate List sum equate to the total google apps data length:', len(google_apps_clean) + len(google_apps_duplicate))

IOS_apps_clean = []
IOS_apps_duplicate = []
for row in apple_apps_data[1:]:
    name = row[0]
    reviews_n = float(row[5])
    if reviews_max_IOS[name] == reviews_n and row not in IOS_apps_clean:
        IOS_apps_clean.append(row)
    else:
        IOS_apps_duplicate.append(row)
print('\n')        
print('Overall IOS apps data length:', len(apple_apps_data))       
print('Clean IOS apps data length:', len(IOS_apps_clean))
print('Duplicate IOS apps data length:', len(IOS_apps_duplicate))
print('Ensuring the Clean and Duplicate List sum equate to the total IOS apps data length:', len(IOS_apps_clean) + len(IOS_apps_duplicate))

Now that I have separated clean data from the duplicates, I will do another pass BECAUSE we only compared the review numbers in the data, there could be duplicate titles with other different elements (Ratings, installs, etc.).

In [None]:
unique_apps = []
duplicate_apps = []
for row in google_apps_clean:
    name = row[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)
#print(len(duplicate_apps))

duplicate_count = 0
for app_name in duplicate_apps:
    name_count = 0
    row_count = 0
    for row in google_apps_clean:
        name = row[0]
        if app_name == name:
            name_count += 1
            if name_count > 1:
                #print(name)
                google_apps_clean.pop(row_count)
                duplicate_count += 1
        row_count += 1

print(f"There were still {duplicate_count} duplicates in our clean Android data list")
print(f"After taking {duplicate_count} duplicates out, we are left with {len(google_apps_clean)} rows in our clean Android data list")

IOS_unique_apps = []
IOS_duplicate_apps = []
for row in IOS_apps_clean:
    name = row[1]
    if name in IOS_unique_apps:
        IOS_duplicate_apps.append(name)
    else:
        IOS_unique_apps.append(name)

IOS_duplicate_count = 0
for app_name in IOS_duplicate_apps:
    name_count = 0
    row_count = 0
    for row in IOS_apps_clean:
        name = row[1]
        if app_name == name:
            name_count += 1
            if name_count > 1:
                IOS_apps_clean.pop(row_count)
                IOS_duplicate_count += 1
        row_count += 1
print('\n')
print(f"There were still {IOS_duplicate_count} duplicates in our clean IOS data list")
print(f"After taking {IOS_duplicate_count} duplicates out, we are left with {len(IOS_apps_clean)} rows in our clean IOS data list")

Because we are targeting ads for US based consumers, we will further filter the cleaned data list by eliminating foreign titles. I will find non-english titles and pull them from our list by taking titles out that have more than 3 non-ASCII characters.

In [None]:
google_apps_foreign = []
row_count = 0
for row in google_apps_clean:
    name = row[0]
    non_english_check = 0
    for i in name:
        if ord(i) > 127:
            non_english_check += 1
    if non_english_check > 2:
        google_apps_foreign.append(row)
        google_apps_clean.pop(row_count)
    row_count += 1
    
print('New length of our clean Android data list is:' , len(google_apps_clean))
print('Length of our non-english Android apps list is:', len(google_apps_foreign))

IOS_apps_foreign = []
row_count = 0
for row in IOS_apps_clean:
    name = row[1]
    non_english_check = 0
    for i in name:
        if ord(i) > 127:
            non_english_check += 1
    if non_english_check > 2:
        IOS_apps_foreign.append(row)
        IOS_apps_clean.pop(row_count)
    row_count += 1

print('\n')
print('New length of our clean IOS data list is:' , len(IOS_apps_clean))
print('Length of our non-english IOS apps list is:', len(IOS_apps_foreign))

At this point in the data cleaning process, we have taken out/ repaired erroneous data, duplicate data, and non-english data. The final piece to obtain our final list is to remove non-free apps.

In [None]:
non_free_apps = []
for row in google_apps_clean:
    row_count = 0
    price = row[7]
    if price[0] == '$':
        price = price[1:]
        row[7] = price
    price = float(price)
    if price > 0:
        non_free_apps.append(row)
        google_apps_clean.pop(row_count)
    row_count += 1
    
print('New length of our clean Android data list is:' , len(google_apps_clean))
print('Length of our non-free Android apps list is:', len(non_free_apps))

IOS_non_free_apps = []
for row in IOS_apps_clean:
    row_count = 0
    price = row[4]
    if price[0] == '$':
        price = price[1:]
        row[4] = price
    price = float(price)
    if price > 0:
        IOS_non_free_apps.append(row)
        IOS_apps_clean.pop(row_count)
    row_count += 1
    
print('\n')
print('New length of our clean IOS data list is:' , len(IOS_apps_clean))
print('Length of our non-free IOS apps list is:', len(IOS_non_free_apps))

To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:

Build a minimal Android version of the app, and add it to Google Play.
If the app has a good response from users, we develop it further.
If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.
Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful on both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful on both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

Let's begin the analysis by getting a sense of what are the most common genres for each market. For this, we'll need to build frequency tables for a few columns in our data sets.

In [None]:
print('Google Apps Category')
display_table(google_apps_clean, 1) 
print('\n'+'Google Apps Genre')
display_table(google_apps_clean, 9)  
print('\n' + 'Apple Apps Prime Genre')
display_table(IOS_apps_clean, 11)


Analyze the frequency table you generated for the prime_genre column of the App Store data set.

What is the most common genre? What is the runner-up? - As we can see, the largest amount of reviews given for the Apple Store Apps is the 'Games' genere, coming in at over 52% with the next closest being education at under 8%.
What is the general impression — are most of the apps designed for practical purposes (education, shopping, utilities, productivity, lifestyle) or more for entertainment (games, photo and video, social networking, sports, music)? - Entertainment
Can you recommend an app profile for the App Store market based on this frequency table alone? If there's a large number of apps for a particular genre, does that also imply that apps of that genre generally have a large number of users? -52% are Games, further investigation into installs needs to be done
Analyze the frequency table you generated for the Category and Genres column of the Google Play data set.

What are the most common genres?
What other patterns do you see? - Tools are the most prevalent at just under 10%. Otherwise, it is a fairly balanced distribution of the other genres.
Compare the patterns you see for the Google Play market with those you saw for the App Store market. - 10% of the apps are of the 'Game' category which is the closest link I see, 20% is in the 'Family' category. More investigation as to what that contains needs to be done.
Can you recommend an app profile based on what you found so far? Do the frequency tables you generated reveal the most frequent app genres or what genres have the most users? - Games is my best guess with the given information. I think install information to give us an idea of how many users represent each category will be helpful in making a decision.

In [None]:
#inspect_genre(IOS_apps_clean, 'Entertainment', IOS = True)
#inspect_genre(google_apps_clean, 'FAMILY', IOS = False)

google_installs_by_cat = {}
google_cat_count = {}
for row in google_apps_clean:
    genre = row[1]
    installs = row[5]
    installs = installs.replace('+', '')
    installs = installs.replace(',', '')
    installs = float(installs)
    if genre in google_cat_count:
        google_cat_count[genre] += 1
    else:
        google_cat_count[genre] = 1
    if genre in google_installs_by_cat:
        google_installs_by_cat[genre] += installs
    else:
        google_installs_by_cat[genre] = installs

google_installs_by_cat_avg = {}
for genre in google_installs_by_cat:
    google_installs_by_cat_avg[genre] = google_installs_by_cat[genre] / google_cat_count[genre]
    #print(genre, google_installs_by_genre_avg[genre])
print('Google Apps count of categore to compare to google installs\n')
descending_freq_tab(google_installs_by_cat_avg)

google_installs_by_genre = {}
google_genre_count = {}
for row in google_apps_clean:
    genre = row[9]
    installs = row[5]
    installs = installs.replace('+', '')
    installs = installs.replace(',', '')
    installs = float(installs)
    if genre in google_genre_count:
        google_genre_count[genre] += 1
    else:
        google_genre_count[genre] = 1
    if genre in google_installs_by_genre:
        google_installs_by_genre[genre] += installs
    else:
        google_installs_by_genre[genre] = installs

google_installs_by_genre_avg = {}
for genre in google_installs_by_genre:
    google_installs_by_genre_avg[genre] = google_installs_by_genre[genre] / google_genre_count[genre]
    #print(genre, google_installs_by_genre_avg[genre])
print('\nGoogle Apps count of genre to compare to google installs\n')
descending_freq_tab(google_installs_by_genre_avg)

IOS_rating_count_by_genre = {}
IOS_genre_count = {}
for row in IOS_apps_clean:
    genre = row[11]
    rating_count = float(row[5])
    if genre in IOS_genre_count:
        IOS_genre_count[genre] += 1
    else:
        IOS_genre_count[genre] = 1
    if genre in IOS_rating_count_by_genre:
        IOS_rating_count_by_genre[genre] += rating_count
    else:
        IOS_rating_count_by_genre[genre] = rating_count

IOS_rating_count_by_genre_avg = {}
for genre in IOS_rating_count_by_genre:
    IOS_rating_count_by_genre_avg[genre] = IOS_rating_count_by_genre[genre] /  IOS_genre_count[genre]
print('\nApple Apps count of ratings as proxy to compare to google installs\n')
descending_freq_tab(IOS_rating_count_by_genre_avg)   

Given the data, 10% of the most popular Google Apps are games with Puzzle;Action & Adventure and Arcade having an average install base of >20,000,000 installs. For Apple, >52% of the most popular apps are of the game genre which have an average install number of 367 for a data list with 4467 rows, >2000 of them are games giving us around 734000 game installs in total. This is the clearest bridge between the two platforms to develop a free app on Android and port over to Apple if/when success criteria is met by Google users.