# Guide Project: Profitable App Profiles for the App Store and Google Play Markets

This project is built using the Dataquest introduction to data science module. It analyzes app data from the Apple App Store and Google Play to identify the kinds of apps that are likely to be profitable.

In [2]:
from csv import reader

# Read each CSV completely into a list of rows (each row is a list of strings).
# The `with` blocks close the file handles as soon as the data is in memory;
# newline='' is what the csv module asks for so that it can handle line
# endings inside quoted fields itself.
with open('AppleStore.csv', encoding='utf8', newline='') as apple_opened_file:
    apple_read_file = reader(apple_opened_file)
    apple_app_data = list(apple_read_file)

with open('googleplaystore.csv', encoding='utf8', newline='') as google_opened_file:
    google_read_file = reader(google_opened_file)
    google_app_data = list(google_read_file)

def explore_data(dataset, start, end, rows_and_columns = False):
    """Print a slice of a dataset and, optionally, the dataset's dimensions.

    Args:
        dataset: A sequence of rows (for example a list of lists).
        start: Index of the first row to print (slice start).
        end: Index one past the last row to print (slice end).
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, taken from the first row. An empty dataset
            is reported as having 0 columns instead of raising IndexError.

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # Extra spacing after each row so long rows stay readable

    if rows_and_columns:
        print('Number of rows: ', len(dataset))
        # There is no first row to measure when the dataset is empty.
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns: ', n_columns)

# The first thing we do to clean the data is deal with duplicates. Before
# counting them, drop the one known-bad record at index 10473.
# The column-count check makes this cell safe to re-run: an unconditional `del`
# would remove a further row on every execution.
# NOTE(review): assumes the bad record has a different number of columns than
# the header row -- confirm against the CSV.
BAD_ROW_INDEX = 10473
if (len(google_app_data) > BAD_ROW_INDEX
        and len(google_app_data[BAD_ROW_INDEX]) != len(google_app_data[0])):
    del google_app_data[BAD_ROW_INDEX]

# Below, we see how many duplicates there are in the data.
duplicate_apps = []
unique_apps = []
seen_names = set()  # constant-time membership test; unique_apps keeps first-seen order

for app in google_app_data[1:]:  # row 0 is the header, not an app
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))

Number of duplicate apps: 1181


We will now decide which of the duplicate entries to keep in the dataset. The fourth column (index 3) holds the number of reviews. It is reasonable to assume that, for a given app, the entry with the most reviews is the most recent one,
so we will keep only that entry.

Below, we use a dictionary to store the key (app name) and its maximum number of reviews.

In [3]:
# Map each app name to the largest review count seen among its entries.
reviews_max = {}

for row in google_app_data[1:]:  # skip the header row
    name = row[0]
    n_reviews = float(row[3])

    # Record the count for a first sighting, or when it beats the stored one.
    if (name not in reviews_max) or (reviews_max[name] < n_reviews):
        reviews_max[name] = n_reviews

print('The number of items in the dictionary:', len(reviews_max))

The number of items in the dictionary: 9659


In the block below, we will sweep through the whole dataset and keep only the items that meet the maximum review criterion we specified above.

In [4]:
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded for that app in reviews_max.
android_clean = []
already_added = []

for row in google_app_data[1:]:  # skip the header row
    name = row[0]
    n_reviews = float(row[3])

    # Skip rows that are not the max-review entry, and skip repeats of an app
    # that was already kept (several rows can tie on the maximum).
    if (reviews_max[name] != n_reviews) or (name in already_added):
        continue

    android_clean.append(row)
    already_added.append(name)

print('The length of our new list is:', len(android_clean))


The length of our new list is: 9659


Cleaning the data of non-English Apps:

In [5]:
def english_detector(app_name, max_non_ascii=3):
    """Return True when a name has at most ``max_non_ascii`` non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. Allowing
    a few such characters keeps names that contain an occasional symbol or
    emoji from being rejected.

    Args:
        app_name: The string to inspect.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        bool: True if the count of non-ASCII characters is <= ``max_non_ascii``.
    """
    non_ascii_count = sum(1 for character in app_name if ord(character) > 127)
    return non_ascii_count <= max_non_ascii
    
# Keep only the apps whose name passes english_detector.
# The app name is in column 0 of the Google Play rows and column 1 of the
# App Store rows.
android_english = [row for row in android_clean if english_detector(row[0])]

# apple_app_data is the raw list read from the CSV, so row 0 is skipped here in
# the same way the Google data skips its header; otherwise that row would be
# carried into ios_english as if it were an app.
# NOTE(review): assumes AppleStore.csv starts with a header row -- confirm.
ios_english = [row for row in apple_app_data[1:] if english_detector(row[1])]



Extract the free apps

In [6]:
# Keep the rows whose column 6 holds exactly the string 'Free'.
# NOTE(review): presumably column 6 is the free/paid indicator rather than a
# numeric price -- verify against the header row.
android_free = [row for row in android_english if row[6] == 'Free']
print(len(android_free))

8863


We now want to operationalize the dataset to see how we can determine which apps will bring us the most profit.

With that in mind, we should use 'Rating', 'Reviews', and 'Installs' to gauge which kinds of apps are likely to be successful in the future.


In [38]:
# Build a frequency table (a dictionary) for one column, with values as percentages.
def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: An iterable of rows; each row must support ``row[index]``.
        index: Position of the column to tabulate.

    Returns:
        dict: Maps each distinct value to its share of all rows, as a
        percentage. Keys appear in first-seen order. An empty dataset gives
        an empty dict.
    """
    counts = {}
    n_rows = 0

    for record in dataset:
        n_rows += 1
        value = record[index]
        counts[value] = counts.get(value, 0) + 1

    return {value: (count / n_rows) * 100 for value, count in counts.items()}



# Print one column's frequency table, largest percentage first.
def display_table(dataset, index):
    """Print ``value : percentage`` lines for a column, in descending order.

    Args:
        dataset: An iterable of rows, passed straight to ``freq_table``.
        index: Position of the column to tabulate.

    Returns:
        None. One line per distinct value is written to standard output.
    """
    percentages = freq_table(dataset, index)

    # Pair each entry as (percentage, value) so sorting orders by percentage
    # first and falls back to the value itself on ties.
    ranked = [(share, label) for label, share in percentages.items()]
    ranked.sort(reverse=True)

    for share, label in ranked:
        print(label, ':', share)

# Column index 1 is the 'Category' column in the Google Play header row.
display_table(android_free, 1)

FAMILY : 18.898792733837304
GAME : 9.725826469592688
TOOLS : 8.462146000225657
BUSINESS : 4.592124562789123
LIFESTYLE : 3.9038700214374367
PRODUCTIVITY : 3.8925871601038025
FINANCE : 3.7007785174320205
MEDICAL : 3.5315355974275078
SPORTS : 3.396141261423897
PERSONALIZATION : 3.317161232088458
COMMUNICATION : 3.2381812027530184
HEALTH_AND_FITNESS : 3.0802211440821394
PHOTOGRAPHY : 2.944826808078529
NEWS_AND_MAGAZINES : 2.798149610741284
SOCIAL : 2.6627552747376737
TRAVEL_AND_LOCAL : 2.335552296062281
SHOPPING : 2.245289405393208
BOOKS_AND_REFERENCE : 2.1437436533904997
DATING : 1.8616721200496444
VIDEO_PLAYERS : 1.7939749520478394
MAPS_AND_NAVIGATION : 1.399074805370642
FOOD_AND_DRINK : 1.241114746699763
EDUCATION : 1.1621347173643235
ENTERTAINMENT : 0.9590432133589079
LIBRARIES_AND_DEMO : 0.9364774906916393
AUTO_AND_VEHICLES : 0.9251946293580051
HOUSE_AND_HOME : 0.8236488773552973
WEATHER : 0.8010831546880289
EVENTS : 0.7108202640189552
PARENTING : 0.6544059573507841
ART_AND_DESIGN : 0

In [39]:
# Show the header labels of the Google Play columns at indices 1 and 9.
for column_index in (1, 9):
    print(google_app_data[0][column_index])

Category
Genres
