# Analyzing ad selection

Ads that run in free Android and iOS mobile apps available on Google Play and the App Store are our main source of revenue. To help our developers improve the customer experience and increase the company's revenue, we analyzed what kind of content our customers prefer.

## Downloading The Data
The Google Play data set contains approximately 10,000 Android apps; the data was collected in August 2018. You can download the data set [here](https://dq-content.s3.amazonaws.com/350/googleplaystore.csv).

The App Store data set contains approximately 7,000 iOS apps; the data was collected in July 2017. You can download the data set [here](https://dq-content.s3.amazonaws.com/350/AppleStore.csv).

## Import and Explore Data

In [2]:
#import function
from csv import reader
import pandas as pd
#define function
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set, optionally followed by its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a sequence.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the number of rows and the
            number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # leaves blank space after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure, so report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)

In [5]:
# Open and explore the Google Play data set.
# The `with` block closes the file once it has been read, and newline=''
# lets the csv module handle line endings inside quoted fields itself.
with open('googleplaystore.csv', encoding="utf8", newline='') as opened_file:
    read_file = reader(opened_file)
    google = list(read_file)
google_header = google[0]  # first row holds the column names
google = google[1:]        # remaining rows are the apps

explore_data(google, 0, 4, True)

FileNotFoundError: [Errno 2] No such file or directory: 'googleplaystore.csv'

In [4]:
# Open and explore the App Store data set.
# The `with` block closes the file once it has been read, and newline=''
# lets the csv module handle line endings inside quoted fields itself.
with open('AppleStore.csv', encoding="utf8", newline='') as opened_file:
    read_file = reader(opened_file)
    apple = list(read_file)
apple_header = apple[0]  # first row holds the column names
apple = apple[1:]        # remaining rows are the apps

explore_data(apple, 0, 4, True)

FileNotFoundError: [Errno 2] No such file or directory: 'AppleStore.csv'

In [5]:
# Inspect the column names of both data sets to identify which columns matter
# most for the analysis. Both header lists are created by the loading cells.
print(google_header)
print(apple_header)

NameError: name 'google_header' is not defined

## Data Preparation

In [None]:
# Detect rows whose length differs from the header row.
# If every row matches, nothing is printed and nothing is removed.
header_length = len(google_header)
malformed_indices = []
for index, row in enumerate(google):
    if len(row) != header_length:
        print(row)
        print(index)
        malformed_indices.append(index)

# Delete the malformed rows found above rather than a hard-coded position, so
# re-running this cell can never remove a valid row. Deleting from the end
# keeps the indices that are still pending valid.
for index in reversed(malformed_indices):
    del google[index]

In [None]:
# Detect rows whose length differs from the header row.
# If every row matches, there is no output.
header_length = len(apple_header)
for index, row in enumerate(apple):
    if len(row) != header_length:
        print(row)
        # enumerate gives this row's own position; list.index would report
        # the first equal row instead and rescan the list on every hit
        print(index)

In [None]:
# Detect duplicate app names (column 0) in the Google Play data set
google_duplicate_apps = []
google_duplicate_indices = []  # never filled in this cell; kept so the name stays defined
google_unique_apps = []
seen_names = set()  # mirrors google_unique_apps for O(1) membership checks

for row in google:
    name = row[0]
    if name in seen_names:
        google_duplicate_apps.append(name)
    else:
        seen_names.add(name)
        google_unique_apps.append(name)

print('Number of duplicate apps:', len(google_duplicate_apps))

In [None]:
# Record, for every app name in the Google Play data set, the highest number
# of reviews found among its rows. Nothing is removed from the data set here.
reviews_max = {}

for row in google:
    name = row[0]
    n_reviews = float(row[3])

    # Store the first value seen for a name, then only replace it with a larger one
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

# reviews_max has one entry per distinct app name, so its length is the number
# of unique apps (not the number of duplicates)
print('Number of unique apps:', len(reviews_max))

First, two empty collections are created: `google_clean` and `already_added`. The for loop below goes through the Google data set; for each app it appends the row with the highest number of reviews to `google_clean` and records the app's name in `already_added`, so that each app is added only once.

In [None]:

# Keep one row per app: the row whose review count equals the maximum
# recorded for that app name in reviews_max
google_clean = []
already_added = []
added_names = set()  # mirrors already_added for O(1) membership checks

for row in google:
    app_name = row[0]
    n_reviews = float(row[3])

    # The added_names check keeps only the first row when several duplicates
    # share the same maximum review count.
    if reviews_max[app_name] == n_reviews and app_name not in added_names:
        google_clean.append(row)
        already_added.append(app_name)
        added_names.add(app_name)

# google_clean holds the rows that remain, so this is not a count of duplicates
print('Number of apps after removing duplicates:', len(google_clean))

In [None]:
# Detect duplicate entries in the App Store data set.
# NOTE(review): the key is column 0, while the English filter further down
# reads the app name from column 1 - confirm which column is meant here.
duplicate_apps = []
unique_apps = []
seen_keys = set()  # mirrors unique_apps for O(1) membership checks

for app in apple:
    name = app[0]
    if name in seen_keys:
        duplicate_apps.append(name)
    else:
        seen_keys.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))

Our company wants to focus on English-only apps. Let's detect and eliminate non-English apps in both data sets.

Characters commonly used in English text fall in the ASCII range, so their code points are at most 127. The `is_english` function tells the user whether a string is English by returning True, or non-English by returning False.

Some examples are included to test the function.

In [None]:
def is_english(string):
    """Return True if every character of *string* is ASCII, otherwise False.

    A character counts as non-English when its code point is above 127.
    An empty string contains no such character and therefore returns True.
    """
    # Only answer True once every character has been checked; returning from
    # inside the loop would judge the string by its first character alone.
    for character in string:
        if ord(character) > 127:
            return False
    return True
    
# Try the function on a few sample app names
for sample_name in (
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
    '😜',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
):
    print(is_english(sample_name))

The function is not evaluating emojis correctly so improvements to the function are written below. The previous examples are used to test the function.

In [None]:
def is_english(string):
    """Return False if *string* has more than three non-ASCII characters.

    Up to three characters with a code point above 127 (for example an emoji
    or a trademark sign) are tolerated, so such names still count as English.
    An empty string returns True.
    """
    non_ascii = 0  # counted across the whole string, so set up before the loop
    for character in string:
        if ord(character) > 127:
            non_ascii += 1
            if non_ascii > 3:
                return False
    # Reached only when the whole string stayed within the allowance
    return True
    
# Re-run the same sample app names against the revised function
for sample_name in (
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
    '😜',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
):
    print(is_english(sample_name))

The `is_english` function is now used to keep only the English apps in each data set. Each filtered data set is then explored.

In [None]:
# Keep the de-duplicated Google Play rows whose app name (column 0) passes is_english
google_english_apps = [
    row for row in google_clean if is_english(row[0])
]

explore_data(google_english_apps, 0, 4, True)

In [None]:
# Keep the App Store rows whose app name (column 1) passes is_english
apple_english_apps = [
    row for row in apple if is_english(row[1])
]

explore_data(apple_english_apps, 0, 4, True)

Since the company is only interested in free apps, isolate the free apps from both the Google Play Store and the App Store data sets.

In [None]:
# Isolate the free apps from the Google Play Store data set:
# a row is kept when its price field (column 7) is the string '0'
free_google_english_apps = [
    row for row in google_english_apps if row[7] == '0'
]

# Print how many rows of the Google Play Store data set remain
print(len(free_google_english_apps))

In [None]:
# Isolate the free apps from the App Store data set:
# a row is kept when its price field (column 4) is the string '0.0'
free_apple_english_apps = [
    row for row in apple_english_apps if row[4] == '0.0'
]

# Print how many rows of the App Store data set remain
print(len(free_apple_english_apps))

## Aim and validation strategy of the analysis

The aim of the analysis is to determine the kinds of apps that are likely to attract more users so that the company's revenue will increase.

To minimize risks and overhead, the validation strategy for an app idea is comprised of three steps:

* Build a minimal Android version of the app, and add it to Google Play.
* If the app has a good response from users, develop it further
* If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

## Build frequency tables for the most common genres for apps in Google Play and App Store

The columns that describe an app's genre are `prime_genre` in the App Store data set and `Genres` and `Category` in the Google Play data set. Two functions will be used to analyze them: one generates a frequency table that shows percentages, and the other displays those percentages in descending order.

In [None]:
#define a function to create a frequency table
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a data set.

    Args:
        dataset: Iterable of rows, each of which can be indexed by *index*.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct column value to the percentage of rows
        that hold it, keyed in order of first appearance. An empty data set
        yields an empty dict.
    """
    counts = {}
    n_rows = 0
    # Count how many rows carry each value of the chosen column
    for row in dataset:
        n_rows += 1
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    # Express every count as a percentage of the total number of rows
    return {
        value: (count / n_rows) * 100
        for value, count in counts.items()
    }

#define a function to display a frequency table
def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    The table comes from ``freq_table(dataset, index)``; every entry is
    printed on its own line as ``value : percentage``. Entries with equal
    percentages are ordered by value, also descending.
    """
    percentages = freq_table(dataset, index)
    # Put the percentage first in each pair so sorting orders by percentage
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)

Let's build the frequency tables for both data sets!

In [None]:
# Percentage frequency table of the genres column (index -4), Google Play Store data set
google_genres = freq_table(dataset=free_google_english_apps, index=-4)

# Percentage frequency table of the category column (index 1), Google Play Store data set
google_category = freq_table(dataset=free_google_english_apps, index=1)

In [None]:
# Percentage frequency table of the prime_genre column (index -5), App Store data set
apple_freq_table = freq_table(dataset=free_apple_english_apps, index=-5)

Now, focus on analyzing the three frequency tables.

In [None]:
# Show the genres column (index -4) of the Google Play Store data set, most common first
display_table(dataset=free_google_english_apps, index=-4)

In [None]:
# Show the category column (index 1) of the Google Play Store data set, most common first
display_table(dataset=free_google_english_apps, index=1)

In [None]:
# Show the prime_genre column (index -5) of the App Store data set, most common first
display_table(dataset=free_apple_english_apps, index=-5)

# Calculating the Most Popular Apps by Genre from App Store

In [None]:
# Frequency table of the prime_genre column; its keys are the distinct genres
genre_apple = freq_table(free_apple_english_apps, -5)

# For every genre, average the number of user ratings (column 5) over the
# apps of the preprocessed App Store data set that belong to that genre
for genre in genre_apple:
    rating_counts = [
        float(app[5])
        for app in free_apple_english_apps
        if app[-5] == genre
    ]

    # Add the values up one by one, in data set order
    total = 0
    for rating_count in rating_counts:
        total += rating_count

    avg = total / len(rating_counts)
    print(genre, ':', avg)
    

# Calculating the Most Popular Apps by Genre from the Google Play Store

In [None]:
# Frequency table of the category column; its keys are the distinct categories
category_google = freq_table(free_google_english_apps, 1)

# For every category, average the number of installs (column 5) over the
# apps of the preprocessed Google Play Store data set that belong to it
for category in category_google:
    install_counts = [
        # Strip commas and plus signs so the text can be converted to a float
        float(app[5].replace(',', '').replace('+', ''))
        for app in free_google_english_apps
        if app[1] == category
    ]

    # Add the values up one by one, in data set order
    total = 0
    for n_installs in install_counts:
        total += n_installs

    avg = total / len(install_counts)
    print(category, ':', avg)