# Profitable App Profiles for the App Store
This project is to use data analysis techniques in Python to determine the most profitable app for my hypothetical company.
Determining which types of apps have the largest ad engagement will help in making this decision.

In [1]:
from csv import reader

### The Google Play data set ###
opened_file = open('googleplaystore.csv')
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]


### The App Store data set ###
opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]


In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))


Columns below to keep track of column indexes for this analysis:

_Android_
__('App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver')__

_iOS_
__('id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic')__


Link to the dataset documentation:

[Android](https://www.kaggle.com/lava18/google-play-store-apps)
[iOS](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps)






In [3]:
explore_data(android,10472,10473,True)



['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


Number of rows: 10841
Number of columns: 13


Clean any missing data in the rows for the data sets.\
Using an if statement to match the length of header to each row, code identified any rows that did not match.\
Bad row in 10472 from the Google play store discussion thread, and then deleted.

In [4]:
for row in android:
    if len(row) != len(android_header):
        print(row)
        print(android.index(row))
print('DONE Checking for matching rows in Android Data')

for row in ios:
    if len(row) != len(ios_header):
        print(row)
        print(ios.index(row))
print('DONE Checking for matching rows in iOS Data')

del android[10472]

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10472
DONE Checking for matching rows in Android Data
DONE Checking for matching rows in iOS Data


Another step in cleaning the dataset is to **remove any duplicate rows** for the same app.  More reviews are recorded and the data set appends the duplicate app row.\
For example using Instagram as the app with duplicate rows, we can see there are duplicates.


In [5]:
for row in android:
    name = row[0]
    if name == "Instagram":
        print(row)
    

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


To keep track of the duplicate app entries, we should create a list of non duplicates and duplicate apps to identify the rows we need to remove.\
Then we can remove all other duplicate app rows that do not have the highest review count.\
So far we observe that the Play store app store is the only data set materially effected by duplicate entries.  The iOS app store only has two.

In [6]:
duplicates = []
non_dupes = []

for row in android:
    name = row[0]
    if name in non_dupes:
        duplicates.append(name)
    else:
        non_dupes.append(name)
print('Number of duplicate Android apps:', len(duplicates) )

ios_duplicates = []
ios_non_dupes = []

for row in ios:
    name = row[0]
    if name in ios_non_dupes:
        ios_duplicates.append(name)
    else:
        ios_non_dupes.append(name)
print('Number of duplicate iOS apps:', len(ios_duplicates) )



Number of duplicate Android apps: 1181
Number of duplicate iOS apps: 0


In the code below, I created a list to keep track of all apps that already have been appended to the `android_clean = []` into `already_added = []`.\
Without the seperate list `already_added = []` , the condition we use to append apps `if reviews_max[name]==n_reviews` would continue to produce duplicate values.\
This is because some duplicate apps have values with the same max review count

In [7]:
reviews_max = {}
for rows in android:
    name = rows[0]
    n_reviews = float(rows[3])
    if name not in reviews_max:
        reviews_max[name] = n_reviews
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
print("Number of reviews:",len(reviews_max))
android_clean = []
already_added = []
for row in android:
    name = row[0]
    n_reviews = float(row[3])
    if reviews_max[name]==n_reviews and name not in already_added:
        android_clean.append(row)
        already_added.append(name)

    
print(len(android_clean))
    

Number of reviews: 9659
9659


Now we have cleaned the data for duplicates, we need to remove any non-English apps.  We can use the order number associated with each character's ASCII output.  All english characters are all in the range of 0-127.  Therefore we can loop through each character for a string using the function:\
`ord()`\
Using a defined function, we can take in a string and return `False` if any chracter in the string that doesn't belong to the set of common English chracters, otherwise returning `True`\
If the app name has more than **3** non english characters, the function will return `False`

In [8]:
def english_checker(string):
    char_count = 0
    for character in string:
        order = ord(character)
        if order > 127:
            char_count += 1
            if char_count > 3:
                return False
    return True
    

english_checker('爱奇艺艺yyy')

False

Create another list to parse out any non english apps for both data sets using the function we created.

In [9]:
english_android=[]
english_ios=[]
for row in android_clean:
    name = row[0]
    if english_checker(name) is True:
        english_android.append(row)

for row in ios:
    name = row[1]
    if english_checker(name) is True:
        english_ios.append(row)
        
print("Android data set[English]:",len(english_android))
print("iOS data set[English]:", len(english_ios))





Android data set[English]: 9614
iOS data set[English]: 6183


In [10]:
free_android=[]
free_ios=[]
for row in english_android:
    price = row[6]
    if price == "Free":
        free_android.append(row)
for row in english_ios:
    price = float(row [4])
    if price == 0.0:
        free_ios.append(row)
print("Android data set[Free]:",len(free_android))
print("iOS data set[Free]:", len(free_ios))

Android data set[Free]: 8863
iOS data set[Free]: 3222


The goal in this analysis is to determine which apps attract the largest number of users.
The validation strategy for the app idea goes as follows:
1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

The the goal is to release the app on Google play and eventually the App Store, therefore we need an analysis of successful app profiles on both platforms.

In [11]:
def android_head(android_header):
    x=0
    for i in android_header:
        print(x,i)
        x = x+1
        if x==len(ios_header):
            break
def ios_head(ios_header):
    x=0
    for i in ios_header:
        print(x,i)
        x = x+1
print(android_head(android_header))

print(ios_head(ios_header))

0 App
1 Category
2 Rating
3 Reviews
4 Size
5 Installs
6 Type
7 Price
8 Content Rating
9 Genres
10 Last Updated
11 Current Ver
12 Android Ver
None
0 id
1 track_name
2 size_bytes
3 currency
4 price
5 rating_count_tot
6 rating_count_ver
7 user_rating
8 user_rating_ver
9 ver
10 cont_rating
11 prime_genre
12 sup_devices.num
13 ipadSc_urls.num
14 lang.num
15 vpp_lic
None


Columns we could use to generate frequency tables in android_\
**0 App**
**1 Category**
**2 Rating**
3 Reviews
4 Size
**5 Installs**
**6 Type**
7 Price
8 Content Rating
**9 Genres**
10 Last Updated
11 Current Ver
12 Android Ver.\
***
Columns for ios_\
**0 id**
**1 track_name**
2 size_bytes
3 currency
4 price
**5 rating_count_tot**
6 rating_count_ver
**7 user_rating**
8 user_rating_ver
9 ver
10 cont_rating
**11 prime_genre**
12 sup_devices.num
13 ipadSc_urls.num
14 lang.num
15 vpp_lic


In [12]:
def freq_table(dataset, index):
    freq_column = {}
    for row in dataset:
        i = row[index]
        if i not in freq_column:
            freq_column[i]=1
        else:
            freq_column[i]+=1
    for key in freq_column:
        n = freq_column[key]/len(dataset)
        perc = round(n*100,3)
        freq_column[key]=perc        
    return freq_column

def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

# print(display_table(free_android,1),'\n')

# print(display_table(free_android,9),'\n')        
print('prime_genre column','\n')
print(display_table(free_ios,11),'\n')



    
    

prime_genre column 

Games : 58.163
Entertainment : 7.883
Photo & Video : 4.966
Education : 3.662
Social Networking : 3.29
Shopping : 2.607
Utilities : 2.514
Sports : 2.142
Music : 2.048
Health & Fitness : 2.017
Productivity : 1.738
Lifestyle : 1.583
News : 1.335
Travel : 1.241
Finance : 1.117
Weather : 0.869
Food & Drink : 0.807
Reference : 0.559
Business : 0.528
Book : 0.435
Navigation : 0.186
Medical : 0.186
Catalogs : 0.124
None 



What is the most common genre? What is the runner-up?
1. Games : 58.163
2. Entertainment : 7.883
3. Photo & Video : 4.966

What other patterns do you see?
- I notice the **top 3** apps make up 70% share of all the apps in the App store.

What is the general impression — are most of the apps designed for practical purposes (education, shopping, utilities, productivity, lifestyle) or more for entertainment (games, photo and video, social networking, sports, music)?
- There is a material proportion of apps under the Gaming genre and entertainment.

Can you recommend an app profile for the App Store market based on this frequency table alone? If there's a large number of apps for a particular genre, does that also imply that apps of that genre generally have a large number of users?
- No we need to analyze the amount of downloads to get a better understanding of which are more popular. Also we need to look at the Android data set. 


In [13]:
print('Category column \n')
print(display_table(free_android,1),'\n')

print('Genre column \n')
print(display_table(free_android,9),'\n') 

Category column 

FAMILY : 18.899
GAME : 9.726
TOOLS : 8.462
BUSINESS : 4.592
LIFESTYLE : 3.904
PRODUCTIVITY : 3.893
FINANCE : 3.701
MEDICAL : 3.532
SPORTS : 3.396
PERSONALIZATION : 3.317
COMMUNICATION : 3.238
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.945
NEWS_AND_MAGAZINES : 2.798
SOCIAL : 2.663
TRAVEL_AND_LOCAL : 2.336
SHOPPING : 2.245
BOOKS_AND_REFERENCE : 2.144
DATING : 1.862
VIDEO_PLAYERS : 1.794
MAPS_AND_NAVIGATION : 1.399
FOOD_AND_DRINK : 1.241
EDUCATION : 1.162
ENTERTAINMENT : 0.959
LIBRARIES_AND_DEMO : 0.936
AUTO_AND_VEHICLES : 0.925
HOUSE_AND_HOME : 0.824
WEATHER : 0.801
EVENTS : 0.711
PARENTING : 0.654
ART_AND_DESIGN : 0.643
COMICS : 0.621
BEAUTY : 0.598
None 

Genre column 

Tools : 8.451
Entertainment : 6.07
Education : 5.348
Business : 4.592
Productivity : 3.893
Lifestyle : 3.893
Finance : 3.701
Medical : 3.532
Sports : 3.464
Personalization : 3.317
Communication : 3.238
Action : 3.103
Health & Fitness : 3.08
Photography : 2.945
News & Magazines : 2.798
Social : 2.663
Tra

What are the most common genres?
-  Most common genres are Family, Game, and Tools looking at the Category column.  The Tools, Entertainment, and Education are the top 3 most popular genres looking at the Genre column.

What other patterns do you see?
- The genre column of the Google play store has a larger range of genres.
Compare the patterns you see for the Google Play market with those you saw for the App Store market.
Can you recommend an app profile based on what you found so far? Do the frequency tables you generated reveal the most frequent app genres or what genres have the most users?
- Entertainment, Games, and Tools appear to the most common genres in both platforms.  The App store for Apple is heavily skewed towards Entertainment apps(games).  We need to still look at the number of downloads for the data sets.

In [14]:
#Change the previous Freq_table function to produce absolute 
#numbers instead of percentages

def freq_table2(dataset, index):
    freq_column = {}
    for row in dataset:
        i = row[index]
        if i not in freq_column:
            freq_column[i]=1
        else:
            freq_column[i]+=1      
    return freq_column

#Creating a dictionary to store the avgerage number of rating
#per genre

avg_user_dict={}
user_rating_freq = freq_table(free_ios,11)
for genre in user_rating_freq:
    total = 0
    len_genre = 0

    for row in free_ios:
        genre_app = row[11]
        if genre == genre_app:
            num_ratings = float(row[5])
            total += num_ratings
            len_genre += 1
    
    avg_user_rating = round(total/len_genre,0)
    avg_user_dict[genre]=avg_user_rating
    
#Creating a list of tuples to sort the avgerage number of rating
#per genre    

table_display = []
for key in avg_user_dict:
    key_val_as_tuple = (avg_user_dict[key], key)
    table_display.append(key_val_as_tuple)

table_sorted = sorted(table_display, reverse = True)
print(table_sorted)
    

    
# ios_head(ios_header)

[(86090.0, 'Navigation'), (74942.0, 'Reference'), (71548.0, 'Social Networking'), (57327.0, 'Music'), (52280.0, 'Weather'), (39758.0, 'Book'), (33334.0, 'Food & Drink'), (31468.0, 'Finance'), (28442.0, 'Photo & Video'), (28244.0, 'Travel'), (26920.0, 'Shopping'), (23298.0, 'Health & Fitness'), (23009.0, 'Sports'), (22789.0, 'Games'), (21248.0, 'News'), (21028.0, 'Productivity'), (18684.0, 'Utilities'), (16486.0, 'Lifestyle'), (14030.0, 'Entertainment'), (7491.0, 'Business'), (7004.0, 'Education'), (4004.0, 'Catalogs'), (612.0, 'Medical')]


From looking at the average number of user ratings per genre as a proxy for Installs,  we can see that Navigation, Reference, and Social Networking apps have the highest user rating counts.\
The Navigation app category in the App store appears to have a low number apps in the App store but the highest use.  


In [15]:
for row in free_ios:
    if row[11]=='Navigation':
        print(row[1],':',row[5])

Waze - GPS Navigation, Maps & Real-time Traffic : 345046
Google Maps - Navigation & Transit : 154911
Geocaching® : 12811
CoPilot GPS – Car Navigation & Offline Maps : 3582
ImmobilienScout24: Real Estate Search in Germany : 187
Railway Route Search : 5


Waze and Google Maps take up nearly 500,000 reviews. Making an app for Navigation would require use of Navigation technologies such as GPS and satellites.  This would be out of our scope for profitable app idea.

In [16]:
t=0
for row in free_ios:
    if row[11]=='Social Networking':
        print(row[1],':',row[5])
        t=t+1   
        if t==15:
            break
        

Facebook : 2974676
Pinterest : 1061624
Skype for iPhone : 373519
Messenger : 351466
Tumblr : 334293
WhatsApp Messenger : 287589
Kik : 260965
ooVoo – Free Video Call, Text and Voice : 177501
TextNow - Unlimited Text + Calls : 164963
Viber Messenger – Text & Call : 164249
Followers - Social Analytics For Instagram : 112778
MeetMe - Chat and Meet New People : 97072
We Heart It - Fashion, wallpapers, quotes, tattoos : 90414
InsTrack for Instagram - Analytics Plus More : 85535
Tango - Free Video Call, Voice and Chat : 75412


Also in Reference genre, the app share is heavily dominated by the Bible app and dictionaries.
## App Store Profile Strategy:
- Given the App store is saturated with games and entertainment apps,  we could design a social networking app.  To reduce overhead and costs, an analytics app that can keep track of Instagram, Tiktok metrics on on User Interface.\

**Followers - Social Analytics For Instagram : 112778\
InsTrack for Instagram - Analytics Plus More : 85535**

- App would be similar to the above Instragram metric apps, but would extend to other social media apps within our program.  I don't see any app that resembles a Dashboard for social media analytics, so this could be profitable idea.

In [17]:
avg_user_and={}  # Dictionary for average number of installs

# Category_dict is the dictionary used when we iterate over the 
# distinct categories
# example of a "nested loop"
category_dict = freq_table(free_android,1)  
for category in category_dict:
    total = 0
    len_category = 0
    for row in free_android:
        category_app = row[1]
        installs = row[5]
        if category == category_app:
            installs = installs.replace('+','')   ## to replace "+"and "," with no character to convert to float
            installs = float(installs.replace(',',''))
            total += installs
            len_category += 1
    
    avg_installs = round(total/len_category,0)
    avg_user_and[category]=avg_installs

# using our now filled dictionary with average installs for the play store
# we will create a list of tuples to be able to quickly sort the dict

table_display_android = []
for key in category_dict:
    key_val_as_tuple = (avg_user_and[key], key)
    table_display_android.append(key_val_as_tuple)

table_sorted = sorted(table_display_android, reverse = True)
for i in table_sorted:
    print(i)

(38456119.0, 'COMMUNICATION')
(24727872.0, 'VIDEO_PLAYERS')
(23253652.0, 'SOCIAL')
(17840110.0, 'PHOTOGRAPHY')
(16787331.0, 'PRODUCTIVITY')
(15588016.0, 'GAME')
(13984078.0, 'TRAVEL_AND_LOCAL')
(11640706.0, 'ENTERTAINMENT')
(10801391.0, 'TOOLS')
(9549178.0, 'NEWS_AND_MAGAZINES')
(8767812.0, 'BOOKS_AND_REFERENCE')
(7036877.0, 'SHOPPING')
(5201483.0, 'PERSONALIZATION')
(5074486.0, 'WEATHER')
(4188822.0, 'HEALTH_AND_FITNESS')
(4056942.0, 'MAPS_AND_NAVIGATION')
(3697848.0, 'FAMILY')
(3638640.0, 'SPORTS')
(1986335.0, 'ART_AND_DESIGN')
(1924898.0, 'FOOD_AND_DRINK')
(1833495.0, 'EDUCATION')
(1712290.0, 'BUSINESS')
(1437816.0, 'LIFESTYLE')
(1387692.0, 'FINANCE')
(1331541.0, 'HOUSE_AND_HOME')
(854029.0, 'DATING')
(817657.0, 'COMICS')
(647318.0, 'AUTO_AND_VEHICLES')
(638504.0, 'LIBRARIES_AND_DEMO')
(542604.0, 'PARENTING')
(513152.0, 'BEAUTY')
(253542.0, 'EVENTS')
(120551.0, 'MEDICAL')


From the data above, we can see the app store has top install counts for Communication, Video_Players, and Social.  We picked an app idea that was apart of Social Networking genre in App store.  We can see that this idea still could be a profitable one since Social category has large user base in the Google Play store.