# Profitable app Profiles for the App Store and Google Play

Analyze data to understand what type of apps are likely to attract more users

In [1]:
import re
import csv
import random

In [2]:
with open('AppleStore.csv', encoding='utf8', newline = '') as f:
    reader1 = csv.reader(f)
    ios_data = list(reader1)
with open('googleplaystore.csv', encoding='utf8', newline = '') as f:
    reader2 = csv.reader(f)    
    google_data = list(reader2)

In [3]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
    if rows_and_columns:
        print("Number of rows:", len(dataset))
        print('Number of columns:', len(dataset[0]))

In [4]:
explore_data(ios_data, start= 0, end= 1, rows_and_columns= False)
len(ios_data)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


7198

In [5]:
explore_data(dataset= google_data, start= 0, end= 1, rows_and_columns= False)
explore_data(dataset= ios_data, start=0, end=1, rows_and_columns= False)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


## Description of the Apple Store dataset 
One can find the dataset [here](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps).
The columns headers descriptions are:-
* "id" : App ID
* "track_name": App Name
* "size_bytes": Size (in Bytes)
* "currency": Currency Type
* "price": Price amount
* "ratingcounttot": User Rating counts (for all version)
* "ratingcountver": User Rating counts (for current version)
* "user_rating" : Average User Rating value (for all version)
* "userratingver": Average User Rating value (for current version)
* "ver" : Latest version code
* "cont_rating": Content Rating
* "prime_genre": Primary Genre
* "sup_devices.num": Number of supporting devices
* "ipadSc_urls.num": Number of screenshots showed for display
* "lang.num": Number of supported languages
* "vpp_lic": Vpp Device Based Licensing Enabled

In [6]:
explore_data(dataset= ios_data, start= 0,end= 1, rows_and_columns= False)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


## Description of the Google playstore dataset
The link to the description of the dataset is [here](https://www.kaggle.com/lava18/google-play-store-apps)
* "App" : Application name
* "Category" : Category the app belongs to
* "Rating" : Overall user rating of the app (as when scraped)
* "Reviews" : Number of user reviews for the app (as when scraped)
* "Size" : Size of the app (as when scraped)
* "Installs" : Number of user downloads/installs for the app (as when scraped)
* "Type" : Paid or Free
* "Price" : Price of the app (as when scraped)
* "Content Rating" : Age group the app is targeted at - Children / Mature 21+ / Adult
* "Genres" : An app can belong to multiple genres (apart from its main category). For eg, a musical family game will belong to Music, Game, Family genres.

In [7]:
explore_data(dataset= google_data, start= 0,end= 1, rows_and_columns= False)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


## Data Cleaning
We are interested in free English apps
* Remove non-english apps
* Remove apps that are not free

In [8]:
explore_data(dataset= google_data, start= 10473,end= 10474, rows_and_columns= False)
del google_data[10473]

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [9]:
explore_data(dataset= google_data, start= 10473,end= 10474, rows_and_columns= False)

['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


## Remove duplicate entries
* Check whether the data contains any app data for multiple times.
* Print any duplicate app occurence from the main data.

Below we show that the data indeed contains duplicate entries. We print few instances of an app which has duplicate entries. We won't remove the duplicate entries randdomly. Instead we will remove these duplicates by their number of reviews. The higher the number of reviews, more recent the data is.

In [10]:
print(google_data[3])
print(ios_data[1][1])

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
Facebook


In [11]:
duplicate_apps = []
unique_apps = []
for app in google_data[1:]:
    app_name = app[0]
    if app_name not in unique_apps:
        unique_apps.append(app_name)
    else:
        duplicate_apps.append(app_name)
print(len(unique_apps))

verify_unique = 'Instagram'
for app in google_data:
    app_name = app[0]
    if app_name == verify_unique:
        print(app)

9659
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


### Cleaned unique data
* We curated a dictonary using the number of reviews as stated above to find the app data corresponsing to the maximum number of reviews. 
* We used to lists here. One for the cleaned data, second for the already added app names. If the number of reviews is maximum, we append the corresponding entry to the android_clean list.

In [12]:
# Dictionary with unique app name
reviews_max = {}
for app in google_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if (name in reviews_max) and (reviews_max[name] <= n_reviews):
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
len(reviews_max)

9659

In [13]:
android_clean = []
already_added = []
for app in google_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(app)
        already_added.append(name)
len(android_clean)
android_clean[0]

['Photo Editor & Candy Camera & Grid & ScrapBook',
 'ART_AND_DESIGN',
 '4.1',
 '159',
 '19M',
 '10,000+',
 'Free',
 '0',
 'Everyone',
 'Art & Design',
 'January 7, 2018',
 '1.0.0',
 '4.0.3 and up']

### iOS data do not contain duplicate entries
Number of iOS apps are same as APP ID which is unique

In [14]:
ios_app_ids = []

for ios_app in ios_data:
    ios_app_ids.append(ios_app[0])

len(ios_app_ids)

7198

### Filter non-English apps
We will focus here only on the English apps

In [15]:
print(chr(8000))
print(ord('😜'))
print(True and False and True)

ὀ
128540
False


In [16]:
def engtest(a):
    test = []
    for c in a:
        test.append(ord(c)>127)
    if sum(test)<=3:
        return True
    else: 
        return False

In [17]:
a = 'Instachat 😜'
test = []
for c in a:
    test.append(ord(c)>127)
if sum(test)<=3:
    print('True')
else: 
    print('False')
#True in test and (len(test)-sum(test)<=3)

True


In [18]:
engtest('abc')

True

In [19]:
# test of engtest
print(engtest('Instagram'))
print(engtest('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(engtest('Docs To Go™ Free Office Suite'))
print(engtest('Instachat 😜'))

True
False
True
True


In [20]:
android_clean_english = []
android_clean_other = []
ios_clean_english = []
ios_clean_other = []
for app in android_clean:
    if engtest(app[0]) is True:
        android_clean_english.append(app)
    else:
        android_clean_other.append(app)
for app in ios_data[1:]:
    if engtest(app[1]) is True:
        ios_clean_english.append(app)
    else:
        ios_clean_other.append(app)
print(len(android_clean_english))
print(len(ios_clean_english)) 

9614
6183


In [21]:
print(android_clean_other[random.randint(0,len(android_clean_other))])
print(ios_clean_other[random.randint(0,len(ios_clean_other))])

['Al Quran Free - القرآن (Islam)', 'BOOKS_AND_REFERENCE', '4.7', '1777', '23M', '50,000+', 'Free', '0', 'Everyone', 'Books & Reference', 'February 15, 2015', '1.1', '2.2 and up']
['1081270431', 'どうして私じゃダメなの？', '45325312', 'USD', '0.0', '0', '0', '0.0', '0.0', '1.0.2', '4+', 'Games', '38', '4', '1', '1']


In [22]:
ios_clean_english[2]

['529479190',
 'Clash of Clans',
 '116476928',
 'USD',
 '0.0',
 '2130805',
 '579',
 '4.5',
 '4.5',
 '9.24.12',
 '9+',
 'Games',
 '38',
 '5',
 '18',
 '1']

### Identify the free apps

In [23]:
android_clean_english_free = []
android_clean_english_nonfree = []
for app in android_clean_english:
    if app[7] == '0':
        android_clean_english_free.append(app)
    else:
        android_clean_english_nonfree.append(app)
ios_clean_english_free = []
ios_clean_english_nonfree = []
for app in ios_clean_english:
    if app[4] == '0.0':
        ios_clean_english_free.append(app)
    else:
        ios_clean_english_nonfree.append(app)


In [24]:
print(android_clean_english_free[1])
print((android_clean_english_nonfree[1]))
print(ios_clean_english_free[1])
print(ios_clean_english_nonfree[1])

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
['Tiny Scanner Pro: PDF Doc Scan', 'BUSINESS', '4.8', '10295', '39M', '100,000+', 'Paid', '$4.99', 'Everyone', 'Business', 'April 11, 2017', '3.4.6', '3.0 and up']
['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']
['500116670', 'Clear Vision (17+)', '37879808', 'USD', '0.99', '541693', '69225', '4.5', '4.5', '1.1.3', '17+', 'Games', '43', '5', '1', '1']


In [25]:
print(len(android_clean_english_free))
print(len(ios_clean_english_free))

8864
3222


In [26]:
explore_data(dataset= google_data, start= 0, end= 1, rows_and_columns= False)
explore_data(dataset= ios_data, start=0, end=1, rows_and_columns= False)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


### Find an app profile that fits both the App Store and Google Play
* An app that is profitable in both the android and ios market
* First we test the app on the Google Play.
* If the apps turns out to be profitable, we build for ios.
* To find the profitable genre from the Play Store data, we can use "Category", "Rating", "Reviews", "Genre"
* To find the profitable genre from the App Store data, we can use "rating_count_tot", "user_rating", "cont_rating", "prime_genre" 

In [27]:
def freq_table(dataset, index):
    col_dict = {}
    for app in dataset:
        app_col = app[index]
        if app_col not in col_dict:
            col_dict[app_col] = 1
        else:
            col_dict[app_col] +=1
    total_counts = sum(col_dict.values())
    for element in col_dict:
        col_dict[element] = round((col_dict[element]/total_counts)*100,2)
    return col_dict
#freq_table(ios_clean_english_free, 11)

In [28]:
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [29]:
print('prime_genre_freq-----------------------')
prime_genre_freq = display_table(ios_clean_english_free,11)
print('genre_freq------------------------')
genre_freq = display_table(android_clean_english_free,9)
print('category_freq---------------------------')
category_freq = display_table(android_clean_english_free,1)

prime_genre_freq-----------------------
Games : 58.16
Entertainment : 7.88
Photo & Video : 4.97
Education : 3.66
Social Networking : 3.29
Shopping : 2.61
Utilities : 2.51
Sports : 2.14
Music : 2.05
Health & Fitness : 2.02
Productivity : 1.74
Lifestyle : 1.58
News : 1.33
Travel : 1.24
Finance : 1.12
Weather : 0.87
Food & Drink : 0.81
Reference : 0.56
Business : 0.53
Book : 0.43
Navigation : 0.19
Medical : 0.19
Catalogs : 0.12
genre_freq------------------------
Tools : 8.45
Entertainment : 6.07
Education : 5.35
Business : 4.59
Productivity : 3.89
Lifestyle : 3.89
Finance : 3.7
Medical : 3.53
Sports : 3.46
Personalization : 3.32
Communication : 3.24
Action : 3.1
Health & Fitness : 3.08
Photography : 2.94
News & Magazines : 2.8
Social : 2.66
Travel & Local : 2.32
Shopping : 2.25
Books & Reference : 2.14
Simulation : 2.04
Dating : 1.86
Arcade : 1.85
Video Players & Editors : 1.77
Casual : 1.76
Maps & Navigation : 1.4
Food & Drink : 1.24
Puzzle : 1.13
Racing : 0.99
Role Playing : 0.94
Librar

## Analysis
### Analysis of "prime_genre" in App Store data
* The most common genre is "Games". The second most common genre is "Entertaiment"
* The least common genre is "Catalog".
* Most apps are designed for entertainment purpose (games, photo and videos etc) as opposed to the practical purposes (education, shopping, utilities etc)

In [30]:
prime_genre_freq = freq_table(ios_clean_english_free,11)
avg_prime_genre_rating_dict ={}
for genre in prime_genre_freq.keys():
    total = 0
    len_genre = 0
    for app in ios_clean_english_free:
        genre_app = app[11]
        if genre_app == genre:
            user_rating = float(app[7])
            total += user_rating
            len_genre+=1
    avg_rating = round(total/len_genre,2)
    avg_prime_genre_rating_dict[genre] = avg_rating
    print((genre,avg_rating))

('Social Networking', 3.59)
('Photo & Video', 3.9)
('Games', 4.04)
('Music', 3.95)
('Reference', 3.67)
('Health & Fitness', 3.77)
('Weather', 3.48)
('Utilities', 3.53)
('Travel', 3.49)
('Shopping', 3.97)
('News', 3.24)
('Navigation', 3.83)
('Lifestyle', 3.41)
('Entertainment', 3.54)
('Food & Drink', 3.63)
('Sports', 3.07)
('Book', 3.07)
('Finance', 3.38)
('Education', 3.64)
('Productivity', 4.0)
('Business', 3.97)
('Catalogs', 4.12)
('Medical', 3.0)


### App Store app recommnedation
Besed on the analysis above based on user rating per genre, the recommended app profile should be "Games".

### Analysis of "Category" and "Genres" in Play Store data
* Tools, Entertainment, Education, Business are the most common genres.
* Apps for practical and well as for entertainment is present which is in contrast with the App Store data.
* An app which can cover both entertainment and gaming genre can become a profitable app.
* The frequncy tables indicate that the app genres with most frequency can be the apps with most users. But some other parameters must be considered before a concrete conclusion. 

In [31]:
category_freq = freq_table(android_clean_english_free,1)
avg_catergory_rating_dict={}
for category in category_freq.keys():
    total = 0
    len_category = 0
    for app in android_clean_english_free:
        category_app = app[1]
        if category_app == category:
            app_install = app[5]
            apin1 = app_install.replace('+','')
            apin2 = apin1.replace(',','')
            total += float(apin2)
            len_category+=1
    avg_rating = format(round(total/len_genre,2))
    avg_catergory_rating_dict[category] = avg_rating
    print((category, avg_rating))

('ART_AND_DESIGN', '18870183.33')
('AUTO_AND_VEHICLES', '8846676.83')
('BEAUTY', '4532841.67')
('BOOKS_AND_REFERENCE', '277647376.67')
('BUSINESS', '116150348.33')
('COMICS', '7495191.67')
('COMMUNICATION', '1839484366.83')
('DATING', '23485792.83')
('EDUCATION', '31475000.0')
('ENTERTAINMENT', '164910000.0')
('EVENTS', '2662193.33')
('FINANCE', '75860522.0')
('FOOD_AND_DRINK', '35289791.83')
('HEALTH_AND_FITNESS', '190591400.33')
('HOUSE_AND_HOME', '16200410.17')
('LIBRARIES_AND_DEMO', '8832635.0')
('LIFESTYLE', '82914071.5')
('GAME', '2239478241.67')
('FAMILY', '1032315948.33')
('MEDICAL', '6288724.0')
('SOCIAL', '914643650.33')
('SHOPPING', '233389764.17')
('PHOTOGRAPHY', '776044802.5')
('SPORTS', '182538447.17')
('TRAVEL_AND_LOCAL', '482450681.0')
('TOOLS', '1350173912.33')
('PERSONALIZATION', '254872648.0')
('PRODUCTIVITY', '965271552.33')
('PARENTING', '5245168.33')
('WEATHER', '60048086.67')
('VIDEO_PLAYERS', '655288620.0')
('NEWS_AND_MAGAZINES', '394699376.67')
('MAPS_AND_NAVIG

In [33]:
print(max(avg_catergory_rating_dict, key = avg_catergory_rating_dict.get))
print(max(avg_prime_genre_rating_dict, key= avg_prime_genre_rating_dict.get))

PRODUCTIVITY
Catalogs


### Most profitable app profile
App that deals with productivity will be the most profitable app profile.