# Our goal is to determine the kind free apps that are likely to attract more users in North america.

## Find app profiles that are successful in Google Play and the App Store that fit these criteria

## A special challenge in this project is to not use numpy, or Pandas and rely on lists and Dicts only

In [2]:
from csv import reader

In [105]:
#opens CSV data file
def load_data(file_name):
    with open(file_name) as data_file:
        data = reader(data_file)
        data=list(data)
    return data

#prints first 3 rows including header and can return the number of rows and columns CSV file
def explore_data(dataset,show_sample=True, rows_and_columns=False):
    dataset_slice = dataset[:3]  
    if show_sample:
        for row in dataset_slice:
            print(row)
            print('\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]) , '\n')
        
###Checks if the given string is english, based on ASCII range (0 - 127) for english language
### marksname as non english if more than 3 charchters are `ord(char)>127`
def check_if_english(name):
    list_of_char = list(name)
    count=0
    for i in list_of_char:
        ident = ord(i)
        if ident>127:
            count+=1
    if count>3:
        return  False
    return True

#this function will display the frequencies of a specific param in the dataset in descending order
def display_table(table):
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
        
#this function returns the frequency as a percentage of the total for any param we specify
#to specify a param for freq calc insert `index` of the param in the dataset
def freq_table(dataset, index):
    freq = {}
    for row in dataset:
        param = row[index]
        if param in freq:
            freq[param] +=1
        elif param not in freq:
            freq[param] =1
    for key in freq:
        freq[key]/=len(dataset)
        freq[key]*=100
    display_table(freq)

       
# this function will retunr the average user count for each genre based on the index of the user count we provide in the dataset
def users_per_genre(dataset, genre_index, user_count_index):
    user_freq = {}
    genre_freq = {}
    for row in dataset:
        genre = row[genre_index]
        user_count = int(row[user_count_index])
        if genre in user_freq:
            user_freq[genre] += user_count
            genre_freq[genre] +=1
        elif genre not in user_freq:
            user_freq[genre] = user_count
            genre_freq[genre] =1
    for key in user_freq:
        user_freq[key]/=genre_freq[key]
        user_freq[key]= round(user_freq[key])
    display_table(user_freq)

In [41]:
apple_data= load_data('AppleStore.csv')
google_data= load_data('googleplaystore.csv')

In [28]:
explore_data(apple_data,True,True)
explore_data(google_data,True,True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16 

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyo

In [45]:
# find which rows have missing data, we can simply delet those since its not critical
for index, row in zip(range(1,len(google_data[1:])),google_data[1:]):
    if len(row) != len(google_data[0]):
        print(index)

    

In [44]:
###delete the row with missing data
###do not run this more than once
#del google_data[10473]

In [60]:
#find if there is duplicate entries for apps
app_list={}
for row in google_data[1:]:
    name= row[0]
    if name in app_list:
        app_list[name] +=1
    elif name not in app_list:
        app_list[name] =1
repeated_entry_apps= []
for key in app_list:
    if app_list[key] >1:
        repeated_entry_apps.append(key)
print('Number of apps that have more than one entry',len(repeated_entry_apps))

Number of apps that have more than one entry 798


In [67]:
# Use the number of reviews to select the entries with the highest number of reviews if there is more than one entry per app
#identify the entries per app that have the highest user rating count
app_list={}
for row in google_data[1:]:
    name = row[0]
    n_reviews = row[5]
    if name in app_list:
        if n_reviews> app_list[name]:
            app_list[name]=n_reviews
    elif name not in app_list:
        app_list[name]= n_reviews
        
# make a new list `cleaned_google_data` to store the new dataset free of duplicates
# make a list `already_added_apps` to store the apps we added in case there is duplicates with the same number of total user count
cleaned_google_data=[]
already_added_apps = []
for row in google_data[1:]:
    name =  row[0]
    n_reviews = row[5]
    if app_list[name]== n_reviews and name not in already_added_apps:
        cleaned_google_data.append(row)
        
        already_added_apps.append(name)



In [68]:
#show the number of rows in the cleaned dataset vs original
explore_data(google_data,False,True)
explore_data(cleaned_google_data,False,True)

Number of rows: 10841
Number of columns: 13 

Number of rows: 9659
Number of columns: 13 



In [83]:
clnd_eng_apple_data=[]
for row in apple_data[1:]:
    name = row[1]
    if check_if_english(name):
        clnd_eng_apple_data.append(row)
        
clnd_eng_google_data=[]
for row in cleaned_google_data:
    name = row[0]
    if check_if_english(name):
        clnd_eng_google_data.append(row)
    

In [85]:
eng_free_google_apps=[]
for row in clnd_eng_google_data:
    if row[7] == '0':
        eng_free_google_apps.append(row)
        
eng_free_apple_apps=[]
for row in clnd_eng_apple_data:
    if row[4] == '0.0':
        eng_free_apple_apps.append(row)

In [113]:
#print all data and english free only app count for android and apple data separetly
explore_data(eng_free_google_apps,False,True)
explore_data(clnd_eng_google_data,False,True)

explore_data(eng_free_apple_apps,False,True)
explore_data(clnd_eng_apple_data,False,True)

Number of rows: 8864
Number of columns: 13 

Number of rows: 9614
Number of columns: 13 

Number of rows: 3222
Number of columns: 16 

Number of rows: 6183
Number of columns: 16 



## We need to determine the most common app Genres in each market (apple and android)
1)build frequency tables for number of apps in each genre in each datset

2)build frequency tables to show the avg number of user ratings or installs for each genre in each datset

In [114]:
# we need to adjust the dataset #of installs column because it has aa plus sign before running our function
for row in eng_free_google_apps:
    num_installs = row[5]
    num_installs = num_installs.replace('+','')
    num_installs = int(num_installs.replace(',',''))
    row[5]=num_installs

AttributeError: 'int' object has no attribute 'replace'

In [102]:
# display the frequency of each genre in the apple dataset as a percentage of the total number of apps
freq_table(eng_free_apple_apps, 11)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


In [106]:
# shows avg user count per genre in descending order based on #of reviews for each genre
users_per_genre(eng_free_apple_apps, genre_index= 11,user_count_index=5)

Navigation : 86090
Reference : 74942
Social Networking : 71548
Music : 57327
Weather : 52280
Book : 39758
Food & Drink : 33334
Finance : 31468
Photo & Video : 28442
Travel : 28244
Shopping : 26920
Health & Fitness : 23298
Sports : 23009
Games : 22789
News : 21248
Productivity : 21028
Utilities : 18684
Lifestyle : 16486
Entertainment : 14030
Business : 7491
Education : 7004
Catalogs : 4004
Medical : 612


In [103]:
# display the frequency of each category in the google dataset as a percentage of the total number of apps

freq_table(eng_free_google_apps,1)

FAMILY : 18.456678700361014
GAME : 9.860108303249097
TOOLS : 8.449909747292418
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.782490974729242
MAPS_AND_NAVIGATION : 1.3989169675090252
EDUCATION : 1.286101083032491
FOOD_AND_DRINK : 1.2409747292418771
ENTERTAINMENT : 1.128158844765343
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8348375451263539
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
ART_AND_DESIGN : 0.6768953068592057
PARENTING : 0.6

In [117]:
# find the avg number of user per category for the google based dataset
users_per_genre(eng_free_google_apps, genre_index= 1,user_count_index=5)

COMMUNICATION : 38456119
VIDEO_PLAYERS : 24852732
SOCIAL : 23253652
ENTERTAINMENT : 21134600
PHOTOGRAPHY : 17840110
PRODUCTIVITY : 16772839
GAME : 15935777
TRAVEL_AND_LOCAL : 13984078
TOOLS : 10814477
NEWS_AND_MAGAZINES : 9549178
BOOKS_AND_REFERENCE : 8767812
SHOPPING : 7036877
PERSONALIZATION : 5201483
WEATHER : 5074486
HEALTH_AND_FITNESS : 4188822
MAPS_AND_NAVIGATION : 4056942
SPORTS : 3638640
EDUCATION : 3082018
FAMILY : 2690034
FOOD_AND_DRINK : 1924898
ART_AND_DESIGN : 1905352
BUSINESS : 1712290
LIFESTYLE : 1437816
FINANCE : 1387692
HOUSE_AND_HOME : 1313682
DATING : 854029
COMICS : 817657
AUTO_AND_VEHICLES : 647318
LIBRARIES_AND_DEMO : 638504
PARENTING : 542604
BEAUTY : 513152
EVENTS : 253542
MEDICAL : 120551


In [104]:
# display the frequency of each genre in the apple dataset as a percentage of the total number of apps
freq_table(eng_free_google_apps,9)

Tools : 8.438628158844766
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7486462093862816
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

In [118]:
# find the avg number of user per genre for the google based dataset
users_per_genre(eng_free_google_apps, genre_index= 9,user_count_index=5)

Communication : 38456119
Adventure;Action & Adventure : 35333333
Video Players & Editors : 24947336
Social : 23253652
Arcade : 22888365
Casual : 19630959
Puzzle;Action & Adventure : 18366667
Photography : 17840110
Educational;Action & Adventure : 17016667
Productivity : 16772839
Racing : 15910646
Travel & Local : 14051476
Casual;Action & Adventure : 12916667
Action : 12603589
Strategy : 11199903
Tools : 10815566
Tools;Education : 10000000
Role Playing;Brain Games : 10000000
Lifestyle;Pretend Play : 10000000
Casual;Music & Video : 10000000
Card;Action & Adventure : 10000000
Adventure;Education : 10000000
News & Magazines : 9549178
Music : 9445583
Educational;Pretend Play : 9375000
Word : 9094459
Puzzle;Brain Games : 9013125
Racing;Action & Adventure : 8816667
Books & Reference : 8767812
Puzzle : 8302862
Video Players & Editors;Music & Video : 7500000
Shopping : 7036877
Role Playing;Action & Adventure : 7000000
Casual;Pretend Play : 6957143
Entertainment;Music & Video : 6413333
Action;Ac

# Based on this analysis
## There is a large number of users for photography and book apps Yet the number of apps in these two categories is under 3% of the total free english apps number
Consider exploring this idea further to creat an app that combines both