# Profitable App Profiles for the App Store and Google Play Markets

## Goal: Analyze data to help developers understand what types of apps are likely to attract more users

In [60]:
import pandas as pd
import numpy as np

### Printing the first few rows of the dataset

In [2]:
# Folder holding the Google Play CSV (absolute, machine-specific path).
directory = 'C:/Personal Projects/Dataquest/Python Project/Apple and Google Apps/'
google_play_csv = directory + 'GooglePlayStore.csv'

# Read the raw data and preview the first rows.
df = pd.read_csv(google_play_csv)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## Data exploration
### Number of rows and columns

In [3]:
# (rows, columns) of the freshly loaded dataset.
df.shape

(10841, 13)

In [4]:
# Per-column dtype and non-null count; used to spot missing values and
# columns that were read as text (object).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
# Summary (count / unique / top / freq) restricted to the object-typed columns.
df.describe(include='O')

Unnamed: 0,App,Category,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,10841,10841,10841,10840,10841,10840,10841,10841,10833,10838
unique,9660,34,6002,462,22,3,93,6,120,1378,2832,33
top,ROBLOX,FAMILY,0,Varies with device,"1,000,000+",Free,0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,596,1695,1579,10039,10040,8714,842,326,1459,2451


## Data Cleaning and Transformation
### Modify column names

In [6]:
# Normalise every column label to lower case.
df.columns = [label.lower() for label in df.columns]

In [7]:
# Swap the labels that contain spaces for short snake_case names.
short_names = {
    'content rating': 'cont_rating',
    'last updated': 'last_updated',
    'current ver': 'curr_ver',
    'android ver': 'and_ver',
}
df.rename(columns=short_names, inplace=True)

In [8]:
# Confirm the column labels after lower-casing and renaming.
df.columns

Index(['app', 'category', 'rating', 'reviews', 'size', 'installs', 'type',
       'price', 'cont_rating', 'genres', 'last_updated', 'curr_ver',
       'and_ver'],
      dtype='object')

In [9]:
# Preview the data under the new column labels.
df.head()

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,cont_rating,genres,last_updated,curr_ver,and_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Checking for duplicates
- First, sort by app name (ascending) and by number of reviews (descending)
- Count how many duplicated rows exist
- Drop the duplicated rows. For each app we want to keep the row with the highest number of reviews

In [10]:
# Order rows by app name (ascending) and review count (descending) so that the
# first row of every app is the one with the most reviews.
#
# `reviews` was read as text (object dtype), so sorting the column directly
# compares strings character by character ('999' would rank above '14201891').
# Sort on a temporary numeric copy instead; values that cannot be parsed
# become NaN and are placed last within their app. The `reviews` column itself
# is left untouched.
df = (
    df.assign(_reviews_num=pd.to_numeric(df['reviews'], errors='coerce'))
      .sort_values(by=['app', '_reviews_num'], ascending=[True, False])
      .drop(columns='_reviews_num')
)

In [11]:
# Rows whose app name already appeared earlier in the (sorted) frame.
dup_df = df.loc[df.duplicated(subset='app')]

In [12]:
# (rows, columns) of the duplicated rows found above.
dup_df.shape

(1181, 13)

In [13]:
# Flag every repeat of an app name after its first row, then keep only the
# unflagged rows.
remove_df = df.duplicated(subset='app', keep='first')
df = df.loc[~remove_df]

In [14]:
# (rows, columns) once duplicated app names have been removed.
df.shape

(9660, 13)

In [15]:
# Spot check: the single row that survived de-duplication for one known app.
df.loc[df['app'] == '8 Ball Pool']

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,cont_rating,genres,last_updated,curr_ver,and_ver
1871,8 Ball Pool,GAME,4.5,14201891,52M,"100,000,000+",Free,0,Everyone,Sports,"July 31, 2018",4.0.0,4.0.3 and up


- Checking against the original dataset confirms that 14201891 is the highest review count among the 8 Ball Pool rows, so the row with the most reviews was kept.

### Removing Non-English Apps

In [16]:
def non_eng_remover(val, max_non_ascii=3):
    """Return ``val`` unchanged, or the marker ``'delete'`` if it looks non-English.

    A name is treated as non-English when it contains more than
    ``max_non_ascii`` characters outside the ASCII range (code point > 127).
    The allowance lets names that carry a few emoji or symbols such as ``™``
    pass through unchanged.

    Parameters
    ----------
    val : str
        App name to inspect.
    max_non_ascii : int, default 3
        Largest number of non-ASCII characters that is still accepted.

    Returns
    -------
    str
        ``val`` itself, or the string ``'delete'`` when the limit is exceeded.
    """
    non_ascii_count = sum(1 for letter in val if ord(letter) > 127)
    return 'delete' if non_ascii_count > max_non_ascii else val

# New column: the app name, or the marker 'delete' for names judged non-English.
df['new_app'] = df['app'].map(non_eng_remover)

In [17]:
# Boolean mask: True where the name was replaced by the 'delete' marker.
non_eng_app = df['new_app'].eq('delete')
non_eng_app

8884    False
8532    False
324     False
4541    False
4636    False
        ...  
6334     True
4362    False
2575    False
7559    False
882     False
Name: new_app, Length: 9660, dtype: bool

In [18]:
# Keep only the rows that were not flagged as non-English, then display them.
df = df.loc[~non_eng_app]
df

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,cont_rating,genres,last_updated,curr_ver,and_ver,new_app
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",SPORTS,,27,3.6M,500+,Free,0,Everyone,Sports,"October 7, 2017",0.22,4.1 and up,"""i DT"" Fútbol. Todos Somos Técnicos."
8532,+Download 4 Instagram Twitter,SOCIAL,4.5,40467,22M,"1,000,000+",Free,0,Everyone,Social,"August 2, 2018",5.03,4.1 and up,+Download 4 Instagram Twitter
324,- Free Comics - Comic Apps,COMICS,3.5,115,9.1M,"10,000+",Free,0,Mature 17+,Comics,"July 13, 2018",5.0.12,5.0 and up,- Free Comics - Comic Apps
4541,.R,TOOLS,4.5,259,203k,"10,000+",Free,0,Everyone,Tools,"September 16, 2014",1.1.06,1.5 and up,.R
4636,/u/app,COMMUNICATION,4.7,573,53M,"10,000+",Free,0,Mature 17+,Communication,"July 3, 2018",4.2.4,4.1 and up,/u/app
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,"漫咖 Comics - Manga,Novel and Stories",COMICS,4.1,12088,21M,"1,000,000+",Free,0,Mature 17+,Comics,"July 6, 2018",2.3.1,4.0.3 and up,"漫咖 Comics - Manga,Novel and Stories"
4362,💎 I'm rich,LIFESTYLE,3.8,718,26M,"10,000+",Paid,$399.99,Everyone,Lifestyle,"March 11, 2018",1.0.0,4.4 and up,💎 I'm rich
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",SOCIAL,4.6,22098,18M,"1,000,000+",Free,0,Everyone,Social,"July 24, 2018",4.2.4,4.0.3 and up,"💘 WhatsLov: Smileys of love, stickers and GIF"
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,TOOLS,4.0,19,3.2M,"10,000+",Free,0,Everyone,Tools,"October 21, 2017",1.0,4.2 and up,📏 Smart Ruler ↔️ cm/inch measuring for homework!


In [19]:
# The original name column is no longer needed; `new_app` carries the names.
df = df.drop(columns='app')

### Isolating Free Apps

In [20]:
# How many apps fall under each value of `type`.
df['type'].value_counts()

Free    8861
Paid     752
0          1
Name: type, dtype: int64

In [21]:
# How many apps carry each price string.
df['price'].value_counts()

0          8862
$0.99       145
$2.99       124
$1.99        73
$4.99        70
           ... 
$25.99        1
$389.99       1
$37.99        1
$18.99        1
$1.20         1
Name: price, Length: 93, dtype: int64

In [22]:
# Boolean mask: True where the price string is exactly '0' (a free app).
free_apps = df['price'].eq('0')
free_apps

8884     True
8532     True
324      True
4541     True
4636     True
        ...  
313      True
4362    False
2575     True
7559     True
882      True
Name: price, Length: 9615, dtype: bool

In [23]:
# Restrict the analysis to free apps and report the remaining size.
df = df.loc[free_apps]
df.shape

(8862, 13)

## Data Analysis
### Most common apps by genre

In [29]:
# Look at the raw `genres` values.
df.genres

8884               Sports
8532               Social
324                Comics
4541                Tools
4636        Communication
              ...        
3824    Maps & Navigation
313                Comics
2575               Social
7559                Tools
882         Entertainment
Name: genres, Length: 8862, dtype: object

In [43]:
# Share of apps per category, as a percentage rounded to two decimals.
(
    df['category']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)

FAMILY                 18.93
GAME                    9.69
TOOLS                   8.45
BUSINESS                4.59
LIFESTYLE               3.90
PRODUCTIVITY            3.89
FINANCE                 3.70
MEDICAL                 3.52
SPORTS                  3.40
PERSONALIZATION         3.32
COMMUNICATION           3.24
HEALTH_AND_FITNESS      3.08
PHOTOGRAPHY             2.95
NEWS_AND_MAGAZINES      2.80
SOCIAL                  2.66
TRAVEL_AND_LOCAL        2.34
SHOPPING                2.25
BOOKS_AND_REFERENCE     2.14
DATING                  1.86
VIDEO_PLAYERS           1.79
MAPS_AND_NAVIGATION     1.40
FOOD_AND_DRINK          1.24
EDUCATION               1.17
ENTERTAINMENT           0.96
LIBRARIES_AND_DEMO      0.94
AUTO_AND_VEHICLES       0.93
HOUSE_AND_HOME          0.82
WEATHER                 0.80
EVENTS                  0.71
PARENTING               0.65
ART_AND_DESIGN          0.64
COMICS                  0.62
BEAUTY                  0.60
Name: category, dtype: float64

### Most popular apps by genre (average installs per category)

In [49]:
# Strip the thousands separators and the trailing '+' so the install counts
# can be converted to integers.
# regex=False makes both replacements literal. '+' is a regex quantifier, and
# leaving `regex` at its default raises a FutureWarning because that default
# is not the same across pandas versions.
df['installs'] = (
    df['installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
)

  df.installs = df.installs.str.replace(',', '').str.replace('+', '')


In [58]:
# Convert the cleaned install strings to integers.
# Use an explicit 64-bit type: plain 'int' follows the platform default, which
# can be 32-bit (e.g. on Windows with older NumPy) and leaves little headroom
# for values of 1,000,000,000 and sums built from them.
df['installs'] = df['installs'].astype('int64')

In [89]:
# Average installs per app in each category: total installs of the category
# floor-divided by the number of apps in it (both aligned on category name).
installs_per_category = df.groupby('category')['installs'].sum().sort_index()
apps_per_category = df['category'].value_counts().sort_index()
num_of_installs_by_app_genre = installs_per_category // apps_per_category

In [91]:
# Rank the categories from highest to lowest average installs.
num_of_installs_by_app_genre.sort_values(ascending=False)

category
COMMUNICATION          38456119
VIDEO_PLAYERS          24727872
SOCIAL                 23253652
PHOTOGRAPHY            17805627
PRODUCTIVITY           16787331
GAME                   15560965
TRAVEL_AND_LOCAL       13984077
ENTERTAINMENT          11640705
TOOLS                  10682301
NEWS_AND_MAGAZINES      9549178
BOOKS_AND_REFERENCE     8767811
SHOPPING                7036877
PERSONALIZATION         5201482
WEATHER                 5074486
HEALTH_AND_FITNESS      4188821
MAPS_AND_NAVIGATION     4056941
FAMILY                  3694276
SPORTS                  3638640
ART_AND_DESIGN          1986335
FOOD_AND_DRINK          1924897
EDUCATION               1820673
BUSINESS                1712290
LIFESTYLE               1437816
FINANCE                 1387692
HOUSE_AND_HOME          1331540
DATING                   854028
COMICS                   817657
AUTO_AND_VEHICLES        647317
LIBRARIES_AND_DEMO       638503
PARENTING                542603
BEAUTY                   513151

In [96]:
# Communication apps whose install count is above 100 million.
communication = df['category'].eq('COMMUNICATION')
installation = df['installs'].gt(100000000)

boolean = communication & installation

df.loc[boolean, ['new_app', 'installs']]

Unnamed: 0,new_app,installs
451,Gmail,1000000000
411,Google Chrome: Fast & Secure,1000000000
371,Google Duo - High Quality Video Calls,500000000
464,Hangouts,1000000000
403,LINE: Free Calls & Messages,500000000
382,Messenger – Text and Video Chat for Free,1000000000
391,Skype - free IM & video calls,1000000000
420,UC Browser - Fast Download Private & Secure,500000000
4676,Viber Messenger,500000000
336,WhatsApp Messenger,1000000000


## Getting ready to load the clean data into a database
- Renaming the `new_app` column back to `app`
- Dropping unnecessary columns (`last_updated`, `curr_ver`, `and_ver`)

In [99]:
# Give the cleaned name column its original label back.
# Renaming by label (rather than assigning a full positional list of labels)
# cannot mislabel other columns if their order or number changes, and it is
# safe to re-run: once `new_app` is gone the call is a no-op.
df = df.rename(columns={'new_app': 'app'})

In [103]:
# Remove the columns that are not wanted in the final data set.
unneeded_columns = ['last_updated', 'curr_ver', 'and_ver']
df.drop(columns=unneeded_columns, inplace=True)

In [104]:
# Display the cleaned frame that is ready to be loaded.
df

Unnamed: 0,category,rating,reviews,size,installs,type,price,cont_rating,genres,app
8884,SPORTS,,27,3.6M,500,Free,0,Everyone,Sports,"""i DT"" Fútbol. Todos Somos Técnicos."
8532,SOCIAL,4.5,40467,22M,1000000,Free,0,Everyone,Social,+Download 4 Instagram Twitter
324,COMICS,3.5,115,9.1M,10000,Free,0,Mature 17+,Comics,- Free Comics - Comic Apps
4541,TOOLS,4.5,259,203k,10000,Free,0,Everyone,Tools,.R
4636,COMMUNICATION,4.7,573,53M,10000,Free,0,Mature 17+,Communication,/u/app
...,...,...,...,...,...,...,...,...,...,...
3824,MAPS_AND_NAVIGATION,4.4,50459,Varies with device,5000000,Free,0,Everyone,Maps & Navigation,乗換NAVITIME　Timetable & Route Search in Japan T...
313,COMICS,4.1,12088,21M,1000000,Free,0,Mature 17+,Comics,"漫咖 Comics - Manga,Novel and Stories"
2575,SOCIAL,4.6,22098,18M,1000000,Free,0,Everyone,Social,"💘 WhatsLov: Smileys of love, stickers and GIF"
7559,TOOLS,4.0,19,3.2M,10000,Free,0,Everyone,Tools,📏 Smart Ruler ↔️ cm/inch measuring for homework!
