In [54]:
#In order to create a meaningful mobile app that consumers would actually like to use, it is critical to conduct
#some initial market research. For this data project, I am planning to work with an Apple App Store dataset to see
#which apps are the most popular among the users who are downloading and using them today. I found this dataset on
#Kaggle and it looked very interesting because the data was collected from the iTunes Search API at the Apple Inc.
#website. The dataset contains roughly 7,200 mobile apps (7,197 rows), each presented with its descriptive information.

#Key Steps:
#1. Understand the Key Objective: Determining what are the qualities of a competitive and desirable mobile app.
#2. Conduct Preliminary Data Exploration: Gaining a clear understanding of the entire dataset and brainstorm strategy.
#3. Perform Data Cleaning: Cleaning the dataset for better workability.
#4. Conduct Exploratory Data Analysis (EDA) and Visualization
#5. Test Assumptions and Hypotheses, and Uncover Trends

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#I will first import the dataset and read it into a pandas DataFrame, using the first CSV column as the row index.
#My objective with this dataset will be to determine the qualities of a mobile app that consumers love.
appstore = pd.read_csv("AppleStore.csv", index_col=0)

In [55]:
#Let's start the preliminary data exploration by first taking a comprehensive overview of the data, followed by
#a look at the headers and the top 3 rows.

#Comprehensive overview: info() prints the index range, each column's non-null count and dtype, and memory usage.
appstore.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7197 entries, 1 to 11097
Data columns (total 16 columns):
id                  7197 non-null int64
track_name          7197 non-null object
size_bytes          7197 non-null int64
currency            7197 non-null object
price               7197 non-null float64
rating_count_tot    7197 non-null int64
rating_count_ver    7197 non-null int64
user_rating         7197 non-null float64
user_rating_ver     7197 non-null float64
ver                 7197 non-null object
cont_rating         7197 non-null object
prime_genre         7197 non-null object
sup_devices.num     7197 non-null int64
ipadSc_urls.num     7197 non-null int64
lang.num            7197 non-null int64
vpp_lic             7197 non-null int64
dtypes: float64(3), int64(8), object(5)
memory usage: 955.9+ KB


In [71]:
#Column headers and the first 3 rows of the raw dataframe.
appstore.head(3)

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1


In [57]:
#Next, let's list all the column names in this dataset to decide which information to focus on.
appstore.columns

Index(['id', 'track_name', 'size_bytes', 'currency', 'price',
       'rating_count_tot', 'rating_count_ver', 'user_rating',
       'user_rating_ver', 'ver', 'cont_rating', 'prime_genre',
       'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'],
      dtype='object')

In [58]:
#Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
appstore.describe()

Unnamed: 0,id,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
count,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0
mean,863131000.0,199134500.0,1.726218,12892.91,460.373906,3.526956,3.253578,37.361817,3.7071,5.434903,0.993053
std,271236800.0,359206900.0,5.833006,75739.41,3920.455183,1.517948,1.809363,3.737715,1.986005,7.919593,0.083066
min,281656500.0,589824.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
25%,600093700.0,46922750.0,0.0,28.0,1.0,3.5,2.5,37.0,3.0,1.0,1.0
50%,978148200.0,97153020.0,0.0,300.0,23.0,4.0,4.0,37.0,5.0,1.0,1.0
75%,1082310000.0,181924900.0,1.99,2793.0,140.0,4.5,4.5,38.0,5.0,8.0,1.0
max,1188376000.0,4025970000.0,299.99,2974676.0,177050.0,5.0,5.0,47.0,5.0,75.0,1.0


In [65]:
#Now that I have an overview of the whole dataset, I have some idea of what I would like to look at.
#However, before I approach the analysis, I would like to clean up my dataset and select the relevant columns
#before doing some EDA.

#First, I make a copy of the appstore dataframe so the raw data stays untouched in case I want to revert my changes.
#Then I convert the values in 'size_bytes' from bytes to megabytes (1 MB = 1,000,000 bytes), rounded to 2 decimals,
#since I am more used to working with MB. The column name itself is changed to 'size_mb' in the following cell.
#The column is replaced with plain [] assignment rather than .loc[:, col] = ..., because .loc tries to write the
#float results into the existing int64 column in place, which recent pandas versions warn about.
BYTES_PER_MB = 1000000
appstore_c = appstore.copy()
appstore_c['size_bytes'] = (appstore['size_bytes'] / BYTES_PER_MB).round(2)

In [87]:
#The size values are now expressed in MB, so the column is renamed to 'size_mb' to not confuse anybody.
#'track_name' is renamed to 'app_name' in the same step.
column_renames = {'track_name': 'app_name', 'size_bytes': 'size_mb'}
appstore_c.rename(columns=column_renames, inplace=True)
appstore_c.head(3)

Unnamed: 0,id,app_name,size_mb,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre
1,281656475,PAC-MAN Premium,100.79,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games
2,281796108,Evernote - stay organized,158.58,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100.52,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather


In [88]:
#With the dataset cleaned a little, I will select the relevant columns I would like to work with.
#Columns like 'sup_devices.num', 'ipadSc_urls.num', 'lang.num' and 'vpp_lic' are not necessary for my strategy of
#analysis, so I keep only the label range from 'id' through 'prime_genre' (inclusive) to be more concise.
relevant_columns = slice('id', 'prime_genre')
appstore_c = appstore_c.loc[:, relevant_columns]
appstore_c.head(3)

Unnamed: 0,id,app_name,size_mb,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre
1,281656475,PAC-MAN Premium,100.79,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games
2,281796108,Evernote - stay organized,158.58,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100.52,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather


In [96]:
#Before performing any EDA to learn which types of mobile apps are the most popular, I would like to do my due
#diligence and check the data for errors or duplicates. First check: rows whose app id repeats an earlier row's id.
id_dup = appstore_c['id'].duplicated()
appstore_c[id_dup]

Unnamed: 0,id,app_name,size_mb,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre


In [97]:
#Second check: app names that occur more than once. keep=False flags every occurrence, not just the repeats.
appname_dup = appstore_c['app_name'].duplicated(keep=False)
appstore_c[appname_dup]

Unnamed: 0,id,app_name,size_mb,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre
4000,952877179,VR Roller Coaster,169.52,USD,0.0,107,102,3.5,3.5,2.0.0,4+,Games
7579,1089824278,VR Roller Coaster,240.96,USD,0.0,67,44,3.5,4.0,0.81,4+,Games
10751,1173990889,Mannequin Challenge,109.71,USD,0.0,668,87,3.0,3.0,1.4,9+,Games
10885,1178454060,Mannequin Challenge,59.57,USD,0.0,105,58,4.0,4.5,1.0.1,4+,Games


In [99]:
#From my initial due diligence, I do not see any duplicated app ids, which is comforting to know. However, two app
#names appear twice: 'VR Roller Coaster' and 'Mannequin Challenge'. Each pair has very similar qualities (same
#genre and price), but the two listings in a pair have different app ids.

appstore_c.describe()

#I believe that these duplicates are most likely different versions of the same app. In both pairs, the listing that
#appears first in the dataset has more total ratings (107 vs. 67 and 668 vs. 105), so I will keep it and drop the other.

Unnamed: 0,id,size_mb,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver
count,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0
mean,863131000.0,199.134477,1.726218,12892.91,460.373906,3.526956,3.253578
std,271236800.0,359.206912,5.833006,75739.41,3920.455183,1.517948,1.809363
min,281656500.0,0.59,0.0,0.0,0.0,0.0,0.0
25%,600093700.0,46.92,0.0,28.0,1.0,3.5,2.5
50%,978148200.0,97.15,0.0,300.0,23.0,4.0,4.0
75%,1082310000.0,181.92,1.99,2793.0,140.0,4.5,4.5
max,1188376000.0,4025.97,299.99,2974676.0,177050.0,5.0,5.0


In [100]:
#Keep exactly one row per app name: the listing with the most total ratings. The row is chosen explicitly by
#'rating_count_tot' instead of relying on the order of the rows; method='first' breaks ties in favour of the row
#that appears first. The result is reassigned rather than modified in place, and the row order is preserved.
rating_rank = appstore_c.groupby('app_name')['rating_count_tot'].rank(method='first', ascending=False)
appstore_c = appstore_c.loc[rating_rank == 1]

#Sanity check: every app name should now appear only once.
assert appstore_c['app_name'].is_unique

In [103]:
appname_dup = appstore_c['app_name'].duplicated(keep=False)
appstore_c[appname_dup]

#No app name is duplicated in the cleaned appstore dataframe any more. To make sure I have done it correctly,
#I will go ahead and find the remaining rows for 'VR Roller Coaster' and 'Mannequin Challenge' to see if I kept
#the correct ones.

Unnamed: 0,id,app_name,size_mb,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre


In [107]:
#Boolean masks selecting the rows for each of the two previously duplicated app names.
app_names = appstore_c['app_name']
vr_bool = app_names == 'VR Roller Coaster'
mc_bool = app_names == 'Mannequin Challenge'

#Print the retained row for each name, one after the other.
for name_mask in (vr_bool, mc_bool):
    print(appstore_c[name_mask])

             id           app_name  size_mb currency  price  rating_count_tot  \
4000  952877179  VR Roller Coaster   169.52      USD    0.0               107   

      rating_count_ver  user_rating  user_rating_ver    ver cont_rating  \
4000               102          3.5              3.5  2.0.0          4+   

     prime_genre  
4000       Games  
               id             app_name  size_mb currency  price  \
10751  1173990889  Mannequin Challenge   109.71      USD    0.0   

       rating_count_tot  rating_count_ver  user_rating  user_rating_ver  ver  \
10751               668                87          3.0              3.0  1.4   

      cont_rating prime_genre  
10751          9+       Games  


In [110]:
#Checking the 'currency' column shows that every app is priced in USD, so there is really no need to keep
#this column. Let's drop it and rename the price column to note that it is in USD.
appstore_c['currency'].value_counts()

USD    7195
Name: currency, dtype: int64

In [111]:
#Dropping the 'currency' column. errors='ignore' keeps this cell safe to re-run once the column is already gone
#(without it, a second run raises a KeyError).
appstore_c = appstore_c.drop(columns='currency', errors='ignore')

In [113]:
#Renaming 'price' to 'price_usd' so the column name records the currency.
price_rename = {'price': 'price_usd'}
appstore_c.rename(columns=price_rename, inplace=True)
appstore_c.head(3)

Unnamed: 0,id,app_name,size_mb,price_usd,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre
1,281656475,PAC-MAN Premium,100.79,3.99,21292,26,4.0,4.5,6.3.5,4+,Games
2,281796108,Evernote - stay organized,158.58,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100.52,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather


In [114]:
#Now that I have done all the data cleaning I feel is necessary, let's take one more look at the cleaned dataframe.
appstore_c.head(3)

Unnamed: 0,id,app_name,size_mb,price_usd,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre
1,281656475,PAC-MAN Premium,100.79,3.99,21292,26,4.0,4.5,6.3.5,4+,Games
2,281796108,Evernote - stay organized,158.58,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100.52,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather


In [30]:
#At first glance, there are already a few important columns that I want to take a look at: 'prime_genre' and
#'price'. I would like to go ahead and dive deeper into these columns and see if I can find anything interesting.

#Only the last expression of a cell is displayed automatically, so the intermediate summaries are printed explicitly.
print(appstore['prime_genre'].value_counts())

print(appstore['price'].describe())

print(appstore['price'].value_counts().head())

#Share of apps with a price of exactly 0, computed so the statement below is backed by a number.
free_share = (appstore['price'] == 0).mean()
print('Share of free apps: {:.1%}'.format(free_share))

#Interesting, though not completely unexpected: more than half of the apps in the App Store are free (the median
#price is $0.00), with the most expensive app being $299.99 and the average being $1.73. The most common app
#genres are shown by the 'prime_genre' counts printed above.

#From looking at the columns and deducing the meaning of each column, I can already get an idea of what columns
#I will be mainly using. There are some columns that contain information like number of supported devices and 
#number of languages. This is not particularly useful to my current business objective so I may go ahead and just
#do a selection of the dataframe excluding these 4 columns.

#NOTE(review): this overwrites the raw `appstore` frame with the reduced column set; it is kept because later
#cells may rely on it, but the cleaned copy `appstore_c` is the better frame to analyse - confirm and consolidate.
appstore = appstore.loc[:, 'id':'prime_genre']
appstore.head(3)

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre
1,281656475,PAC-MAN Premium,1.007882e-10,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games
2,281796108,Evernote - stay organized,1.585787e-10,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",1.00524e-10,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather
