# Descriptive Statistics

In [25]:
import re
import pandas as pd
import numpy as np
%matplotlib inline

In [26]:
data = pd.read_csv('/Users/datasets/googleplaystore.csv')
data.shape

(10841, 13)

In [27]:
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Granularity
- Each row represents one application in google store

In [28]:
data.isna().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [None]:
data.head(1)

In [29]:
data.nunique()

App               9660
Category            34
Rating              40
Reviews           6002
Size               462
Installs            22
Type                 3
Price               93
Content Rating       6
Genres             120
Last Updated      1378
Current Ver       2832
Android Ver         33
dtype: int64

In [33]:
categories = ['Category', 'Installs', 'Type', 'Content Rating',
              'Genres', 'Android Ver']
numbers = ['Rating', 'Reviews', 'Installs_numeric', 'Price_numeric']
dates = ['Last Updated']
texts = ['App']
locations = []

In [None]:
#data['Price'].unique()

In [34]:
data['Installs_numeric'] = data['Installs'].apply(
    lambda v: re.sub('[^0-9]', '', v)).replace('', None).astype(float)
data['Price_numeric'] = pd.to_numeric(data['Price'].str.replace('[^0-9\.]',''),
                             errors='coerce')

In [30]:
def categories_descriptive(data, categories):
    summary = data[categories].describe()
    freq_perc = summary.loc['freq'] / data.shape[0] * 100
    freq_perc.name = 'freq_perc'
    summary = summary.append(freq_perc)
    
    nulls = data.shape[0] - summary.loc['count']
    nulls_perc = nulls / data.shape[0] * 100
    nulls_perc.name = 'missing_perc'
    summary = summary.append(nulls_perc)
    return summary

In [31]:
categories_descriptive(data, categories)

Unnamed: 0,Category,Installs,Type,Content Rating,Genres,Android Ver
count,10841,10841,10840,10840,10841,10838
unique,34,22,3,6,120,33
top,FAMILY,"1,000,000+",Free,Everyone,Tools,4.1 and up
freq,1972,1579,10039,8714,842,2451
freq_perc,18.1902,14.5651,92.6022,80.38,7.76681,22.6086
missing_perc,0,0,0.00922424,0.00922424,0,0.0276727


In [35]:
data[numbers].describe()

Unnamed: 0,Rating,Installs_numeric,Price_numeric
count,9367.0,10841.0,10840.0
mean,4.193338,15462920.0,1.027368
std,0.537431,85025570.0,15.949703
min,1.0,0.0,0.0
25%,4.0,1000.0,0.0
50%,4.3,100000.0,0.0
75%,4.5,5000000.0,0.0
max,19.0,1000000000.0,400.0


In [36]:
data['Last Updated'].head(1)

0    January 7, 2018
Name: Last Updated, dtype: object

In [37]:
data['Last Updated'] = pd.to_datetime(data['Last Updated'],
                                     format='%B %d, %Y',
                                     errors='coerce')
data['Last Updated'].head()

0   2018-01-07
1   2018-01-15
2   2018-08-01
3   2018-06-08
4   2018-06-20
Name: Last Updated, dtype: datetime64[ns]

In [38]:
data['Last Updated'].describe()

count                   10840
unique                   1377
top       2018-08-03 00:00:00
freq                      326
first     2010-05-21 00:00:00
last      2018-08-08 00:00:00
Name: Last Updated, dtype: object

In [39]:
data['Last Updated'].min(), data['Last Updated'].max()

(Timestamp('2010-05-21 00:00:00'), Timestamp('2018-08-08 00:00:00'))

In [40]:
data['Last Updated'].value_counts().head(10)

2018-08-03    326
2018-08-02    304
2018-07-31    294
2018-08-01    285
2018-07-30    211
2018-07-25    164
2018-07-26    161
2018-08-06    158
2018-07-27    151
2018-07-24    148
Name: Last Updated, dtype: int64