In [1]:
# packages
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Initialise Plotly's offline notebook mode so the iplot() calls below can render inline.
init_notebook_mode(connected=True)

* 套件需求：`$ pip install plotly`
* `offline` 和 `init_notebook_mode` 為必要

# Google Play Store apps analytics

* Kaggle 資料來源：[Google Play Store Apps](https://www.kaggle.com/lava18/google-play-store-apps)
* 只有用到第一個資料表

In [2]:
# Read the Play Store apps table from the working directory and preview it.
data = pd.read_csv(filepath_or_buffer='googleplaystore.csv')
data.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [3]:
# Summarise column dtypes and non-null counts to spot missing or mistyped fields.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App               10841 non-null object
Category          10841 non-null object
Rating            9367 non-null float64
Reviews           10841 non-null object
Size              10841 non-null object
Installs          10841 non-null object
Type              10840 non-null object
Price             10841 non-null object
Content Rating    10840 non-null object
Genres            10841 non-null object
Last Updated      10841 non-null object
Current Ver       10833 non-null object
Android Ver       10838 non-null object
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


資料約 1 萬筆，評分的部分比較少資料

In [4]:
# Flag the row whose Category holds the stray value '1.9' and display it.
mask = data['Category'].eq('1.9')
data.loc[mask]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [5]:
# The row selected by `mask` has its values one column too far left
# (its Category holds '1.9'), so shift that row one column to the right.
# NOTE(review): the displayed result shows Rating left empty and '1.9' landing
# in Reviews, so the shift alone does not fully repair the row; how values move
# across mixed-dtype columns may depend on the pandas version — confirm.
data[mask] = data[mask].shift(periods=1, axis=1)
data[mask]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,,Life Made WI-Fi Touchscreen Photo Frame,,1.9,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up


In [6]:
# Manually restore the fields of the malformed row that the column shift
# could not place correctly.
index = data[mask].index
data.loc[index, 'App'] = 'Life Made WI-Fi Touch screen Photo Frame'
data.loc[index, 'Category'] = 'LIFESTYLE'
data.loc[index, 'Rating'] = 1.9
data.loc[index, 'Reviews'] = '19'  # kept as text, like the rest of the Reviews column
# `index` holds index *labels*, so select with the label-based .loc; the
# positional .iloc only happened to work because of the default RangeIndex.
data.loc[index]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touch screen Photo Frame,LIFESTYLE,1.9,19,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up


修正資料錯誤

## App categories

In [7]:
# Count apps per category; value_counts orders the result from most to least frequent.
temp = data.loc[:, 'Category'].value_counts()
temp.head(n=5)

FAMILY      1972
GAME        1144
TOOLS        843
MEDICAL      463
BUSINESS     460
Name: Category, dtype: int64

In [8]:
# One colour per bar: a distinct shade for the first five and last five
# categories of `temp`, and a common shade for everything in between.
colors = (
    ['#7fb0ff'] * 5
    + ['#1c66ce'] * (len(temp) - 10)
    + ['#04367c'] * 5
)

trace = go.Bar(x=temp.index, y=temp.values, marker=dict(color=colors))

# Slanted, smaller tick labels keep the long category names readable.
layout = go.Layout(
    title='App categories',
    xaxis=dict(tickangle=30, tickfont=dict(size=10), automargin=True),
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

* App 以家庭、遊戲與工具向為大宗，最少則是育兒、漫畫、美妝
* 實際上美妝相關的修圖軟體是歸類在攝影
* 數量多不一定代表熱門、產值高，只能推測說因為細項多與競爭多

In [9]:
# Restrict to the GAME category and count how often each genre occurs.
mask = data['Category'].eq('GAME')
temp = data.loc[mask, 'Genres'].value_counts()
temp.head(n=5)

Action       365
Arcade       220
Racing        98
Adventure     75
Card          48
Name: Genres, dtype: int64

In [10]:
# Five graded shades for the first five genres, one shade for the next five,
# and the lightest shade for all remaining genres.
colors = (
    ['#1752a5', '#2f6ec6', '#4a85d6', '#619ae8', '#7daced']
    + ['#93bbf2'] * 5
    + ['#b5d4ff'] * (len(temp) - 10)
)

# hole=0.5 renders the pie as a donut.
trace = go.Pie(
    labels=temp.index,
    values=temp.values,
    marker=dict(colors=colors),
    hole=0.5,
)

layout = go.Layout(title='Game categories')

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

遊戲的分類數量其實不亞於主分類，種類主要為動作、街機與賽車

## App fee type

In [11]:
# Tally free versus paid apps (value_counts skips rows with a missing Type).
temp = data.loc[:, 'Type'].value_counts()
temp

Free    10040
Paid      800
Name: Type, dtype: int64

In [12]:
# One colour per fee type, in the order the types appear in `temp`.
colors = ['#5394fc', '#2d68c6']

trace = go.Pie(
    labels=temp.index,
    values=temp.values,
    marker=dict(colors=colors),
    hole=0.7,
)

# Fixed square canvas instead of auto-sizing.
layout = go.Layout(title='App fee types', autosize=False, width=500, height=500)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

可以看出免費軟體佔了極大多數，但即便如此仍無法看出收益來自內購或廣告

In [13]:
# Keep only the two columns needed, then split the rows by fee type.
temp = data.loc[:, ['Category', 'Type']].copy()
mask1 = temp['Type'].eq('Free')
mask2 = temp['Type'].eq('Paid')
free = temp.loc[mask1]
paid = temp.loc[mask2]
paid.head(n=5)

Unnamed: 0,Category,Type
234,BUSINESS,Paid
235,BUSINESS,Paid
290,BUSINESS,Paid
291,BUSINESS,Paid
427,COMMUNICATION,Paid


In [14]:
# Per-category app counts for the free and the paid subsets.
free_counts, paid_counts = (
    subset['Category'].value_counts() for subset in (free, paid)
)
paid_counts.head(n=5)

FAMILY             191
MEDICAL            109
GAME                83
PERSONALIZATION     83
TOOLS               78
Name: Category, dtype: int64

In [15]:
# One bar series per fee type; barmode='stack' piles them per category.
trace1 = go.Bar(
    x=free_counts.index,
    y=free_counts.values,
    name='Free',
    marker=dict(color='#5897fc'),
)
trace2 = go.Bar(
    x=paid_counts.index,
    y=paid_counts.values,
    name='Paid',
    marker=dict(color='#285fb7'),
)

layout = go.Layout(
    barmode='stack',
    title='App fee type by category',
    xaxis=dict(tickangle=30, tickfont=dict(size=10), automargin=True),
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig)

以分類為基礎，更進一步分析收費的類型

## Paid app

In [16]:
# Work on a copy so the raw `data` frame keeps its original Price strings.
temp = data.copy()
# Strip the currency sign literally: '$' is a regex end-of-string anchor, so
# regex=False avoids relying on the pandas-version-dependent default for
# str.replace. astype(float) then converts the whole column in one step.
temp['Price'] = temp['Price'].str.replace('$', '', regex=False).astype(float)
# Show the five most expensive apps.
temp.sort_values(by=['Price'], ascending=False).head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4367,I'm Rich - Trump Edition,LIFESTYLE,3.6,275,7.3M,"10,000+",Paid,400.0,Everyone,Lifestyle,"May 3, 2018",1.0.1,4.1 and up
5364,I am rich (Most expensive app),FINANCE,4.1,129,2.7M,"1,000+",Paid,399.99,Teen,Finance,"December 6, 2017",2,4.0.3 and up
5373,I AM RICH PRO PLUS,FINANCE,4.0,36,41M,"1,000+",Paid,399.99,Everyone,Finance,"June 25, 2018",1.0.2,4.1 and up
5354,I am Rich Plus,FAMILY,4.0,856,8.7M,"10,000+",Paid,399.99,Everyone,Entertainment,"May 19, 2018",3.0,4.4 and up
4197,most expensive app (H),FAMILY,4.3,6,1.5M,100+,Paid,399.99,Everyone,Entertainment,"July 16, 2018",1.0,7.0 and up


付費軟體最高有要到 400 美元，但基本上都是 "I'm rich" 這類的軟體，也曾經上過新聞

In [17]:
# Number of apps at each non-zero price, ordered by price.
mask = temp['Price'].eq(0)
tt = temp.loc[~mask, 'Price'].value_counts().sort_index()
tt.head(n=5)

0.99    148
1.00      3
1.04      1
1.20      1
1.26      1
Name: Price, dtype: int64

In [18]:
# The three price points shared by the most apps.
most = tt.sort_values(ascending=False).iloc[:3]
most

0.99    148
2.99    129
1.99     73
Name: Price, dtype: int64

In [19]:
# Label every price point held in `most` with "price, count". Iterating over
# `most` itself (instead of a hard-coded range(3)) keeps this cell valid for
# any number of entries and avoids leaking loop variables into the notebook.
annotations = [
    go.layout.Annotation(
        x=price,
        y=count,
        xref='x',
        yref='y',
        text=str(price) + ', ' + str(count),
        ax=30,
        ay=-40
    )
    for price, count in zip(most.index, most.values)
]

# One marker per distinct price; the colour value is the negated price,
# mapped onto the 'Blues' colour scale.
trace = go.Scatter(
    x=tt.index,
    y=tt.values,
    mode='markers',
    marker=go.scatter.Marker(
        color=-tt.index,
        colorscale='Blues'
    )
)

layout = go.Layout(
    title='Paid app prices',
    autosize=False,
    width=1000,
    height=400,
    annotations=annotations
)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

基本上費用還是集中在 50 美元內，多數落在 3 美元內

## App installs and reviews

In [20]:
# Keep the rows whose Installs value contains a literal "+"; display the rest.
mask = data['Installs'].str.contains(pat='+', regex=False)
temp = data.loc[mask].copy()
data.loc[~mask]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
9148,Command & Conquer: Rivals,FAMILY,,0,Varies with device,0,,0,Everyone 10+,Strategy,"June 28, 2018",Varies with device,Varies with device


沒 "+" 的安裝數不考慮

In [21]:
# Turn install buckets such as "10,000+" into integers for numeric comparison.
# regex=False makes ',' and '+' literal characters ('+' is a regex quantifier),
# consistent with the literal str.contains('+', regex=False) filter that built `temp`.
# astype(str) keeps the cell re-runnable once the column is already numeric.
temp['Installs'] = (
    temp['Installs']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype('int64')
)
temp.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


修改安裝數的字串以供比較

In [22]:
# Number of apps per install bucket, largest bucket first.
tt = temp['Installs'].value_counts().sort_index().iloc[::-1]
tt.head(n=5)

1000000000      58
500000000       72
100000000      409
50000000       289
10000000      1252
Name: Installs, dtype: int64

In [23]:
# Bars: number of apps in each install bucket.
trace1 = go.Bar(
    x=tt.index,
    y=tt.values,
    name='Installs',
    marker=dict(color='#5897fc'),
)

# Flat reference line at the mean number of apps per bucket.
trace2 = go.Scatter(
    x=tt.index,
    y=[tt.values.mean()] * tt.size,
    name='Count mean',
    marker=dict(color='#285fb7'),
)

# Treat the install counts as category labels rather than a numeric axis.
layout = go.Layout(title='App installs', xaxis=dict(type='category'))

fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig)

最高下載量有超過 10 億的，多數落在千萬、百萬、十萬的區間

In [24]:
# Convert the review counts to integers before dividing.
temp['Reviews'] = temp['Reviews'].map(int)
# Ratio of reviews to the install bucket of each app.
# NOTE(review): an install count of 0 would give inf/NaN here — confirm none remain in `temp`.
temp['Percent'] = temp['Reviews'].div(temp['Installs'])
temp.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Percent
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,0.0159
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,0.001934
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,0.017502
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,0.004313
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,0.00967


In [25]:
# One marker per app: its review-to-install ratio, grouped by category.
trace1 = go.Scatter(
    x=temp['Category'],
    y=temp['Percent'],
    mode='markers',
    name='Reviews percent',
    marker=dict(color='#5897fc'),
)

# Horizontal reference line at the overall mean ratio, spanning every category.
trace2 = go.Scatter(
    x=temp['Category'].unique(),
    y=[temp['Percent'].mean()] * len(temp['Category'].unique()),
    name='percent mean',
    marker=dict(color='#285fb7'),
)

# The annotation labels the mean line at the last category on the axis.
layout = go.Layout(
    title='App reviews percent',
    xaxis=dict(tickangle=30, tickfont=dict(size=10), automargin=True),
    annotations=[
        dict(
            x=temp['Category'].unique()[-1],
            y=temp['Percent'].mean(),
            xref='x',
            yref='y',
            text='mean: {:.3f}'.format(temp['Percent'].mean()),
            ax=30,
            ay=-40,
        )
    ],
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig)

* 評論於下載量的比例極低，平均不到 0.5% 的人會評論
* 有些特例是評論比大於 1，這部分有待商榷，目前 google 商店應該是要先下載才能評論才對