# Analyzing The Google Play Store Dataset

In [3]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('googleplaystore.csv')
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


# Find Shape of Our Dataset (Number of Rows & Number of Columns)

In [5]:
df.shape

(10841, 13)

In [6]:
print(f'no of rows are {df.shape[0]} and no of columns are {df.shape[1]}')

no of rows are 10841 and no of columns are 13


# Get Information About Our Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


 # Get Overall Statistics About The Dataframe

In [8]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


# Total Number of App Titles Contain Astrology

In [12]:
len(df[df['App'].str.contains('Astrology',case = False)])

3

# Find Average App Rating

In [13]:
df['Rating'].mean()

4.193338315362443

# Find Total Number of Unique Category

In [17]:
len(df['Category'].unique())

34

# Which Category Getting The Highest Average Rating?

In [21]:
df.groupby('Category')['Rating'].mean().sort_values(ascending = False)

Category
1.9                    19.000000
EVENTS                  4.435556
EDUCATION               4.389032
ART_AND_DESIGN          4.358065
BOOKS_AND_REFERENCE     4.346067
PERSONALIZATION         4.335987
PARENTING               4.300000
GAME                    4.286326
BEAUTY                  4.278571
HEALTH_AND_FITNESS      4.277104
SHOPPING                4.259664
SOCIAL                  4.255598
WEATHER                 4.244000
SPORTS                  4.223511
PRODUCTIVITY            4.211396
HOUSE_AND_HOME          4.197368
FAMILY                  4.192272
PHOTOGRAPHY             4.192114
AUTO_AND_VEHICLES       4.190411
MEDICAL                 4.189143
LIBRARIES_AND_DEMO      4.178462
FOOD_AND_DRINK          4.166972
COMMUNICATION           4.158537
COMICS                  4.155172
NEWS_AND_MAGAZINES      4.132189
FINANCE                 4.131889
ENTERTAINMENT           4.126174
BUSINESS                4.121452
TRAVEL_AND_LOCAL        4.109292
LIFESTYLE               4.094904
V

# Find Total Number of App having 5 Star Rating

In [23]:
len(df[df['Rating'] == 5])

274

# Find Average Value of Reviews

In [39]:
df['Reviews'] = df['Reviews'].replace('3.0M',3.0)

In [40]:
df['Reviews'].astype(float).mean()

444111.9265750392

# Find Total Number of Free and Paid Apps

In [42]:
df['Type'].value_counts()

Free    10039
Paid      800
0           1
Name: Type, dtype: int64

#  Which App Has Maximum Reviews?

In [55]:
df['Reviews'] = df['Reviews'].astype(float)
df[df['Reviews'] == df['Reviews'].max()]['App']

2544    Facebook
Name: App, dtype: object

# Display Top 5 Apps Having Highest Reviews

In [69]:
index = df['Reviews'].sort_values(ascending = False).head(5).index
df.loc[index]['App']

2544              Facebook
3943              Facebook
381     WhatsApp Messenger
336     WhatsApp Messenger
3904    WhatsApp Messenger
Name: App, dtype: object

# Find Average Rating of Free and Paid Apps

In [78]:
df.groupby('Type')['Rating'].mean()


Type
0       19.000000
Free     4.186203
Paid     4.266615
Name: Rating, dtype: float64

# Display Top  5 Apps Having Maximum Installs

In [87]:
df['Installs'] = df['Installs'].str.replace('+','')

  df['Installs'] = df['Installs'].str.replace('+','')


In [89]:
df['Installs'] = df['Installs'].str.replace(',','')
df['Installs']

0           10000
1          500000
2         5000000
3        50000000
4          100000
           ...   
10836        5000
10837         100
10838        1000
10839        1000
10840    10000000
Name: Installs, Length: 10841, dtype: object

In [95]:
df['Installs'] = df['Installs'].replace('Free',0)

In [96]:
df['Installs'] = df['Installs'].astype(float)

In [101]:
index = df['Installs'].sort_values(ascending = False).head(5).index
df.loc[index]['App']

3896                              Subway Surfers
3943                                    Facebook
335     Messenger – Text and Video Chat for Free
3523                                Google Drive
3565                                Google Drive
Name: App, dtype: object