In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv("googleplaystore.csv")
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


# First deal with null values

In [3]:
# let's check for null values
data.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

* Rating attribute have more null values than other
* so we deal with it first using imputer
* replace null value with its mean

In [4]:
imputer = SimpleImputer(missing_values= np.nan, strategy="mean")
imputer.fit(data[['Rating']])

data['Rating'] = imputer.transform(data[['Rating']])

In [5]:
data.isnull().sum()
# we see that we replace all null values in rating column

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64

* now let's check others
* as we see other values are much less compare to size of data
* so if we drop that rows it not affect our data as much as Rating do
* so we drop rows where null values are present


In [6]:
data.dropna(inplace=True)

# check if it work properly or not?
data.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

# Handling duplicate values

In [7]:
data.duplicated().sum()

483

In [8]:
# deleting last occurence of duplicate rows
data.drop_duplicates(keep='first', inplace=True)

In [9]:
data.duplicated().sum()

0

In [10]:
data.shape

(10346, 13)

# Q1. How many free apps are there in ART_AND_DESIGN?

In [24]:
apps = data[(data['Category']=='ART_AND_DESIGN') & (data['Type']=='Free')].shape[0]
print("There are",apps,"apps in ART_AND_DESIGN which are free.")

There are 61 apps in ART_AND_DESIGN which are free.


### & How many are paids?

In [25]:
apps = data[(data['Category']=='ART_AND_DESIGN') & (data['Type']=='Paid')].shape[0]
print("There are",apps,"apps in ART_AND_DESIGN which are paid.")

There are 3 apps in ART_AND_DESIGN which are paid.


## Q2. How many apps are there in ART_AND_DESIGN with rating greater than               4.5?

In [22]:
apps = data[(data['Category']=='ART_AND_DESIGN') & (data['Rating']>4.5)].shape[0]
print("There are",apps,"apps in ART_AND_DESIGN which has more than 4.5 rating.")

There are 22 apps in ART_AND_DESIGN which has more than 4.5 rating.


## Q3. How many apps are there in FAMILY with rating more than 4.5 and Free?

In [21]:
apps = data[(data['Category']=='FAMILY') & (data['Type']=='Free') & (data['Rating']>4.5)].shape[0]
print("There are",apps,'apps in FAMILY category which are free and have Rating more than 4.5')

There are 313 in FAMILY category which are free and have Rating more than 4.5


## Q3. List all the apps are which are from FAMILY with rating more than 4.5 and Free?

In [30]:
apps = data[(data['Category']=='FAMILY') & (data['Type']=='Free') & (data['Rating']>4.5)]['App']
print(apps)

2020     Super ABC! Learning games for kids! Preschool ...
2023                                       Candy Pop Story
2029                           Dog Run - Pet Dog Simulator
2032       Puzzle Kids - Animals Shapes and Jigsaw Puzzles
2040          No. Color - Color by Number, Number Coloring
                               ...                        
10691                                               Pin-fo
10801                                  Fr Ignacio Outreach
10809                Castle Clash: RPG War and Strategy FR
10820                                      Fr. Daoud Lamei
10837                     Fr. Mike Schmitz Audio Teachings
Name: App, Length: 313, dtype: object


In [31]:
type(apps)

pandas.core.series.Series

# GroupBY

### Q1. Which category has maximum average rating?

In [38]:
data.groupby(by='Category').mean()['Rating']

  data.groupby(by='Category').mean()['Rating']


Category
ART_AND_DESIGN         4.368438
AUTO_AND_VEHICLES      4.190824
BEAUTY                 4.260882
BOOKS_AND_REFERENCE    4.312461
BUSINESS               4.135958
COMICS                 4.156445
COMMUNICATION          4.158216
DATING                 4.013538
EDUCATION              4.374564
ENTERTAINMENT          4.136036
EVENTS                 4.363647
FAMILY                 4.191501
FINANCE                4.135315
FOOD_AND_DRINK         4.168388
GAME                   4.277598
HEALTH_AND_FITNESS     4.251656
HOUSE_AND_HOME         4.169001
LIBRARIES_AND_DEMO     4.182938
LIFESTYLE              4.113799
MAPS_AND_NAVIGATION    4.065061
MEDICAL                4.185279
NEWS_AND_MAGAZINES     4.140784
PARENTING              4.282223
PERSONALIZATION        4.304871
PHOTOGRAPHY            4.183479
PRODUCTIVITY           4.200279
SHOPPING               4.245774
SOCIAL                 4.247001
SPORTS                 4.219279
TOOLS                  4.065970
TRAVEL_AND_LOCAL       4.107539

### Q2. How many free apps are there in each category?

In [43]:
new_data = data[data['Type']=='Free']
new_data.groupby('Category').count()['Type'].sort_values(ascending=False)

Category
FAMILY                 1753
GAME                   1038
TOOLS                   764
BUSINESS                415
PRODUCTIVITY            379
LIFESTYLE               354
FINANCE                 343
COMMUNICATION           339
SPORTS                  327
MEDICAL                 324
PERSONALIZATION         305
PHOTOGRAPHY             302
HEALTH_AND_FITNESS      291
SOCIAL                  277
NEWS_AND_MAGAZINES      262
TRAVEL_AND_LOCAL        225
SHOPPING                222
BOOKS_AND_REFERENCE     201
DATING                  189
VIDEO_PLAYERS           171
MAPS_AND_NAVIGATION     132
EDUCATION               126
FOOD_AND_DRINK          122
ENTERTAINMENT           109
LIBRARIES_AND_DEMO       83
AUTO_AND_VEHICLES        82
HOUSE_AND_HOME           80
WEATHER                  74
EVENTS                   63
ART_AND_DESIGN           61
COMICS                   60
PARENTING                58
BEAUTY                   53
Name: Type, dtype: int64