# Google Play Store

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

First, we'll load the dataset into memory. Here, we're using pandas to read a CSV file.

In [2]:
df = pd.read_csv('googleplaystore.csv')
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## Exploring data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [4]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [5]:
category_counts = df['Category'].value_counts().reset_index()

# Rename the columns for better readability
category_counts.columns = ['Category', 'Count']

# Sort the result for better readability
category_counts = category_counts.sort_values(by='Count', ascending=False)

# Display the count of each category
print(category_counts)

               Category  Count
0                FAMILY   1972
1                  GAME   1144
2                 TOOLS    843
3               MEDICAL    463
4              BUSINESS    460
5          PRODUCTIVITY    424
6       PERSONALIZATION    392
7         COMMUNICATION    387
8                SPORTS    384
9             LIFESTYLE    382
10              FINANCE    366
11   HEALTH_AND_FITNESS    341
12          PHOTOGRAPHY    335
13               SOCIAL    295
14   NEWS_AND_MAGAZINES    283
15             SHOPPING    260
16     TRAVEL_AND_LOCAL    258
17               DATING    234
18  BOOKS_AND_REFERENCE    231
19        VIDEO_PLAYERS    175
20            EDUCATION    156
21        ENTERTAINMENT    149
22  MAPS_AND_NAVIGATION    137
23       FOOD_AND_DRINK    127
24       HOUSE_AND_HOME     88
25   LIBRARIES_AND_DEMO     85
26    AUTO_AND_VEHICLES     85
27              WEATHER     82
28       ART_AND_DESIGN     65
29               EVENTS     64
30            PARENTING     60
31      

## Cleaning duplicates

In [6]:
df.duplicated().sum()

483

In [7]:
df.duplicated(subset='App').sum()

1181

In [8]:
duplicates_app = df[df.duplicated(['App'])].groupby('App').size().reset_index(name='counts')
print(duplicates_app.sort_values(by='counts', ascending=False))

                                                   App  counts
570                                             ROBLOX       8
102  CBS Sports App - Scores, News, Stats & Watch Live       7
211                     Duolingo: Learn Languages Free       6
218                                               ESPN       6
6                                          8 Ball Pool       6
..                                                 ...     ...
301                     Golfshot: Golf GPS + Tee Times       1
302                                             Google       1
305                                   Google Analytics       1
311              Google Duo - High Quality Video Calls       1
797                     wetter.com - Weather and Radar       1

[798 rows x 2 columns]


In [9]:
duplicates_app = df[df.duplicated(['App'], keep=False)]

# Get the names of all duplicate applications and sort them alphabetically
duplicate_app_names = sorted(duplicates_app['App'].unique())

# Display the names of the duplicate applications
print("Duplicate applications sorted alphabetically:", duplicate_app_names)
print("Number of duplicate applications:", len(duplicate_app_names))

Duplicate applications sorted alphabetically: ['10 Best Foods for You', '1800 Contacts - Lens Store', '2017 EMRA Antibiotic Guide', '21-Day Meditation Experience', '365Scores - Live Scores', '420 BZ Budeze Delivery', '8 Ball Pool', '8fit Workouts & Meal Planner', '95Live -SG#1 Live Streaming App', 'A Manual of Acupuncture', 'A&E - Watch Full Episodes of TV Shows', 'AAFP', 'ABC News - US & World News', 'AC - Tips & News for Android™', 'AP Mobile - Breaking News', 'ASCCP Mobile', 'ASOS', 'Accounting App - Zoho Books', 'AccuWeather: Daily Forecast & Live Weather Reports', 'Acorns - Invest Spare Change', 'AdWords Express', 'Ada - Your Health Guide', 'Adobe Acrobat Reader', 'Adobe Photoshop Express:Photo Editor Collage Maker', 'Adult Dirty Emojis', 'Advanced Comprehension Therapy', 'Agar.io', 'Airbnb', 'Airway Ex - Intubate. Anesthetize. Train.', 'AliExpress - Smarter Shopping, Better Living', 'All Football - Latest News & Videos', 'All Mental disorders', 'All Social Networks', "Alto's Adve

In [10]:
# Sorts the DataFrame by the number of reviews in descending order
# drops duplicate rows based on the application name ('App'),
# keeping only the first occurrence of each application with the highest number of reviews.
df = df.sort_values(by='Reviews', ascending=False).drop_duplicates(subset='App')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9660 entries, 2989 to 4177
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             9660 non-null   object 
 1   Category        9660 non-null   object 
 2   Rating          8197 non-null   float64
 3   Reviews         9660 non-null   object 
 4   Size            9660 non-null   object 
 5   Installs        9660 non-null   object 
 6   Type            9659 non-null   object 
 7   Price           9660 non-null   object 
 8   Content Rating  9659 non-null   object 
 9   Genres          9660 non-null   object 
 10  Last Updated    9660 non-null   object 
 11  Current Ver     9652 non-null   object 
 12  Android Ver     9657 non-null   object 
dtypes: float64(1), object(12)
memory usage: 1.0+ MB


In [12]:
df.duplicated().sum()

0

## Exploring null values

In [13]:
df.isnull().sum().sort_values(ascending=False)

Rating            1463
Current Ver          8
Android Ver          3
Type                 1
Content Rating       1
App                  0
Category             0
Reviews              0
Size                 0
Installs             0
Price                0
Genres               0
Last Updated         0
dtype: int64

In [14]:
(df.isnull().sum() * 100 / len(df)).sort_values(ascending=False)

Rating            15.144928
Current Ver        0.082816
Android Ver        0.031056
Type               0.010352
Content Rating     0.010352
App                0.000000
Category           0.000000
Reviews            0.000000
Size               0.000000
Installs           0.000000
Price              0.000000
Genres             0.000000
Last Updated       0.000000
dtype: float64

In [15]:
df = df.dropna(subset=['Rating'])

In [16]:
df['Rating'].unique()

array([ 4.2,  3.3,  4. ,  4.7,  4.3,  3.7,  4.6,  4.5,  4.4,  3.9,  4.1,
        2.4,  3.6,  2.9,  4.8,  2.8,  3.5,  1.9,  3.8,  3. ,  3.2,  4.9,
        2.6,  3.4,  2.7,  5. ,  2.3,  2. ,  2.2,  2.1,  2.5,  3.1,  1.7,
        1.6,  1.5,  1.8,  1. ,  1.4,  1.2, 19. ])

In [17]:
df['Rating'].value_counts(dropna=False).sort_values(ascending=False)

4.4     897
4.3     897
4.5     849
4.2     811
4.6     683
4.1     621
4.0     513
4.7     438
3.9     359
3.8     286
5.0     271
3.7     224
4.8     221
3.6     167
3.5     156
3.4     126
3.3     100
4.9      85
3.0      81
3.1      69
3.2      63
2.9      45
2.8      40
2.6      24
2.7      23
2.5      20
2.3      20
2.4      19
1.0      16
2.2      14
2.0      12
1.9      11
1.7       8
1.8       8
2.1       8
1.6       4
1.5       3
1.4       3
1.2       1
19.0      1
Name: Rating, dtype: int64

In [18]:
#Changing the error in the 19 rating
df['Rating'].replace(19.0, 1.9, inplace=True)

In [19]:
df['Rating'].value_counts(dropna=False).sort_values(ascending=False)

4.4    897
4.3    897
4.5    849
4.2    811
4.6    683
4.1    621
4.0    513
4.7    438
3.9    359
3.8    286
5.0    271
3.7    224
4.8    221
3.6    167
3.5    156
3.4    126
3.3    100
4.9     85
3.0     81
3.1     69
3.2     63
2.9     45
2.8     40
2.6     24
2.7     23
2.3     20
2.5     20
2.4     19
1.0     16
2.2     14
1.9     12
2.0     12
1.8      8
2.1      8
1.7      8
1.6      4
1.5      3
1.4      3
1.2      1
Name: Rating, dtype: int64

In [20]:
df['Rating'].value_counts(dropna=False).sort_index(ascending=False)

5.0    271
4.9     85
4.8    221
4.7    438
4.6    683
4.5    849
4.4    897
4.3    897
4.2    811
4.1    621
4.0    513
3.9    359
3.8    286
3.7    224
3.6    167
3.5    156
3.4    126
3.3    100
3.2     63
3.1     69
3.0     81
2.9     45
2.8     40
2.7     23
2.6     24
2.5     20
2.4     19
2.3     20
2.2     14
2.1      8
2.0     12
1.9     12
1.8      8
1.7      8
1.6      4
1.5      3
1.4      3
1.2      1
1.0     16
Name: Rating, dtype: int64

## Exploring other mistakes in data

Please note that I observed inaccuracies in the data layout of the Life Made Wi-Fi Touchscreen Photo Frame application. Consequently, I've opted to adjust this information for accuracy.

In [21]:
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Category'] = 'No Category'
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Rating'] = 1.9
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Reviews'] = 19
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Size'] = '3.0M'
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Installs'] = '1,000+'
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Type'] = 'Free'
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Price'] = 0
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Content Rating'] = 'Everyone'
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Genres'] = ''
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Last Updated'] = 'February 11, 2018'
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Current Ver'] = '1.0.19'
df.loc[df['App'] == 'Life Made WI-Fi Touchscreen Photo Frame', 'Android Ver'] = '4.0 and up'

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8197 entries, 2989 to 2482
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8197 non-null   object 
 1   Category        8197 non-null   object 
 2   Rating          8197 non-null   float64
 3   Reviews         8197 non-null   object 
 4   Size            8197 non-null   object 
 5   Installs        8197 non-null   object 
 6   Type            8197 non-null   object 
 7   Price           8197 non-null   object 
 8   Content Rating  8197 non-null   object 
 9   Genres          8197 non-null   object 
 10  Last Updated    8197 non-null   object 
 11  Current Ver     8193 non-null   object 
 12  Android Ver     8195 non-null   object 
dtypes: float64(1), object(12)
memory usage: 896.5+ KB


After that I prefer to drop that line

In [23]:
df = df[df['App'] != 'Life Made WI-Fi Touchscreen Photo Frame']

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8196 entries, 2989 to 2482
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8196 non-null   object 
 1   Category        8196 non-null   object 
 2   Rating          8196 non-null   float64
 3   Reviews         8196 non-null   object 
 4   Size            8196 non-null   object 
 5   Installs        8196 non-null   object 
 6   Type            8196 non-null   object 
 7   Price           8196 non-null   object 
 8   Content Rating  8196 non-null   object 
 9   Genres          8196 non-null   object 
 10  Last Updated    8196 non-null   object 
 11  Current Ver     8192 non-null   object 
 12  Android Ver     8194 non-null   object 
dtypes: float64(1), object(12)
memory usage: 896.4+ KB


In [25]:
df['App'].nunique()

8196

In [26]:
df['Category'].unique()

array(['SPORTS', 'TOOLS', 'SHOPPING', 'TRAVEL_AND_LOCAL', 'FAMILY',
       'VIDEO_PLAYERS', 'COMMUNICATION', 'COMICS', 'GAME', 'MEDICAL',
       'ENTERTAINMENT', 'AUTO_AND_VEHICLES', 'PRODUCTIVITY', 'EDUCATION',
       'EVENTS', 'PARENTING', 'PHOTOGRAPHY', 'LIFESTYLE', 'BUSINESS',
       'WEATHER', 'BOOKS_AND_REFERENCE', 'PERSONALIZATION',
       'HEALTH_AND_FITNESS', 'NEWS_AND_MAGAZINES', 'DATING', 'FINANCE',
       'ART_AND_DESIGN', 'BEAUTY', 'SOCIAL', 'MAPS_AND_NAVIGATION',
       'LIBRARIES_AND_DEMO', 'FOOD_AND_DRINK', 'HOUSE_AND_HOME'],
      dtype=object)

In [27]:
df['Category'].nunique()

33

In [28]:
df['Category'].value_counts(dropna=False)

FAMILY                 1654
GAME                    895
TOOLS                   719
FINANCE                 302
PRODUCTIVITY            301
LIFESTYLE               301
PERSONALIZATION         298
MEDICAL                 290
PHOTOGRAPHY             263
BUSINESS                263
SPORTS                  260
COMMUNICATION           256
HEALTH_AND_FITNESS      244
NEWS_AND_MAGAZINES      204
SOCIAL                  203
TRAVEL_AND_LOCAL        187
SHOPPING                180
BOOKS_AND_REFERENCE     169
VIDEO_PLAYERS           149
DATING                  134
MAPS_AND_NAVIGATION     118
EDUCATION               105
FOOD_AND_DRINK           94
ENTERTAINMENT            87
AUTO_AND_VEHICLES        73
WEATHER                  72
LIBRARIES_AND_DEMO       64
HOUSE_AND_HOME           61
ART_AND_DESIGN           59
COMICS                   54
PARENTING                50
EVENTS                   45
BEAUTY                   42
Name: Category, dtype: int64

In [29]:
df['Reviews'].unique()

array(['9992', '999', '9975', ..., '100', '10', '1'], dtype=object)

In [30]:
df['Reviews'].nunique()

5321

In [31]:
df['Reviews'].value_counts(dropna=False)

2         82
3         76
4         74
5         74
1         67
          ..
435        1
4354       1
4355       1
43611      1
318142     1
Name: Reviews, Length: 5321, dtype: int64

In [32]:
df['Installs'].unique()

array(['1,000,000+', '100,000+', '500,000+', '50,000+', '10,000,000+',
       '5,000,000+', '10,000+', '50,000,000+', '100,000,000+', '5,000+',
       '1,000+', '1,000,000,000+', '500+', '100+', '10+', '50+',
       '500,000,000+', '5+', '1+'], dtype=object)

In [33]:
df['Installs'].nunique()

19

In [34]:
df['Installs'].value_counts(dropna=False)

1,000,000+        1416
100,000+          1095
10,000+            986
10,000,000+        934
1,000+             697
5,000,000+         608
500,000+           504
50,000+            457
5,000+             425
100+               303
50,000,000+        203
500+               199
100,000,000+       188
10+                 69
50+                 56
500,000,000+        24
1,000,000,000+      20
5+                   9
1+                   3
Name: Installs, dtype: int64

In [35]:
df['Type'].unique()

array(['Free', 'Paid'], dtype=object)

In [36]:
df['Type'].nunique()

2

In [37]:
df['Type'].value_counts(dropna=False)

Free    7592
Paid     604
Name: Type, dtype: int64

In [38]:
df['Content Rating'].unique()

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
       'Adults only 18+', 'Unrated'], dtype=object)

In [39]:
df['Content Rating'].nunique()

6

In [40]:
df['Content Rating'].value_counts(dropna=False)

Everyone           6618
Teen                912
Mature 17+          357
Everyone 10+        305
Adults only 18+       3
Unrated               1
Name: Content Rating, dtype: int64

In [41]:
df['Genres'].unique()

array(['Sports', 'Tools', 'Shopping', 'Travel & Local', 'Entertainment',
       'Video Players & Editors', 'Communication', 'Comics', 'Arcade',
       'Adventure', 'Medical', 'Auto & Vehicles', 'Productivity',
       'Action', 'Education', 'Strategy', 'Events', 'Parenting', 'Casual',
       'Photography', 'Lifestyle', 'Health & Fitness;Education',
       'Entertainment;Brain Games', 'Business', 'Simulation', 'Weather',
       'Books & Reference', 'Personalization', 'Health & Fitness',
       'News & Magazines', 'Education;Creativity', 'Dating',
       'Art & Design;Pretend Play', 'Board', 'Board;Brain Games',
       'Puzzle', 'Educational;Action & Adventure', 'Finance',
       'Art & Design;Creativity', 'Role Playing', 'Educational;Education',
       'Beauty', 'Action;Action & Adventure', 'Social',
       'Maps & Navigation', 'Educational;Creativity', 'Racing',
       'Educational', 'Trivia', 'Libraries & Demo',
       'Card;Action & Adventure', 'Puzzle;Brain Games', 'Food & Drink',
  

In [42]:
df['Genres'].nunique()

115

In [43]:
df['Genres'].value_counts(dropna=False)

Tools                         718
Entertainment                 471
Education                     429
Finance                       302
Productivity                  301
                             ... 
Entertainment;Education         1
Puzzle;Education                1
Art & Design;Pretend Play       1
Health & Fitness;Education      1
Strategy;Creativity             1
Name: Genres, Length: 115, dtype: int64

## Cleaning numbers

In [44]:
import re

# Define a function to clean the installs data
def clean_installs(installs):
    # Use a regular expression to remove the '+' symbol and any other non-numeric character
    return re.sub(r'[^\d]', '', installs)

# Apply the clean_installs function to the 'Installs' column
df['Installs'] = df['Installs'].apply(clean_installs)

# Convert the 'Installs' column to numeric type
df['Installs'] = pd.to_numeric(df['Installs'])

# Show the results
print(df['Installs'].unique())

[   1000000     100000     500000      50000   10000000    5000000
      10000   50000000  100000000       5000       1000 1000000000
        500        100         10         50  500000000          5
          1]


In [45]:
df['Installs'].value_counts(dropna=False)

1000000       1416
100000        1095
10000          986
10000000       934
1000           697
5000000        608
500000         504
50000          457
5000           425
100            303
50000000       203
500            199
100000000      188
10              69
50              56
500000000       24
1000000000      20
5                9
1                3
Name: Installs, dtype: int64

In [46]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2989,GollerCepte Live Score,SPORTS,4.2,9992,31M,1000000,Free,0,Everyone,Sports,"May 23, 2018",6.5,4.1 and up
4970,Ad Block REMOVER - NEED ROOT,TOOLS,3.3,999,91k,100000,Free,0,Everyone,Tools,"December 17, 2013",3.2,2.2 and up
2723,SnipSnap Coupon App,SHOPPING,4.2,9975,18M,1000000,Free,0,Everyone,Shopping,"January 22, 2018",1.4,4.3 and up
3079,US Open Tennis Championships 2018,SPORTS,4.0,9971,33M,1000000,Free,0,Everyone,Sports,"June 5, 2018",7.1,5.0 and up
3229,DreamTrips,TRAVEL_AND_LOCAL,4.7,9971,22M,500000,Free,0,Teen,Travel & Local,"August 6, 2018",1.28.1,5.0 and up


In [47]:
df.to_csv('googleplaystore_clean.csv', index=False)

## Simple analysis

In [48]:
# Sort the DataFrame by Rating, Reviews, and Installs in descending order
df_sorted = df.sort_values(by=['Rating', 'Reviews', 'Installs'], ascending=False)

# Show the top rows of the sorted DataFrame, including the App name along with the columns of interest
top_apps = df_sorted[['App', 'Rating', 'Reviews', 'Installs']].head(10) 

print(top_apps)

                                           App  Rating Reviews  Installs
9496                                Master E.K     5.0      90      1000
9008                                  DW Timer     5.0       9       100
9188                                EB Scanner     5.0       9        50
5245                                AJ RETAILS     5.0       9        10
8327       The Divine Feminine App: the DF App     5.0       8      1000
7170  Yazdani Cd Center EllahAbad Official App     5.0       8       500
9119                                   chat dz     5.0       8       100
6861                       BV Sridhara Maharaj     5.0       8       100
9293                                    EF App     5.0       8       100
6391                     BK Arogyam Task Track     5.0       8       100


In [49]:
top_five_categories = df['Category'].value_counts().head(5)
print("Top five app categories:")
print(top_five_categories)

Top five app categories:
FAMILY          1654
GAME             895
TOOLS            719
FINANCE          302
PRODUCTIVITY     301
Name: Category, dtype: int64
