## THIS SECTION DEALS WITH THE CLEANING OF THE DATA USING PANDAS AND NUMPY libraries

In [1]:
import numpy as np
import pandas as pd

In [128]:
# Location of the raw Google Play Store export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere if the file is placed at the same location.
raw_csv_path = 'C:/Users/BOLAJI/Documents/googleplaystore.csv'
gstore = pd.read_csv(raw_csv_path)

# Show the first rows to confirm the file was parsed into the expected columns.
gstore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## INSPECTING THE HEADER/COLUMNS
There are thirteen columns, which are:
1. Application name
2. Categories of Application
3. Application Rating
4. Number of reviews
5. Size of application
6. Number of installs
7. Types of application(free or paid)
8. Price of application
9. Content rating
10. Genres, the same as the category
11. Last updated date
12. Current version
13. Android version

In [23]:
# Display the column labels of the loaded frame.
gstore.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

## Removing unneeded columns
The columns that are not needed are:
1. Current Version
2. Android version
3. Genres

In [129]:
# Remove the columns that are not used in the rest of the cleaning.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed labels.
gstore = gstore.drop(columns=['Current Ver', 'Android Ver', 'Genres'], errors='ignore')

## The next line of code converts the 'Last Updated' column to the datetime type

In [130]:
# Parse the textual dates; errors='coerce' turns any value that cannot be
# parsed into NaT instead of raising.
last_updated_text = gstore['Last Updated']
gstore['Last Updated'] = pd.to_datetime(last_updated_text, errors='coerce')

In [162]:
# Preview the first five rows of the frame.
gstore.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000000.0,10000.0,Free,0,Everyone,2018-01-07
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000000.0,500000.0,Free,0,Everyone,2018-01-15
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700000.0,5000000.0,Free,0,Everyone,2018-08-01
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000000.0,50000000.0,Free,0,Teen,2018-06-08
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800000.0,100000.0,Free,0,Everyone,2018-06-20


## Inspecting the types of the elements in each column

This line of code is run several times, once after cleaning each column. The uncleaned columns came in with the `object` dtype, which is the dtype pandas uses for Python strings. After cleaning the data, the columns should have the following types:
* app as object
* category as object
* rating as float
* reviews as int
* size as float
* installs as float
* type as categorical data
* price as float
* content rating as category
* last updated as datetime

In [181]:
# Print the per-column non-null counts and dtypes to check the cleaning progress.
gstore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 10 columns):
App               10841 non-null object
Category          10841 non-null object
Rating            9367 non-null float64
Reviews           10841 non-null int64
Size              9145 non-null float64
Installs          10840 non-null float64
Type              10840 non-null category
Price             10840 non-null float64
Content Rating    10840 non-null category
Last Updated      10840 non-null datetime64[ns]
dtypes: category(2), datetime64[ns](1), float64(4), int64(1), object(2)
memory usage: 699.2+ KB


## CLEANING THE REVIEWS COLUMN
- The first line of code checks whether each value in 'Reviews' is numeric or not
- The next line replaces the non-numeric suffix ('.0M') of the offending value with zeros
- The next line converts the reviews to a numeric type

In [132]:
# Show the rows whose 'Reviews' text is not made up solely of digits.
# The explicit comparison with False is kept so that a missing value
# (for which isnumeric() yields NaN) is not selected.
reviews_is_numeric = gstore['Reviews'].str.isnumeric()
gstore[reviews_is_numeric == False]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,NaT


In [133]:
# The only non-numeric entry found above is '3.0M' (row 10472); expand its
# suffix into zeros so the column can be parsed as a number.
# regex=False makes the match literal: read as a regular expression, the '.'
# in '.0M' is a wildcard and would also rewrite values such as '10M'.
# NOTE(review): row 10472 looks column-shifted in the preview (its Category
# shows 1.9) - confirm whether the row should be repaired or dropped instead.
gstore['Reviews'] = gstore['Reviews'].str.replace('.0M', '000000', regex=False)

In [134]:
# Convert the review counts from text to numbers; to_numeric raises if any
# entry still cannot be parsed.
reviews_text = gstore['Reviews']
gstore['Reviews'] = pd.to_numeric(reviews_text)

## Cleaning the size column
- The first line replaces the string 'Varies with device' with an empty string
- The next cell defines a function that strips the M, K or k suffix, converts the remainder to a float and multiplies it by the appropriate factor (M = 1,000,000; K and k = 1,000); anything else becomes NumPy's not-a-number (NaN)
- The next line applies the function to the size column

In [135]:
# 'Varies with device' carries no usable size, so blank it out; the empty
# string has no unit suffix and is therefore mapped to NaN by cover().
size_text = gstore['Size']
gstore['Size'] = size_text.str.replace('Varies with device', '')

In [136]:
def cover(x):
    """Convert a size label such as '19M' or '201k' into a plain float.

    Parameters
    ----------
    x : str or number
        Size label. An 'M' scales the number by 1,000,000 and a 'K' or 'k'
        scales it by 1,000.

    Returns
    -------
    float
        The scaled value for a label with a recognised suffix. A value that
        is already numeric is returned as a float unchanged, so applying the
        function to an already-converted column does not fail. Text without
        a recognised suffix (including the empty string) and any other
        non-string input give ``np.nan``.

    Raises
    ------
    ValueError
        If the text contains a suffix but what remains after removing it is
        not a valid number.
    """
    if not isinstance(x, str):
        # Substring tests such as `'M' in x` raise TypeError on floats/NaN,
        # so non-text input is handled before them.
        if isinstance(x, (int, float, np.number)):
            return float(x)
        return np.nan
    if 'M' in x:
        return float(x.replace('M', '')) * 1000000
    if 'K' in x or 'k' in x:
        # Upper- and lower-case kilo suffixes share one multiplier.
        return float(x.replace('K', '').replace('k', '')) * 1000
    return np.nan

In [None]:
# Run every size label through cover() to obtain a numeric column.
size_labels = gstore['Size']
gstore['Size'] = size_labels.apply(cover)

## Cleaning installs
* The first line replaces 'Free' with NumPy's not-a-number (NaN)
* The next line uses the pandas string methods to replace '+' and ',' with an empty string
* The next line converts installs to a float type

In [159]:
# A literal 'Free' in this column is not an install count; mark it as missing.
installs_raw = gstore['Installs']
gstore['Installs'] = installs_raw.replace('Free', np.nan)

In [161]:
# Strip the trailing '+' and the thousands separators, then parse the counts.
# regex=False is required for a literal match: '+' is a regex quantifier, and
# relying on the legacy single-character special case of str.replace is
# version-dependent.
installs_text = (
    gstore['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)
gstore['Installs'] = pd.to_numeric(installs_text)

In [163]:
# Store the install counts as floats so the missing value can be kept as NaN.
# The builtin float is used because the np.float alias was deprecated in
# NumPy 1.20 and removed in 1.24; both denote the same 64-bit float dtype.
gstore['Installs'] = gstore['Installs'].astype(float)

## Cleaning type and content rating columns
* This line converts the above columns to category so as to save space

In [170]:
# Both columns hold a small set of repeated labels, so store them as
# pandas categoricals.
for label_column in ('Type', 'Content Rating'):
    gstore[label_column] = gstore[label_column].astype('category')

## Cleaning the price column
* The first line inspects the last column left to clean for elements that need cleaning
* The next line replaces 'Everyone' with NumPy's not-a-number (NaN)
* The next line uses the Series str method to remove the '$' sign from the elements of the column and also converts it to a float type

In [180]:
# List the distinct price values to spot anything that still needs cleaning.
price_column = gstore['Price']
price_column.unique()

array([  0.  ,   4.99,   3.99,   6.99,   1.49,   2.99,   7.99,   5.99,
         3.49,   1.99,   9.99,   7.49,   0.99,   9.  ,   5.49,  10.  ,
        24.99,  11.99,  79.99,  16.99,  14.99,   1.  ,  29.99,  12.99,
         2.49,  10.99,   1.5 ,  19.99,  15.99,  33.99,  74.99,  39.99,
         3.95,   4.49,   1.7 ,   8.99,   2.  ,   3.88,  25.99, 399.99,
        17.99, 400.  ,   3.02,   1.76,   4.84,   4.77,   1.61,   2.5 ,
         1.59,   6.49,   1.29,   5.  ,  13.99, 299.99, 379.99,  37.99,
        18.99, 389.99,  19.9 ,   8.49,   1.75,  14.  ,   4.85,  46.99,
       109.99, 154.99,   3.08,   2.59,   4.8 ,   1.96,  19.4 ,   3.9 ,
         4.59,  15.46,   3.04,   4.29,   2.6 ,   3.28,   4.6 ,  28.99,
         2.95,   2.9 ,   1.97, 200.  ,  89.99,   2.56,  30.99,   3.61,
       394.99,   1.26,    nan,   1.2 ,   1.04])

In [177]:
# A literal 'Everyone' in this column is not a price; mark it as missing.
price_raw = gstore['Price']
gstore['Price'] = price_raw.replace('Everyone', np.nan)

In [179]:
# Drop the leading currency symbol, then parse what remains as a number.
price_without_symbol = gstore['Price'].str.lstrip('$')
gstore['Price'] = pd.to_numeric(price_without_symbol)

## Storing the clean data to csv and saving it as cleaned_gstore_play_app_data.csv

In [182]:
# Write the cleaned frame next to the notebook; index=False leaves the row
# index out of the file.
cleaned_csv_name = 'cleaned_gstore_play_app_data.csv'
gstore.to_csv(cleaned_csv_name, index=False)

## WATCH OUT FOR THE ANALYSIS OF THE CLEANED DATA