In [73]:
import pandas as pd
import re
import numpy as np

In [74]:
df = pd.read_csv('../data/googleplaystore.csv')

## Data Exploration


In [76]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [77]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [78]:
df.nunique()

App               9660
Category            34
Rating              40
Reviews           6002
Size               462
Installs            22
Type                 3
Price               93
Content Rating       6
Genres             120
Last Updated      1378
Current Ver       2832
Android Ver         33
dtype: int64

## Data Cleaning and transforming

#### Remove duplicates

In [79]:
df[df.duplicated(keep='first')]
df.drop_duplicates(subset='App', inplace=True)

#### Reviews

* while preprocessing the Review column, we found  that there text values in the column

In [90]:
df['Reviews'] = df['Reviews'].apply(lambda x: int(x))

* by viewing this we can say that the row has shifted value and cannot be used, so we will drop it

In [81]:

df.reset_index(inplace=True)
df[df['Reviews'] == '3.0M']

Unnamed: 0,index,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
9300,10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [89]:
df = df.drop(9300, axis=0)

#### Installs

* Remove "+" & "," and convert to int type

In [91]:
df['Installs'] = df['Installs'].apply(lambda x: int(x.replace('+', '').replace(',', '')))

### Size

* Remove "M" & "K" and convert KB size to MB size

In [92]:
df['Size'] = df['Size'].apply(lambda x: str(x).replace('M', '').replace(',', ''))

In [93]:
df['Size'] = df['Size'].apply(lambda x: float(str(x).replace('k', '')) / 1000 if 'k' in str(x) else x)

* while preprocessing the Size column, we found that there are some values with 'Varies with device' which is not a valid value, so will convert them into NAN 

In [96]:
df['Size'] = df['Size'].apply(lambda x: float(x))

* convert 'Varies with device' to NaN

In [95]:
df['Size'] = df['Size'].apply(lambda x: str(x).replace('Varies with device', 'NaN') if 'Varies with device' in str(x) else x)

In [None]:
df['Size'].describe()

count                   9659
unique                   461
top       Varies with device
freq                    1227
Name: Size, dtype: object

In [None]:
df['Size'].isnull().sum()

0

In [None]:
len(df[df['Size'].str.contains('Varies with device', case=False)])


1227

### Price

* Remove "$" sign and convert to float

In [None]:
df['Price'] = df['Price'].apply(lambda x: str(x).replace('$', '') if '$' in str(x) else str(x))
df['Price'] = df['Price'].apply(lambda x: float(x))

In [None]:
# df['Price'].isnull().sum()
df['Price'].describe()

count    9659.000000
mean        1.099299
std        16.852152
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       400.000000
Name: Price, dtype: float64