In [222]:
import pandas as pd
import re
import numpy as np

In [223]:
df = pd.read_csv('../data/googleplaystore.csv')

## Data Exploration


In [224]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [225]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [226]:
df.nunique()

App               9660
Category            34
Rating              40
Reviews           6002
Size               462
Installs            22
Type                 3
Price               93
Content Rating       6
Genres             120
Last Updated      1378
Current Ver       2832
Android Ver         33
dtype: int64

## Data Cleaning and transforming

#### Remove duplicates

In [227]:
df[df.duplicated(keep='first')]
df = df.sort_values(by=['App', 'Reviews'], ascending=[True, False])
df = df.drop_duplicates(subset=['App'], keep='first')

#### Reviews

* while preprocessing the Review column, we found  that there text values in the column

* by viewing this we can say that the row has shifted value and cannot be used, so we will drop it

In [228]:

df.reset_index(inplace=True)
df[df['Reviews'] == '3.0M']

Unnamed: 0,index,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
5806,10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [229]:
df = df.drop(5806, axis=0)

In [230]:
df['Reviews'] = df['Reviews'].apply(lambda x: int(x))

#### Installs

* Remove "+" & "," and convert to int type

In [231]:
df['Installs'] = df['Installs'].apply(lambda x: int(x.replace('+', '').replace(',', '')))

### Size

* Remove "M" & "K" and convert KB size to MB size

In [232]:
df['Size'] = df['Size'].apply(lambda x: str(x).replace('M', '').replace(',', ''))

In [233]:
df['Size'] = df['Size'].apply(lambda x: float(str(x).replace('k', '')) / 1000 if 'k' in str(x) else x)

* while preprocessing the Size column, we found that there are some values with 'Varies with device' which is not a valid value, so will convert them into NAN 

In [234]:
df['Size'] = df['Size'].apply(lambda x: str(x).replace('Varies with device', '0') if 'Varies with device' in str(x) else x)

In [235]:
df['Size'] = df['Size'].apply(lambda x: float(x) * 1_000_000)

* convert 'Varies with device' to NaN

In [236]:
df['Size'].describe()

count    9.659000e+03
mean     1.780249e+07
std      2.149347e+07
min      0.000000e+00
25%      2.900000e+06
50%      9.100000e+06
75%      2.500000e+07
max      1.000000e+08
Name: Size, dtype: float64

In [237]:
df['Size'].isnull().sum()

0

### Price

* Remove "$" sign and convert to float

In [238]:
df['Price'] = df['Price'].apply(lambda x: str(x).replace('$', '') if '$' in str(x) else str(x))
df['Price'] = df['Price'].apply(lambda x: float(x))

In [239]:
# df['Price'].isnull().sum()
df['Price'].describe()

count    9659.000000
mean        1.099299
std        16.852152
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       400.000000
Name: Price, dtype: float64

### Dates

In [240]:
df['Last Updated'] = pd.to_datetime(df['Last Updated'])
df["year"] = df["Last Updated"].dt.year
df["month"] = df["Last Updated"].dt.month

In [241]:
df['year'].min(), df['year'].max()  

(2010, 2018)

## Transform

### Number of apps and average rating by Category 


In [242]:
groups = df.groupby('Category').filter(lambda x: len(x) > 300).reset_index()

# calculate the average rating
average_rating = np.nanmean(groups['Rating'])

# create a list of categories
categories = list(set(groups['Category']))

#prepare data for each category
category_data = []
for category in categories:
    category_dict = {
        'Category': category,
        'Average_Rating': np.nanmean(groups.loc[groups['Category'] == category]['Rating']),
        'Number_of_Apps': len(groups.loc[groups['Category'] == category]),
    }
    category_data.append(category_dict)


df_category_summary = pd.DataFrame(category_data)

df_category_summary


Unnamed: 0,Category,Average_Rating,Number_of_Apps
0,BUSINESS,4.098479,420
1,PRODUCTIVITY,4.183389,374
2,FAMILY,4.183525,1876
3,COMMUNICATION,4.121484,315
4,GAME,4.243527,943
5,PERSONALIZATION,4.332215,376
6,LIFESTYLE,4.093355,369
7,MEDICAL,4.166552,395
8,TOOLS,4.039917,828
9,FINANCE,4.115563,345


## Creating dummy variables

In [243]:
categories = pd.get_dummies(df['Category'], prefix='Category', drop_first=True)
types = pd.get_dummies(df['Type'], prefix='Type', drop_first=True)
content_rating = pd.get_dummies(df['Content Rating'], prefix='Content_Rating', drop_first=True)
genres = pd.get_dummies(df['Genres'], prefix='Genre', drop_first=True)
frames = [df, categories, types, content_rating, genres]
df = pd.concat(frames, axis=1)
df = df.drop(['Category', 'Type', 'Content Rating', 'Genres', 'Installs', 'Current Ver', 'Android Ver'], axis=1)

In [244]:
df.sample(5)

Unnamed: 0,index,App,Rating,Reviews,Size,Price,Last Updated,year,month,Category_AUTO_AND_VEHICLES,...,Genre_Tools;Education,Genre_Travel & Local,Genre_Travel & Local;Action & Adventure,Genre_Trivia,Genre_Trivia;Education,Genre_Video Players & Editors,Genre_Video Players & Editors;Creativity,Genre_Video Players & Editors;Music & Video,Genre_Weather,Genre_Word
1139,6438,BL Community Icon Pack,4.0,289,2700000.0,0.0,2013-05-06,2013,5,False,...,False,False,False,False,False,False,False,False,False,False
3625,9405,EI Mobile,3.8,4231,82000000.0,0.0,2018-07-19,2018,7,False,...,False,False,False,False,False,False,False,False,False,False
2724,5539,Crayola Color Blaster,3.8,17,30000000.0,0.0,2018-03-13,2018,3,False,...,False,False,False,False,False,False,False,False,False,False
2326,388,Calls & Text by Mo+,4.2,83239,14000000.0,0.0,2018-04-19,2018,4,False,...,False,False,False,False,False,False,False,False,False,False
4157,10322,FE Civil Engineering Exam Prep,2.8,9,21000000.0,0.0,2018-07-27,2018,7,False,...,False,False,False,False,False,False,False,False,False,False
