# Data Cleaning 

## Imports

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from string import *
from datetime import datetime, date
import nbimporter
from Helper_Functions import *

## Loading the Data Set

In [22]:
play_store_data = pd.read_csv('../../Google-Playstore-Full.csv',
                              low_memory=False)

In [23]:
play_store_data.head()

Unnamed: 0,App Name,Category,Rating,Reviews,Installs,Size,Price,Content Rating,Last Updated,Minimum Version,Latest Version,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,DoorDash - Food Delivery,FOOD_AND_DRINK,4.548561573,305034,"5,000,000+",Varies with device,0,Everyone,"March 29, 2019",Varies with device,Varies with device,,,,
1,TripAdvisor Hotels Flights Restaurants Attract...,TRAVEL_AND_LOCAL,4.400671482,1207922,"100,000,000+",Varies with device,0,Everyone,"March 29, 2019",Varies with device,Varies with device,,,,
2,Peapod,SHOPPING,3.656329393,1967,"100,000+",1.4M,0,Everyone,"September 20, 2018",5.0 and up,2.2.0,,,,
3,foodpanda - Local Food Delivery,FOOD_AND_DRINK,4.107232571,389154,"10,000,000+",16M,0,Everyone,"March 22, 2019",4.2 and up,4.18.2,,,,
4,My CookBook Pro (Ad Free),FOOD_AND_DRINK,4.647752285,2291,"10,000+",Varies with device,$5.99,Everyone,"April 1, 2019",Varies with device,Varies with device,,,,


## First Look

In [24]:
# Counting Rows and Columns of the DF
print('The shape of the data (samples, features) is: ', play_store_data.shape)
play_store_data.info()

The shape of the data (samples, features) is:  (267052, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267052 entries, 0 to 267051
Data columns (total 15 columns):
App Name           267051 non-null object
Category           267051 non-null object
Rating             267052 non-null object
Reviews            267051 non-null object
Installs           267052 non-null object
Size               267052 non-null object
Price              267052 non-null object
Content Rating     267052 non-null object
Last Updated       267052 non-null object
Minimum Version    267051 non-null object
Latest Version     267049 non-null object
Unnamed: 11        18 non-null object
Unnamed: 12        3 non-null object
Unnamed: 13        2 non-null object
Unnamed: 14        1 non-null float64
dtypes: float64(1), object(14)
memory usage: 30.6+ MB


In [25]:
#Renaming all columns so that if they have a space, it's replaced by _
play_store_data.columns = play_store_data.columns.str.replace(" ", "_")

## Column Clean Up

### Removing Unneeded Columns

In [26]:
# Dropping Unnamed Columns
play_store_data.drop(play_store_data.columns[
    play_store_data.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)

# Dropping Minimum Version (Not neccesary for Hypothesis Testing)
play_store_data.drop('Minimum_Version', axis=1, inplace=True)

### Category Column

In [27]:
# Grouping Game: Categories together under 'GAME'
for i in play_store_data.Category.unique():
    if 'GAME' in str(i):
        play_store_data.Category.replace(i, 'GAME',inplace = True)

### Rating Column

In [28]:
# If there are any non numbered strings in the ratings column, its rows are dropped
remove_non_num_str_rows(play_store_data, play_store_data.Rating)

# Converting the rating column to float
play_store_data['Rating'] = play_store_data['Rating'].astype(float).round(2)

### Review Column

In [29]:
# Ensuring that all the values of the reviews are string representations of integers
play_store_data.Reviews.str.isnumeric().sum()

# Converting string to integer
play_store_data.Reviews = play_store_data.Reviews.astype(int)

### Installs Column

In [30]:
# Strip commas and + signs to be able to convert strings to integers

play_store_data.Installs = play_store_data.Installs.apply(
    lambda x: x.replace(',', ''))
play_store_data.Installs = play_store_data.Installs.apply(
    lambda x: x.strip('+'))

# Convert non integer strings to NaN
play_store_data.Installs = play_store_data.Installs.replace(' Xmax X', np.nan)

### Size Column

In [31]:
# Converts M(megabyte) and k(kilobyte) to numerical representations
play_store_data.Size = play_store_data.Size.str.replace('k', 'e+3')
play_store_data.Size = play_store_data.Size.str.replace('M', 'e+6')

# Remove any commas in the numerical strings
# Replaces the one row that has 'e+6USIC_AND_AUDIO' with NaN value.
play_store_data.Size = play_store_data.Size.str.replace(',', '')
play_store_data.Size = play_store_data.Size.replace(
    'e+6USIC_AND_AUDIO', np.nan)

# Creating list of bools to see what can be converted to a float
# float_bool = play_store_data.Size.apply(lambda x: convertible_to_float(x))

# Replacing 'Varies with Device' with NaN value.
play_store_data.Size = play_store_data.Size.replace(
    'Varies with device', np.nan)

# Converting the Size Column to numeric type (convert e+6 to numeric representation)
play_store_data.Size = pd.to_numeric(play_store_data.Size)

### Price Column

In [32]:
#Remove the Dollar($) sign
play_store_data.Price = play_store_data.Price.apply(lambda x: x.strip('$'))

# Conversion to a numerical value
play_store_data.Price = pd.to_numeric(play_store_data.Price)

### Content Rating

In [33]:
#Have to replace the one column abnormality (3702) with NaN
play_store_data.Content_Rating = play_store_data.Content_Rating.replace('3702', np.nan)

### Last Updated

In [34]:
# Replacing non-date values with NaN
replace_non_date_str_rows(play_store_data, play_store_data.Last_Updated)

# Replacing 500,000+ with NaN
play_store_data.Last_Updated = play_store_data.Last_Updated.replace('500,000+', np.nan)

# Conversion to DateTime
play_store_data.Last_Updated = pd.to_datetime(play_store_data.Last_Updated)

### Latest Version

In [35]:
#Finds all numerical strings, however, if they are 'Varies with Device' sets them equal to the string
play_store_data.Latest_Version= play_store_data.Latest_Version.apply(
    lambda x: 'Varies with device' if x=='Varies with device'  
    else  re.findall('^[0-9]\.[0-9]|[\d]|W*',str(x))[0] )

# Replaces 'Varies with Device' with NaN values
play_store_data.Latest_Version = play_store_data.Latest_Version.replace('Varies with device', np.nan)

# Removes non-sized strs to NaN
replace_non_size_str_rows(play_store_data.Latest_Version)

# Replaces empty strings '' with NaN values
play_store_data.Latest_Version = play_store_data.Latest_Version.replace('', np.nan)

TypeError: 'float' object is not iterable

## Creation of New Columns

### Cost

In [None]:
#I'm concerned that the free apps(0.00) is going to far outtake the paid data and will skew the data to the left
#thus I am going to make a new column that depicts Cost, in Paid and Free

play_store_data['Cost'] = play_store_data.Price.apply(lambda x: 'Paid' if x > 0.00 else 'Free')

#Move Columns so that Cost is next to 'Price'
play_store_data = play_store_data[['App_Name', 'Category', 'Rating', 'Reviews', 'Installs', 'Size', 'Price',
                                   'Cost', 'Content_Rating', 'Last_Updated', 'Latest_Version']]

### Days Since Last Update

In [None]:
# Initializing the date the data was scraped
day_scrape = date(2019, 4, 4)

# Initialization of the column
# Difference in date scraped from date last updated
play_store_data['Days_Since_Update'] = play_store_data.Last_Updated.apply(
    lambda x: day_scrape - datetime.date(x))

# Convert to type string
play_store_data.Days_Since_Update = play_store_data.Days_Since_Update.astype(
    str)

# Remove commas and spaces
play_store_data.Days_Since_Update = play_store_data.Days_Since_Update.apply(
    lambda x: x.split(',')[0]).apply(lambda x: x.split('d')[0])
play_store_data.Days_Since_Update = play_store_data.Days_Since_Update.str.replace(
    ' ', '')

# Replacing and NaN's with 0 days
for i in play_store_data.Days_Since_Update:
    for j in i:
        if j == ':':
            play_store_data.Days_Since_Update = play_store_data.Days_Since_Update.replace(
                i, '0')

# Updating Days Since Update column to integers
play_store_data.Days_Since_Update = play_store_data.Days_Since_Update.astype(
    int)

## Final Edits

### Remove Columns with NaN values

In [None]:
play_store_data.dropna(how='any', inplace = True)

### Save Cleaned DataSet

In [None]:
play_store_data.to_csv('../Data/Cleaned_Play_Store_Data.csv')