# Data Cleaning 

## Imports

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from string import *

from datetime import datetime, date

In [23]:
from ipynb.fs.full.Helper_Functions import *

## Loading in Data

In [3]:
# Read the full Google Play Store CSV export into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single inferred dtype.
play_store_data = pd.read_csv(
    filepath_or_buffer='../../Google-Playstore-Full.csv',
    low_memory=False,
)

## First Look

In [4]:
# Report the frame's dimensions as a (rows, columns) tuple, then print the
# per-column dtype / non-null summary.
print('The shape of the data (samples, features) is: ',
      (len(play_store_data.index), len(play_store_data.columns)))
play_store_data.info()

The shape of the data (samples, features) is:  (267052, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267052 entries, 0 to 267051
Data columns (total 15 columns):
App Name           267051 non-null object
Category           267051 non-null object
Rating             267052 non-null object
Reviews            267051 non-null object
Installs           267052 non-null object
Size               267052 non-null object
Price              267052 non-null object
Content Rating     267052 non-null object
Last Updated       267052 non-null object
Minimum Version    267051 non-null object
Latest Version     267049 non-null object
Unnamed: 11        18 non-null object
Unnamed: 12        3 non-null object
Unnamed: 13        2 non-null object
Unnamed: 14        1 non-null float64
dtypes: float64(1), object(14)
memory usage: 30.6+ MB


In [5]:
# Normalise the column labels: every space becomes an underscore so the
# columns can be reached with attribute access (e.g. play_store_data.App_Name).
play_store_data.columns = [
    label.replace(" ", "_") for label in play_store_data.columns
]

## Column Clean Up

### Removing Unneeded Columns

In [6]:
# Remove, in one in-place drop:
#   * every column whose label contains "unnamed" (case-insensitive), and
#   * Minimum_Version, which is not necessary for the hypothesis testing.
play_store_data.drop(
    columns=play_store_data.columns[
        play_store_data.columns.str.contains('unnamed', case=False)
    ].tolist() + ['Minimum_Version'],
    inplace=True,
)

### Rating Column

In [7]:
# Rows whose Rating holds a non-numeric string are meant to be dropped by this
# helper (imported from Helper_Functions).
# NOTE(review): the return value is ignored, so the helper presumably mutates
# play_store_data in place -- confirm against its definition.
remove_non_num_str_rows(play_store_data, play_store_data['Rating'])

# Cast the remaining ratings to float and keep two decimal places.
play_store_data['Rating'] = (
    play_store_data['Rating'].astype('float64').round(decimals=2)
)

### Review Column

In [8]:
# Sanity check before casting: count the Reviews entries that are not plain
# digit strings and print the number. Previously this count was computed but
# never shown (it was not the last expression of the cell), so the check had
# no effect. Values are viewed as text first so that missing values are
# counted as non-numeric too. The count is informational only; the cast below
# is what actually fails on an unconvertible value.
print('Non-numeric Reviews entries:',
      int((~play_store_data.Reviews.astype(str).str.isnumeric()).sum()))

# Converting string to integer
play_store_data.Reviews = play_store_data.Reviews.astype(int)

### Installs Column

In [10]:
# Strip thousands separators and leading/trailing '+' signs (e.g. '1,000+'
# becomes '1000') so the values can later be converted to integers.
# The vectorised .str methods are used instead of .apply(lambda ...): they
# pass missing values through untouched, whereas calling x.replace / x.strip
# on a NaN raises AttributeError. That matters because this cell itself
# inserts a NaN below, so the lambda version could not be run a second time.
play_store_data.Installs = (
    play_store_data.Installs
    .str.replace(',', '', regex=False)
    .str.strip('+')
)

# Convert the non-integer string ' Xmax X' to NaN.
# Note: the column still holds strings after this cell; no numeric conversion
# is performed here.
play_store_data.Installs = play_store_data.Installs.replace(' Xmax X', np.nan)

### Size Column

In [11]:
# Turn the textual sizes into numbers in a single pipeline:
#   1. 'k' (kilobyte) -> 'e+3' and 'M' (megabyte) -> 'e+6', giving
#      scientific-notation strings that pd.to_numeric understands;
#   2. drop any commas inside the numeric strings;
#   3. blank out the two non-size values with NaN:
#        - 'e+6USIC_AND_AUDIO': the one row whose Size held a category name,
#          already mangled by the 'M' substitution in step 1;
#        - 'Varies with device';
#   4. convert what is left to a numeric dtype.
play_store_data.Size = pd.to_numeric(
    play_store_data.Size
    .str.replace('k', 'e+3')
    .str.replace('M', 'e+6')
    .str.replace(',', '')
    .replace('e+6USIC_AND_AUDIO', np.nan)
    .replace('Varies with device', np.nan)
)

### Price Column

In [12]:
# Strip the dollar sign ($) from each price string, then convert the column
# to a numerical dtype in the same statement.
play_store_data.Price = pd.to_numeric(
    play_store_data.Price.map(lambda price_text: price_text.strip('$'))
)

### Content Rating

In [13]:
# One abnormal entry in this column holds the value '3702' instead of a
# content rating; mark it as missing.
play_store_data['Content_Rating'] = play_store_data['Content_Rating'].replace(
    {'3702': np.nan})

### Last Updated

In [24]:
# Checking for the non-date values
