# Working with data

In this part we delve into the details of working with data: reading CSV files, grouping records into categories, and checking for missing values.

### Import Numpy and Pandas

In [1]:
import pandas as pd
import numpy as np

### Reading From CSV File

In [2]:
# Load the sample file into a DataFrame. header='infer' is the pandas default
# (the first line is used as column names); pass header=None when the file
# has no header row.
df = pd.read_csv('sample.csv', header='infer')
df

Unnamed: 0,id,name,age,occupation
0,101,John,32,Software Engineer
1,104,Jennifer,35,Medical Doctor
2,107,Micheal,22,Physician Assistant
3,210,Ahmad,45,Professor of Physics


### Reading CSV From a URL

In [4]:
# Dataset page: https://www.kaggle.com/lava18/google-play-store-apps
# The CSV itself is read over HTTP from the raw GitHub URL below.
url = (
    'https://raw.githubusercontent.com/bulutmf/data-science-toolbox/'
    'master/data-wrangling/datasets/googleplaystore.csv'
)
df = pd.read_csv(url)

# Preview only the first rows rather than the whole frame
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [12]:
# Row count: the length of the frame's index
row_count = len(df.index)
print("Number of rows: {}".format(row_count))

# Column labels as a plain Python list
column_names = list(df.columns)
print("Column names: {}".format(column_names))

Number of rows: 10841
Column names: ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


### Categorizing Data

In [35]:
# Let's see the different categories of apps.
# Group by the scalar key 'Category' rather than the one-element list
# ['Category']: with a list key, pandas >= 2.0 yields 1-tuples such as
# ('MAPS_AND_NAVIGATION',) when iterating, so the string comparison in the
# loop below would never match, and get_group() with a plain string is
# deprecated for list-keyed groupbys.
groups = df.groupby('Category')
print("Unique categories:\n{}\n\n".format(groups.groups.keys()))
# NOTE(review): the printed keys include '1.9', which does not look like a
# category name -- possibly a malformed row in the CSV; verify before relying
# on per-category figures.

# We can iterate through the groups; each item is a (name, sub-DataFrame) pair
for name, group in groups:
    if name == 'MAPS_AND_NAVIGATION':
        print(name)
        break

# Let's now see a specific category
groups.get_group('MAPS_AND_NAVIGATION').head(5)

Unique categories:
dict_keys(['LIFESTYLE', 'MAPS_AND_NAVIGATION', 'EVENTS', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME', 'FAMILY', 'GAME', 'COMMUNICATION', 'BEAUTY', '1.9', 'SHOPPING', 'BOOKS_AND_REFERENCE', 'SOCIAL', 'TOOLS', 'FOOD_AND_DRINK', 'BUSINESS', 'PHOTOGRAPHY', 'DATING', 'SPORTS', 'VIDEO_PLAYERS', 'LIBRARIES_AND_DEMO', 'ENTERTAINMENT', 'MEDICAL', 'TRAVEL_AND_LOCAL', 'FINANCE', 'PARENTING', 'COMICS', 'AUTO_AND_VEHICLES', 'WEATHER', 'ART_AND_DESIGN', 'EDUCATION', 'PERSONALIZATION', 'PRODUCTIVITY', 'NEWS_AND_MAGAZINES'])


MAPS_AND_NAVIGATION


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
3820,"Waze - GPS, Maps, Traffic Alerts & Live Naviga...",MAPS_AND_NAVIGATION,4.6,7232629,Varies with device,"100,000,000+",Free,0,Everyone,Maps & Navigation,"July 29, 2018",Varies with device,Varies with device
3821,"T map (te map, T map, navigation)",MAPS_AND_NAVIGATION,4.2,15681,Varies with device,"5,000,000+",Free,0,Everyone,Maps & Navigation,"August 2, 2018",Varies with device,Varies with device
3822,"MapQuest: Directions, Maps, GPS & Navigation",MAPS_AND_NAVIGATION,4.1,53481,Varies with device,"10,000,000+",Free,0,Everyone,Maps & Navigation,"May 22, 2018",Varies with device,Varies with device
3823,"Yahoo! transit guide free timetable, operation...",MAPS_AND_NAVIGATION,4.4,104800,22M,"10,000,000+",Free,0,Everyone,Maps & Navigation,"July 26, 2018",7.0.1,4.0 and up
3824,乗換NAVITIME　Timetable & Route Search in Japan T...,MAPS_AND_NAVIGATION,4.4,50459,Varies with device,"5,000,000+",Free,0,Everyone,Maps & Navigation,"July 26, 2018",Varies with device,Varies with device


In [45]:
# Number of apps in each category (one entry per group)
category_sizes = groups.size()
category_sizes

Category
1.9                       1
ART_AND_DESIGN           65
AUTO_AND_VEHICLES        85
BEAUTY                   53
BOOKS_AND_REFERENCE     231
BUSINESS                460
COMICS                   60
COMMUNICATION           387
DATING                  234
EDUCATION               156
ENTERTAINMENT           149
EVENTS                   64
FAMILY                 1972
FINANCE                 366
FOOD_AND_DRINK          127
GAME                   1144
HEALTH_AND_FITNESS      341
HOUSE_AND_HOME           88
LIBRARIES_AND_DEMO       85
LIFESTYLE               382
MAPS_AND_NAVIGATION     137
MEDICAL                 463
NEWS_AND_MAGAZINES      283
PARENTING                60
PERSONALIZATION         392
PHOTOGRAPHY             335
PRODUCTIVITY            424
SHOPPING                260
SOCIAL                  295
SPORTS                  384
TOOLS                   843
TRAVEL_AND_LOCAL        258
VIDEO_PLAYERS           175
WEATHER                  82
dtype: int64

In [46]:
# Per-category statistics on another column: here the mean and the count of "Rating"
rating_aggregations = {'Rating': ['mean', 'count']}
groups.agg(rating_aggregations)

Unnamed: 0_level_0,Rating,Rating
Unnamed: 0_level_1,mean,count
Category,Unnamed: 1_level_2,Unnamed: 2_level_2
1.9,19.0,1
ART_AND_DESIGN,4.358065,62
AUTO_AND_VEHICLES,4.190411,73
BEAUTY,4.278571,42
BOOKS_AND_REFERENCE,4.346067,178
BUSINESS,4.121452,303
COMICS,4.155172,58
COMMUNICATION,4.158537,328
DATING,3.970769,195
EDUCATION,4.389032,155


### Dealing with Missing Values

In [8]:
# Let's first check whether there are any missing values:
# isna() flags each missing cell and sum() adds up the flags per column
missing_per_column = df.isna().sum()
missing_per_column

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [None]:
# Looks like we have missing values in "Rating", "Type", "Content Rating", "Current Ver" and "Android Ver"
