# Introduction to Pandas & NumPy

## Importing the libraries

In [1]:
# import the pandas and NumPy libraries
import pandas as pd
import numpy as np

## Introduction to NumPy

### Basic NumPy functions

In [4]:
# Create an array with 21 elements uniformly spaced between 0 and 10: first_array

# start at 0, stop at 10 (the stop value is included), 21 points in total
first_array = np.linspace(start=0, stop=10, num=21)

print(first_array)

[ 0.   0.5  1.   1.5  2.   2.5  3.   3.5  4.   4.5  5.   5.5  6.   6.5
  7.   7.5  8.   8.5  9.   9.5 10. ]


In [5]:
# Create an array of 21 integers randomly drawn between 0 and 10: rnd_array

# NOTE: `high` is exclusive in np.random.randint, so the draws fall in 0..9;
# no seed is set, so the values change on every run
rnd_array = np.random.randint(low=0, high=10, size=21)

print(rnd_array)

[7 6 1 8 2 2 8 1 8 9 3 3 8 6 7 3 8 5 2 0 2]


In [8]:
# Cast first_array to an array of integers: first_updated
# (astype(int) drops the fractional part, e.g. 0.5 becomes 0)
first_updated = first_array.astype(int)

# stack together first_updated and rnd_array: stack_array
# (both inputs are 1-D, so they are simply joined end to end)
stack_array = np.concatenate((first_updated, rnd_array))

print(stack_array)

[ 0  0  1  1  2  2  3  3  4  4  5  5  6  6  7  7  8  8  9  9 10  7  6  1
  8  2  2  8  1  8  9  3  3  8  6  7  3  8  5  2  0  2]


In [10]:
# What is the average value of stack_array?
Avg = stack_array.mean()

# What is the row-average of stack_array?
# stack_array is 1-D, so axis=0 is its only axis and the result equals Avg
row_average = np.average(stack_array, axis=0)

print(Avg, row_average, sep='\n')

4.738095238095238
4.738095238095238


## Introduction to Pandas

### Inspecting DataFrames

In [11]:
# Load the googleplaystore.csv file into a pandas DataFrame
# (the path is relative, so it is resolved against the notebook's working directory)

df = pd.read_csv('data/googleplaystore.csv')

In [12]:
# Print the columns of your DataFrame (as a plain Python list of names)

list(df.columns)


['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [14]:
# Select the first 5 rows of your DataFrame (positional slice)

df.iloc[:5]


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [15]:
# Show some basic statistical details of your DataFrame
# describe() summarises numeric columns by default; in the recorded output that is only 'Rating'.
# NOTE(review): the reported maximum Rating of 19 looks out of range — the row is worth checking.

df.describe()


Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [24]:
# Show the maximum value of each column of your DataFrame
# NOTE(review): the exercise prompt asked for "all the unique values", but df.max()
# returns column-wise maxima (see the output below); df[col].unique() or df.nunique()
# would answer the unique-values question instead.

df.max()
# other column-wise aggregates to try: df.min()
# and: df.mean()


App             🔥 Football Wallpapers 4K | Full HD Backgrounds 😍
Category                                                 WEATHER
Rating                                                        19
Reviews                                                     9992
Size                                          Varies with device
Installs                                                    Free
Price                                                   Everyone
Genres                                                      Word
Last Updated                                   September 9, 2017
dtype: object

### Updating DataFrames

In [38]:
# Create a new DataFrame called 'updated_apps' that doesn't contain the string 'Varies with device' in the 'Size' column
# - na=False treats missing sizes as "does not contain", so those rows are kept
# - .copy() makes updated_apps an independent DataFrame, so assigning to its columns
#   later on neither raises SettingWithCopyWarning nor depends on df

updated_apps = df[~df['Size'].str.contains('Varies with device', na=False)].copy()
updated_apps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [39]:
# Let's create a function to convert a single string of the 'Size' column into a number

def update_value_loop(s):
    """Convert one size string such as '19M', '201k' or '1,000+' to a float.

    The last character is always treated as a unit suffix and is removed
    before parsing: 'k' scales the number by 1,000, 'M' by 1,000,000 and any
    other trailing character (for example '+') leaves the number unscaled.

    Args:
        s: Size string made of a number followed by one suffix character.

    Returns:
        The size as a float.

    Raises:
        ValueError: If the text in front of the suffix is not a valid number.
    """
    # split the string into its numeric part and its one-character suffix
    val = s[:-1]
    char = s[-1:]

    # drop every thousands separator (not just the one in '1,000') so that
    # values such as '2,500k' can be parsed by float()
    number = float(val.replace(',', ''))

    # scale according to the suffix; unknown suffixes keep the number as is
    multipliers = {'k': 1000, 'M': 1000000}
    return number * multipliers.get(char, 1)

First, let's update all the values in the 'Size' column by iterating over the DataFrame's rows and appending the results to a list

In [40]:
%%timeit
# Baseline: convert every 'Size' string one row at a time in a plain Python loop.
app_sizes = []
 
# iterrows() yields (index, row) pairs; only the row's 'Size' entry is used here.
for index, row in updated_apps.iterrows():
    app_sizes.append(update_value_loop(row['Size']))

533 ms ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Now, let's do the same using pandas' apply() function

In [41]:
%%timeit
# axis=1 hands each row to the lambda, which converts that row's 'Size' string.
app_sizes = updated_apps.apply(lambda row: update_value_loop(row['Size']), axis=1)

55.2 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


The apply() method is roughly 10x faster (533 ms vs. 55.2 ms)!

In [42]:
# Let's create a similar function that operates on an entire Series at a time

def update_size_values_vectorized(s):
    """Convert a Series of size strings ('19M', '201k', '1,000+', ...) to floats.

    For every element the last character is treated as a unit suffix and
    removed before parsing: 'k' scales the number by 1,000, 'M' by 1,000,000
    and any other trailing character (for example '+') leaves it unscaled.
    The input Series is not modified.

    Args:
        s: Series of strings, each a number followed by one suffix character.

    Returns:
        A float Series with the same index as ``s``.

    Raises:
        ValueError: If the text in front of a suffix is not a valid number.
    """
    # numeric part of every element, with all thousands separators removed
    # (not only the exact value '1,000+') so that e.g. '2,500k' parses too
    val = s.str[:-1].str.replace(',', '', regex=False)
    # one-character unit suffix of every element
    char = s.str[-1:]

    # convert to value
    number = val.astype(float)

    # scale number based on suffix
    is_kilo = char == 'k'
    is_mega = char == 'M'
    number[is_kilo] = number[is_kilo] * 1_000
    number[is_mega] = number[is_mega] * 1_000_000

    return number

It is possible to go even faster by using a vectorized function

In [43]:
%%timeit
# A single call converts the whole 'Size' column at once.
app_sizes = update_size_values_vectorized(updated_apps['Size'])

10.6 ms ± 274 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


That is roughly a 5x improvement over apply() (55.2 ms vs. 10.6 ms), and about 50x over the explicit loop!

In [49]:
# Create a similar vectorized function to update the 'Price' variable of the DataFrame
# Tips: 
# (1) replaces 'Everyone' by '$0' and '0' by '$0'
# (2) removes the $ symbol, 
# (3) converts the strings to floats. 

def update_price(s):
    """Convert a Series of price strings ('0', '$4.99', 'Everyone', ...) to floats.

    Args:
        s: Series of prices; values may carry a '$' symbol, and the stray
            value 'Everyone' is treated as a price of 0.

    Returns:
        A float Series with the same index as ``s``.

    Raises:
        ValueError: If a value is still not numeric once '$' has been removed.
    """
    # Deal with the 'Everyone' issue: map that whole value to a free price
    s = s.replace('Everyone', '$0')

    # get value without the $ symbol; regex=False because '$' is a regex anchor.
    # astype(str) keeps the function usable on a column that is already numeric.
    val = s.astype(str).str.replace('$', '', regex=False)

    # convert to float value and return the result
    number = val.astype(float)

    return number


Let's see how fast our function updates the Price variable

In [None]:
%%timeit
# A single call converts the whole 'Price' column at once.
app_prices = update_price(updated_apps['Price'])

### Memory Management

In [45]:
# Let's create a function to compare the memory usage when storing a DataFrame's column as string vs category

def mem_usage(pandas_obj):
    """Return the deep memory footprint of a DataFrame or Series as an 'x.x kB' string."""
    footprint = pandas_obj.memory_usage(deep=True)
    # a DataFrame reports one figure per column, so add them up; anything
    # else is assumed to be a Series, which reports a single number
    total_bytes = footprint.sum() if isinstance(pandas_obj, pd.DataFrame) else footprint
    total_kb = total_bytes / 1024  # bytes -> kB
    return "{:03.1f} kB".format(total_kb)

In [47]:
# let's compare the memory usage when storing the DataFrame's 'Category' column as string vs category
# (first line printed: column as loaded, second line: same column cast to 'category')
print(
    mem_usage(updated_apps['Category']),
    mem_usage(updated_apps['Category'].astype('category')),
    sep='\n',
)

979.3 kB
403.9 kB


Converting the column to categoricals reduced memory usage by ~2.4x (979.3 kB → 403.9 kB) with no information loss!

### Pickling

In [64]:
# Apply the updates to the dataset before saving it
# - assign() builds a new DataFrame instead of writing into a filtered slice of df,
#   which avoids SettingWithCopyWarning
# - 'Size' is converted with the vectorized helper rather than a row-wise apply()
updated_apps = updated_apps.assign(
    Size=update_size_values_vectorized(updated_apps['Size']),
    Price=update_price(updated_apps['Price']),
)

In [51]:
# Save the updated_apps dataset to pickle
# (the cleaned updated_apps frame is written, not the raw df)

updated_apps.to_pickle('updated_apps.pickle')


In [54]:
# Save the updated_apps dataset to pickle and compress it to bz2
# - the cleaned updated_apps frame is written, not the raw df
# - compression is passed by keyword; recent pandas only accepts `path` positionally
# NOTE: the file name is the same as for the uncompressed pickle, so that file is overwritten

updated_apps.to_pickle('updated_apps.pickle', compression='bz2')

In [63]:
# Reload the compressed pickle file
# (the name has no .bz2 extension, so the compression is stated explicitly;
#  only unpickle files that come from a trusted source)

reload_df = pd.read_pickle('updated_apps.pickle', compression='bz2')
