In [1]:
!pip install snakeviz
!pip install joblib
!pip install dask distributed graphviz



In [78]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn
import re
from sklearn.preprocessing import MultiLabelBinarizer
%matplotlib inline

# Load data

In [81]:
!curl https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/master/google-play-store-apps/googleplaystore.csv -o googleplaystore.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1328k  100 1328k    0     0  1696k      0 --:--:-- --:--:-- --:--:-- 1694k


In [82]:
!curl https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/master/google-play-store-apps/googleplaystore_user_reviews.csv -o googleplaystore_user_reviews.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7489k  100 7489k    0     0  6858k      0  0:00:01  0:00:01 --:--:-- 6858k


In [89]:
googleplaystore = pd.read_csv('googleplaystore.csv')
googleplaystore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [90]:
googleplaystore.shape

(10841, 13)

In [91]:
googleplaystore_user_reviews = pd.read_csv('googleplaystore_user_reviews.csv')
googleplaystore_user_reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [92]:
googleplaystore_user_reviews.shape

(64295, 5)

# Task 1: Lots of string operations

Genres in `googleplaystore` are separated by the `;` and `,` characters.

We want to transform them to dummy variables

In [98]:
def get_lists_naive(df):
    """
    Split every entry of the 'Genres' column into a list of genre strings.

    Each value is split on ';' and ',' (no whitespace stripping is done),
    and the result is returned as a plain Python list of lists, one inner
    list per row of `df`.
    """
    def split_genres(genre_string):
        # One row's raw genre string -> list of its parts.
        return re.split('[;,]', genre_string)

    split_series = df['Genres'].apply(split_genres)
    return split_series.values.tolist()

In [99]:
genre_lists = get_lists_naive(googleplaystore)
genre_lists[:3]

[['Art & Design'], ['Art & Design', 'Pretend Play'], ['Art & Design']]

In [96]:
def make_dummies(genre_lists):
    """
    Build a 0/1 indicator DataFrame from a list of genre lists.

    Parameters
    ----------
    genre_lists : iterable of iterables of str
        One collection of genre labels per row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of `genre_lists` and one column per distinct label
        (columns follow `MultiLabelBinarizer.classes_`).
    """
    mlb = MultiLabelBinarizer()
    indicator_matrix = mlb.fit_transform(genre_lists)
    # Rely on the default 0..n-1 index instead of the global
    # `googleplaystore.index`: it is the same index for the full frame loaded
    # with read_csv, but it no longer ties this function to one particular
    # global DataFrame (and its exact length).
    dummies_df = pd.DataFrame(indicator_matrix, columns=mlb.classes_)
    return dummies_df
true_dummies = make_dummies(genre_lists)
true_dummies

Unnamed: 0,2018,Action,Action & Adventure,Adventure,Arcade,Art & Design,Auto & Vehicles,Beauty,Board,Books & Reference,...,Simulation,Social,Sports,Strategy,Tools,Travel & Local,Trivia,Video Players & Editors,Weather,Word
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10838,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10839,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


**Task: Parallelize the operation of splitting the strings**

In [137]:
%%timeit 
get_lists_naive(googleplaystore)

12.5 ms ± 547 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### A. Using joblib

In [38]:
from joblib import Parallel, delayed

In [None]:
# Your functions here

def get_lists_joblib(df):
    """
    Exercise placeholder: joblib-parallel counterpart of `get_lists_naive`.

    The correctness check in this section passes the return value to
    `make_dummies` and compares it with `true_dummies`, so a finished
    implementation should return the same list of genre lists that
    `get_lists_naive(df)` produces. As written, it returns None.
    """
    genre_lists = None
    
    # Your code here
    
    return genre_lists

In [42]:
%%timeit 
# Execute your computation here
# Hint: split the dataframe into chunks, process each chunk in parallel, then concatenate
pass

12 ns ± 0.325 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [101]:
# Check for correctness
assert (make_dummies(get_lists_joblib(googleplaystore)).values == true_dummies.values).all()

### B. Using numba

In [46]:
from numba import jit, prange

In [44]:
# Your functions here

def get_lists_numba(df):
    """
    Exercise placeholder: numba-based counterpart of `get_lists_naive`.

    The correctness check in this section passes the return value to
    `make_dummies` and compares it with `true_dummies`, so a finished
    implementation should return the same list of genre lists that
    `get_lists_naive(df)` produces. As written, it returns None.
    """
    genre_lists = None
    
    # Your code here
    
    return genre_lists

In [45]:
%%timeit 
# Execute your computation here
pass

12.1 ns ± 0.726 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [None]:
# Check for correctness
assert (make_dummies(get_lists_numba(googleplaystore)).values == true_dummies.values).all()

### C. Using raw dask

In [66]:
from dask.distributed import Client
# NOTE: this rebinds the global name `delayed`, replacing joblib's `delayed`
# imported in section A. Any later call that looks up `delayed` gets dask's
# version unless the joblib import cell is re-run.
from dask import delayed

# Create a dask.distributed client, requesting 4 workers.
client = Client(n_workers=4)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37501 instead


In [79]:
# Your functions and scheduling here

result = None

In [None]:
# Visualize your computation graph

result.visualize()

In [49]:
%%timeit 
# Execute your computation here
pass

11.8 ns ± 0.183 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [50]:
client.close()

In [None]:
# Check for correctness
assert (make_dummies(result).values == true_dummies.values).all()

#### Side note: we could use Dask DataFrame API

In [74]:
import dask.dataframe as dd

In [75]:
df_dask = dd.from_pandas(googleplaystore, npartitions=10)
df_dask.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [76]:
#Schedule
result = df_dask['Genres'].apply(lambda g: re.split('[;,]', g),  meta=('Genres', 'object')).values

result.visualize()

In [77]:
%%timeit 
result.compute()

60.7 ms ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


**Question:** why is the Dask DataFrame computation slower? In what cases is this to be expected? 

**Side note 2**: We could also use Modin

In [129]:
import modin.pandas as mpd

df_modin = mpd.DataFrame(googleplaystore)

In [136]:
%%timeit
get_lists_naive(df_modin)

83.6 ms ± 6.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [132]:
# Check for correctness
assert (make_dummies(get_lists_naive(df_modin)).values == true_dummies.values).all()

# Task 2: Optimizing computation-heavy code 

One way to estimate the value of $\pi$ is the Monte-Carlo simulation method.
It goes like this:
1. Pick a random point on a 2D plane, inside the square that encloses the unit circle
2. Check if it's within the unit circle or outside of the unit circle
3. The ratio `points_in_circle/total_points` tends towards the ratio of the circle's area to the square's area, which is $\pi / 4$.
4. Knowing this ratio, we can estimate $\pi$ as $\pi \approx 4 \times \frac{points\_in\_circle}{total\_points}$

Visualization here: https://www.geogebra.org/m/UeqCG547

Original exercise: https://github.com/Yurlungur/cython-example/blob/master/Cython%20Pi%20Monte%20Carlo.ipynb

In [116]:
# First import libraries we need:
import random

def estimate_pi(square_count,circle_count):
    """
    Convert sample counts into an estimate of pi.

    The fraction of points that fall inside the circle approximates
    (circle area) / (square area) = pi/4, so pi is about four times that
    fraction. With no samples at all (square_count == 0) the result is 0.
    """
    if square_count == 0:
        # Nothing sampled yet: avoid dividing by zero.
        return 0
    scaled_hits = 4*float(circle_count)
    return scaled_hits/float(square_count)

def monte_carlo_for_pi(num_random_samples, seed=None):
    """
    Estimate pi with a Monte Carlo simulation.

    Points are drawn uniformly from the square that encloses a circle of
    radius `radius` centred at the origin; the fraction that lands inside
    the circle is turned into an estimate of pi by `estimate_pi`.

    Parameters
    ----------
    num_random_samples : int
        Number of random points to draw.
    seed : optional
        Passed to `random.seed`. The default (None) seeds from system
        entropy, exactly as a bare `random.seed()` call does; pass a fixed
        value to get a reproducible estimate.

    Returns
    -------
    The value returned by `estimate_pi(total_count, circle_count)`.
    """
    # variable definitions
    radius = 1 # Radius of the circle (half the side of the square)
    radius_squared = radius**2
    rectangle_xmin = -radius
    rectangle_ymin = rectangle_xmin
    rectangle_width = 2*radius
    rectangle_height = rectangle_width
    rectangle_xmax = rectangle_xmin + rectangle_width
    rectangle_ymax = rectangle_ymin + rectangle_height

    # The total count and circle count
    total_count = 0
    circle_count = 0

    # Start the pseudorandom process (re-seeds the global `random` generator)
    random.seed(seed)

    # throw darts and see if they land in the circle
    for _ in range(num_random_samples):
        x = random.uniform(rectangle_xmin,rectangle_xmax)
        y = random.uniform(rectangle_ymin,rectangle_ymax)
        # Compare squared distance with the squared radius; comparing with
        # `radius` itself is only correct when radius == 1.
        if x**2 + y**2 <= radius_squared:
            circle_count += 1
        total_count += 1

    return estimate_pi(total_count,circle_count)

In [117]:
monte_carlo_for_pi(1000)

3.164

In [118]:
monte_carlo_for_pi(100000)

3.14792

In [119]:
%%timeit
monte_carlo_for_pi(1000000)

1.03 s ± 38.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


**Task: Optimize the code above**

### A. Using numba

In [None]:
from numba import jit, prange

In [120]:
@jit(nopython=True)
def estimate_pi_numba(square_count, circle_count):
    """
    Exercise placeholder: numba-compiled counterpart of `estimate_pi`.

    Currently a stub whose body is `pass`.
    """
    # Your code here
    pass

@jit(nopython=True) # Consider trying parallel=True too (together with prange)
def monte_carlo_for_pi_numba(num_random_samples):
    """
    Exercise placeholder: numba-compiled counterpart of `monte_carlo_for_pi`.

    Currently a stub whose body is `pass`.
    """
    # Your code here
    pass

In [123]:
# Compile before measuring speed
estimate_pi_numba(1, 1)
monte_carlo_for_pi_numba(1)

In [None]:
# Check correctness
monte_carlo_for_pi_numba(1000000)

In [124]:
%%timeit
monte_carlo_for_pi_numba(1000000)

218 ns ± 6.68 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


### B. Using Cython

In [109]:
%load_ext cython

In [114]:
%%cython
import random
def estimate_pi_cy1(square_count, circle_count):
    """
    Exercise placeholder: Cython counterpart of `estimate_pi`.

    Currently a stub whose body is `pass`.
    """
    # Your code here
    pass

def monte_carlo_for_pi_cy1(num_random_samples):
    """
    Exercise placeholder: Cython counterpart of `monte_carlo_for_pi`.

    Currently a stub whose body is `pass`.
    """
    # Your code here
    pass

In [None]:
# Check correctness
monte_carlo_for_pi_cy1(1000000)

In [115]:
%%timeit
monte_carlo_for_pi_cy1(1000000)

46.9 ns ± 1.46 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
