<a href="https://colab.research.google.com/github/carloslme/wizeline-bootcamp/blob/main/pyspark/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import dependencies

In [1]:
# Common imports
import numpy as np
import os

# Parquet imports
import pyarrow as pa
import pyarrow.parquet as pq

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Label font sizes for every figure: axis labels at 14, tick labels at 12.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})


import pandas as pd
import seaborn as sns
import warnings
# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter('ignore', FutureWarning)

## Set up PySpark session

In [2]:
# Install dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz 
!tar -xvf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install fsspec
!pip install gcsfs

  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=900701ea23b32bb60a49befcf2b9636f5e72283f2d84fe5efadedefcf9e3378d
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf

# Create the session with getOrCreate() and take the context from it, so the
# cell can be re-run safely: calling SparkContext() a second time in the same
# kernel raises because only one context may be active at a time.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

### Read data from raw layer bucket
Instead of pulling the data from the bucket, we upload the CSV file directly to this Colab session and work with it from there.

Once this script is deployed in Airflow using Dataproc, it should not be necessary to set up credentials.

In [21]:
# Load the uploaded reviews file; the first line of the CSV is the header row.
df = spark.read.csv('/movie_review.csv', header=True)

In [23]:
# Peek at the first five rows as a list of Row objects.
df.take(5)

[Row(cid='13756', review_str="Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."),
 Row(cid='15738', review_str="This is an example of why the majori

In [25]:
# List the column names read from the CSV header.
df.columns

['cid', 'review_str']

In [22]:
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import BooleanType, IntegerType

# Tokenize the review text into the 'review_token' column, using the regex
# \W (non-word characters) as the tokenizer pattern.
regexTokenizer = RegexTokenizer(pattern='\\W', inputCol='review_str', outputCol='review_token')
regexTokenized = regexTokenizer.transform(df)

In [27]:
# Stop-word removal stage: reads 'review_token' and writes 'filtered'.
# No custom stopWords list is passed, so the remover's built-in default applies.
remover = StopWordsRemover(inputCol="review_token", outputCol="filtered")

In [28]:
# Apply the stop-word remover and keep only the review id and its filtered tokens.
clean_df = (
    remover
    .transform(regexTokenized)
    .select('cid', 'filtered')
)

In [29]:
# Flag positive reviews: a review is marked True when the token 'good'
# appears among its filtered words (rows are flagged, not dropped).
word_good = udf(lambda words: 'good' in words, BooleanType())

# The chain is wrapped in parentheses so the .select() call can continue on
# its own line; without them the indented continuation is a syntax error.
reviews_bool = (
    clean_df
    .withColumn('positive_review_bool', word_good(col('filtered')))
    .select('cid', 'positive_review_bool')
)

In [31]:
# Inspect the first five flagged rows.
reviews_bool.take(5)

[Row(cid='13756', positive_review_bool=False),
 Row(cid='15738', positive_review_bool=False),
 Row(cid='15727', positive_review_bool=False),
 Row(cid='17954', positive_review_bool=False),
 Row(cid='16579', positive_review_bool=True)]

In [33]:
# Turn the boolean flag into an integer column: 1 when the flag is true, 0 otherwise.
reviews = (
    reviews_bool
    .withColumn("positive_review", when(reviews_bool['positive_review_bool'], 1).otherwise(0))
    .select('cid', 'positive_review')
)

In [35]:
# Print the first five rows of the final table.
reviews.show(n=5)

+-----+---------------+
|  cid|positive_review|
+-----+---------------+
|13756|              0|
|15738|              0|
|15727|              0|
|17954|              0|
|16579|              1|
+-----+---------------+
only showing top 5 rows



In [None]:
# Persist the result as parquet (replace BUCKET-NAME-HERE with the target bucket).
reviews.write.format('parquet').save('gs://BUCKET-NAME-HERE/reviews.parquet')

## Script for Dataproc


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import BooleanType, IntegerType

# The script must obtain its own session: nothing else defines `spark` when
# it runs outside this notebook. getOrCreate() reuses a session if one exists.
spark = SparkSession.builder.getOrCreate()

# Read the reviews CSV; the first line is the header row.
# NOTE(review): this is the path used in the Colab session above - confirm the
# input location for the deployed job.
df = spark.read.options(header=True).csv('/movie_review.csv')

# Tokenizing the words
regexTokenizer = RegexTokenizer(
    inputCol='review_str',
    outputCol='review_token',
    pattern='\\W'
    )
regexTokenized = regexTokenizer.transform(df)

# Removing stop words (no custom list is passed, so the remover's default applies)
remover = StopWordsRemover(
    inputCol="review_token",
    outputCol="filtered",
    )

clean_df = remover.transform(regexTokenized).select('cid', 'filtered')

# Flag positive reviews: True when the token 'good' appears among the
# filtered words (rows are flagged, not dropped).
word_good = udf(lambda words: 'good' in words, BooleanType())

# Parentheses let the chained .select() continue on its own line; without
# them the indented continuation is a syntax error.
reviews_bool = (
    clean_df
    .withColumn('positive_review_bool', word_good(col('filtered')))
    .select('cid', 'positive_review_bool')
)

# Converting "positive_review_bool" column from boolean to int
reviews = (
    reviews_bool
    .withColumn("positive_review",
                when(reviews_bool.positive_review_bool == True, 1).otherwise(0))
    .select('cid', 'positive_review')
)

# Saving data frame as parquet (replace BUCKET-NAME-HERE with the target bucket)
reviews.write.parquet('gs://BUCKET-NAME-HERE/reviews.parquet')