<a href="https://colab.research.google.com/github/carloslme/wizeline-bootcamp/blob/main/pyspark/user_behavior.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Common imports
import numpy as np
import os

# Parquet imports
import pyarrow as pa
import pyarrow.parquet as pq

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Set matplotlib defaults for every figure created after this cell:
# axes label size 14, x- and y-tick label size 12.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


import pandas as pd
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Install dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz 
!tar -xvf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install fsspec
!pip install gcsfs

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf

# Reuse the active SparkContext when this cell is executed again: a second bare
# `SparkContext()` call in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# Returns the existing SparkSession if there is one, otherwise builds a new one.
spark = SparkSession.builder.getOrCreate()

In [4]:
from google.oauth2 import service_account
from google.cloud.storage import client
import io
import pandas as pd
from io import BytesIO
import json
import filecmp

In [6]:
# Imported under its own name on purpose: the notebook's earlier
# `from google.cloud.storage import client` binds the *module* to `client`, and the
# assignment at the bottom of this cell rebinds `client` to a Client instance.
# Building the instance through `storage.Client` keeps this cell re-runnable.
from google.cloud import storage

# Load the service-account key file and request the cloud-platform scope.
credentials = service_account.Credentials.from_service_account_file(
    '/content/gcs_service_account.json',
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Storage client used by the download helper; the project id is taken from the
# loaded credentials.
client = storage.Client(
    credentials=credentials,
    project=credentials.project_id,
)

In [7]:
# Name of the Google Cloud Storage bucket that `download_file` reads from.
BUCKET = 'staging-layer-330021'

In [8]:
def download_file(local_filename, remote_filename):
    """Copy one object from the ``BUCKET`` bucket to a local file.

    Uses the module-level ``client`` to look up the bucket named by ``BUCKET``.

    Args:
        local_filename: Path the downloaded content is written to.
        remote_filename: Name of the object inside the bucket.
    """
    source_blob = client.get_bucket(BUCKET).blob(remote_filename)
    source_blob.download_to_filename(local_filename)

In [9]:
# Fetch the `user_purchase.csv` object from the bucket into /content.
download_file("/content/user_purchase.csv","user_purchase.csv")

In [13]:
# Fetch both part files stored under `reviews.parquet/` in the bucket into /content,
# keeping each file's original name.
for local_path, remote_path in [
    (
        "/content/part-00000-59bf5da5-e43a-4eba-8c9b-153c96ee2a85-c000.snappy.parquet",
        "reviews.parquet/part-00000-59bf5da5-e43a-4eba-8c9b-153c96ee2a85-c000.snappy.parquet",
    ),
    (
        "/content/part-00001-59bf5da5-e43a-4eba-8c9b-153c96ee2a85-c000.snappy.parquet",
        "reviews.parquet/part-00001-59bf5da5-e43a-4eba-8c9b-153c96ee2a85-c000.snappy.parquet",
    ),
]:
    download_file(local_path, remote_path)

In [31]:
# Read the review part files that were downloaded into /content. The path is
# absolute so the read does not depend on the kernel's working directory, and the
# CSV-only `header` option is dropped because Parquet files carry their own schema.
df_reviews = spark.read.parquet('/content/*.parquet')

In [32]:
# Read exactly the purchases file downloaded into /content instead of every `*.csv`
# in the working directory, so stray CSV files cannot be mixed into the DataFrame.
# `header=True` takes the column names from the first line of the file.
df_user_purchase = spark.read.options(header=True).csv('/content/user_purchase.csv')

In [25]:
# Display the column names of the reviews DataFrame.
df_reviews.columns

['user_id', 'positive_review']

In [27]:
# Display the column names of the purchases DataFrame.
df_user_purchase.columns

['invoice_number',
 'stock_code',
 'detail',
 'quantity',
 'invoice_date',
 'unit_price',
 'customer_id',
 'country']

In [35]:
from pyspark.sql.functions import *

# Inner-join reviews to purchases on reviews.user_id == purchases.customer_id and keep
# the review flag followed by every purchase column.
# NOTE(review): the displayed sample repeats one purchase row with different
# `positive_review` values, which suggests several review rows per user -- confirm
# whether reviews should be aggregated per user before joining.
data = df_reviews.join(
    df_user_purchase,
    on=df_reviews.user_id == df_user_purchase.customer_id,
    how="inner",
).select(
    df_reviews["positive_review"],
    df_user_purchase["*"],
)

In [36]:
# Print the first 10 rows of the joined DataFrame as a text table
# (long string values appear truncated with "..." in the output).
data.show(10)

+---------------+--------------+----------+--------------------+--------+------------+----------+-----------+--------------+
|positive_review|invoice_number|stock_code|              detail|quantity|invoice_date|unit_price|customer_id|       country|
+---------------+--------------+----------+--------------------+--------+------------+----------+-----------+--------------+
|              0|        536365|    85123A|WHITE HANGING HEA...|       6|1291191960.0|      2.55|      17850|United Kingdom|
|              1|        536365|    85123A|WHITE HANGING HEA...|       6|1291191960.0|      2.55|      17850|United Kingdom|
|              0|        536365|    85123A|WHITE HANGING HEA...|       6|1291191960.0|      2.55|      17850|United Kingdom|
|              0|        536365|    85123A|WHITE HANGING HEA...|       6|1291191960.0|      2.55|      17850|United Kingdom|
|              0|        536365|    85123A|WHITE HANGING HEA...|       6|1291191960.0|      2.55|      17850|United Kingdom|
