In [236]:
import os
import json
import logging
from pathlib import Path
import configparser
from shutil import rmtree

In [114]:
def load_config_file(filepath: str):
    """
    Parse an INI-style configuration file.

    Arguments:
        filepath {str} -- Path of the configuration file to read

    Returns:
        configparser.ConfigParser -- Parser populated with the file contents

    Raises:
        FileNotFoundError: If the file does not exist or could not be read
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it actually parsed, so an empty result means nothing was loaded.
    if not config.read(filepath):
        raise FileNotFoundError(f"Config file not found or unreadable: {filepath}")
    return config

def setup_aws_env():
    """
    Copy the AWS credentials and default region from the [AWS] section of
    ./aws-config.cfg into the process environment.
    """
    aws_section = load_config_file('./aws-config.cfg')['AWS']
    for variable in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION'):
        os.environ[variable] = aws_section[variable]
setup_aws_env()

In [115]:
# Reset the root logger so that re-running this cell does not stack handlers,
# then send INFO-and-above records to a single stream handler.
logger = logging.getLogger()
logger.handlers = []
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)
logger.info("Logging Set-up")

Logging Set-up


### Write test data to S3 - Reduced size datasets

In [23]:
import s3fs
bucket_name = 'yelp-customer-reviews'
s3 = s3fs.S3FileSystem(anon=False)


raw_data_path = "./data/raw"
test_raw_data_path = "./data/raw-test"

# Number of leading lines of each raw file copied into the reduced test dataset.
TEST_SAMPLE_LINES = 1000

# Map each raw JSON file's stem to its local path (sorted for a stable order).
paths = {}
for entry in sorted(os.listdir(raw_data_path)):
    if entry.endswith('.json'):
        path = Path(raw_data_path) / Path(entry)
        paths[path.stem] = path

for filename, path in paths.items():
    s3_uri = f's3://{bucket_name}/raw-test/{filename}.json'
    # Copy in binary mode so the lines are transferred byte-for-byte and the
    # result does not depend on the platform's default text encoding.
    with open(path, 'rb') as f_in, s3.open(s3_uri, 'wb') as f_out:
        for index, line in enumerate(f_in):
            # Check the cap before writing so exactly TEST_SAMPLE_LINES lines are copied.
            if index >= TEST_SAMPLE_LINES:
                break
            f_out.write(line)

{'yelp_academic_dataset_checkin': PosixPath('data/raw/yelp_academic_dataset_checkin.json'),
 'yelp_academic_dataset_user': PosixPath('data/raw/yelp_academic_dataset_user.json'),
 'yelp_academic_dataset_business': PosixPath('data/raw/yelp_academic_dataset_business.json'),
 'yelp_academic_dataset_tip': PosixPath('data/raw/yelp_academic_dataset_tip.json'),
 'yelp_academic_dataset_review': PosixPath('data/raw/yelp_academic_dataset_review.json')}

### Process Data in S3 Using PySpark

In [31]:
# Toggle between the full raw datasets and the reduced test copies.
TEST = True
bucket_name = 'yelp-customer-reviews'
root_path = 'raw-test' if TEST else 'raw'

# Key every object under the chosen prefix by the last underscore-separated
# token of its file stem, and store its full s3:// URI as the value.
dataset_uris_dict = {
    Path(object_path).stem.split('_')[-1]: f"s3://{object_path}"
    for object_path in s3.ls(f"{bucket_name}/{root_path}")
}
dataset_uris_dict

{'business': 's3://yelp-customer-reviews/raw-test/yelp_academic_dataset_business.json',
 'checkin': 's3://yelp-customer-reviews/raw-test/yelp_academic_dataset_checkin.json',
 'review': 's3://yelp-customer-reviews/raw-test/yelp_academic_dataset_review.json',
 'tip': 's3://yelp-customer-reviews/raw-test/yelp_academic_dataset_tip.json',
 'user': 's3://yelp-customer-reviews/raw-test/yelp_academic_dataset_user.json'}

In [230]:
def move_directory_to_s3(local_directory:str, bucket_name:str, root_prefix:str, filetype:str):
    """
    Upload every file with the given extension found under a local directory
    (recursively) to S3, preserving the directory layout below the root prefix.

    Arguments:
        local_directory {str} -- Local directory to scan
        bucket_name {str} -- Destination bucket
        root_prefix {str} -- Key prefix that replaces the local directory root
        filetype {str} -- File extension to upload, without the leading dot
    """
    base_dir = Path(local_directory)
    prefix = root_prefix.strip('/')

    for entry in base_dir.rglob(f'*.{filetype}'):
        # Build the key from the path relative to the scanned directory rather
        # than with str.replace(), which would also rewrite later occurrences of
        # the directory name inside the path and depends on the OS separator.
        relative_key = entry.relative_to(base_dir).as_posix()
        object_key = f'{prefix}/{relative_key}' if prefix else relative_key
        s3_uri = f's3://{bucket_name}/{object_key}'
        print(s3_uri)
        s3.put(str(entry), s3_uri)

In [250]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, month,dayofmonth, year
from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType
from pyspark.sql.functions import udf

class SparkDF(object):
    """
    Utility class to handle common operations related to Spark Dataframes.

    On construction a Spark session is obtained and the JSON data found at
    ``filepath`` is loaded into ``self.df``; the helper methods then replace
    ``self.df`` with the transformed dataframe.
    """

    def __init__(self, filepath:str):
        self.spark = self.create_spark_session()
        self.df = self._load_json_data(filepath)

    def create_spark_session(self):
        """
        Create (or reuse) a Spark session

        Returns:
            SparkSession -- Session returned by SparkSession.builder.getOrCreate()
        """
        # NOTE(review): the submit args request hadoop-aws 2.7.2 while
        # spark.jars.packages below requests 2.7.0 -- confirm which one is intended.
        os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk-pom:1.10.34,org.apache.hadoop:hadoop-aws:2.7.2 pyspark-shell'

        spark = SparkSession \
            .builder \
            .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
            .getOrCreate()
        return spark

    def _load_json_data(self, filepath:str):
        """
        Load JSON data from S3 to a Dataframe

        If Spark reports that it has no filesystem for the ``s3`` scheme, the
        read is retried once with the ``s3a://`` scheme.

        Arguments:
            filepath {str} -- URI of the JSON data to load

        Returns:
            Spark Dataframe -- Spark dataframe with contents of JSON files

        Raises:
            Exception: Any other error raised by spark.read.json is re-raised unchanged
        """
        logger.info(f"Loading file: {filepath}")
        try:
            return self.spark.read.json(filepath)
        except Exception as e:
            if "No FileSystem for scheme: s3" not in str(e):
                # Bare raise keeps the original traceback intact.
                raise
            logger.warning("Switching to slow S3a loading method")
            return self.spark.read.json(filepath.replace("s3://", "s3a://"))

    def subset_df(self, columns:list, option:str):
        """
        Keep or drop the given columns of the dataframe

        Arguments:
            columns {list} -- Column names to keep or to drop
            option {str} -- Either 'keep' or 'drop'

        Raises:
            ValueError: If option is neither 'keep' nor 'drop'
        """
        if option == 'keep':
            self.df = self.df.select(*columns)
        elif option == 'drop':
            self.df = self.df.drop(*columns)
        else:
            # An unknown option used to be a silent no-op, which hides typos.
            raise ValueError(f"option must be 'keep' or 'drop', got {option!r}")

    def _write_to_parquet(self, s3_output_path: str, mode: str = 'overwrite', partitions: list = None):
        """
        Writes Spark Dataframe to S3 in the Parquet Format

        The dataframe is first written to the local ./temp directory and the
        resulting parquet files are then uploaded with move_directory_to_s3.
        If that fails because Spark has no filesystem for the ``s3`` scheme,
        the dataframe is written directly to the ``s3a://`` form of the output
        path instead. The local staging directory is removed in every case.

        Arguments:
            s3_output_path {str} -- Output path in S3

        Keyword Arguments:
            mode {str} -- Writing mode (default: {'overwrite'})
            partitions {list} -- List of fields to partition the data by
                (default: {None}, treated as no partitioning)

        Raises:
            Exception: Re-raises any other error thrown by the write.parquet
                method of the Spark dataframe or by the upload
        """
        # A None default avoids sharing one mutable list between calls.
        partitions = list(partitions) if partitions else []
        local_temp_dir = Path('./temp')
        os.makedirs(local_temp_dir, exist_ok=True)
        bucket_name, root_prefix, _ = s3.split_path(s3_output_path)

        try:
            logger.info(s3_output_path)
            self.df.write.parquet(
                str(local_temp_dir),
                mode=mode,
                partitionBy=partitions
            )
            move_directory_to_s3(local_temp_dir, bucket_name, root_prefix, 'parquet')
        except Exception as e:
            if "No FileSystem for scheme: s3" not in str(e):
                raise
            logger.warning("Switching to slow S3 output method")
            self.df.write.parquet(
                s3_output_path.replace("s3://", "s3a://"),
                mode=mode,
                partitionBy=partitions
            )
        finally:
            # Always drop the staging directory so a failed run cannot leave
            # stale files behind for the next write to pick up.
            rmtree(local_temp_dir, ignore_errors=True)

In [253]:
class Business(SparkDF):
    """
    Spark dataframe wrapper for the 'business' dataset
    """

    def __init__(self, dataset_uris_dict:dict):
        source_uri = dataset_uris_dict[self.name]
        super().__init__(source_uri)

    @property
    def name(self):
        """Dataset key, also used as the output sub-folder name"""
        return 'business'

    def get_partitions(self):
        """Names of the partition columns added by apply_partitioning"""
        return ['pstate','pcity']

    def process(self):
        """Reduce the dataframe to the business attributes listed below"""
        retained_columns = [
            'business_id', 'name', 'categories',
            'state', 'city', 'address', 'postal_code',
            'review_count', 'stars',
        ]
        self.subset_df(retained_columns, option='keep')

    def apply_partitioning(self):
        """Append 'pstate' and 'pcity' as copies of 'state' and 'city'"""
        partition_columns = [
            col(source).alias(alias)
            for source, alias in (("state", "pstate"), ("city", "pcity"))
        ]
        self.df = self.df.select('*', *partition_columns)

    def write_to_s3(self, s3_path:str, partitioned:bool=False):
        """
        Write the dataframe as parquet under '<s3_path>/<name>'

        Arguments:
            s3_path {str} -- Root output path in S3

        Keyword Arguments:
            partitioned {bool} -- Partition by get_partitions() when true (default: {False})
        """
        partitions = self.get_partitions() if partitioned else []
        self._write_to_parquet(f"{s3_path}/{self.name}", partitions=partitions)


In [254]:
# Output roots must exist before the first write; they were previously only
# defined in a later cell, which broke a top-to-bottom run on a fresh kernel.
processed_uri = f's3://{bucket_name}/processed'
data_lake_uri = f's3://{bucket_name}/data-lake'

# Write the processed business data unpartitioned first, then add the
# partition columns and write the partitioned copy to the data lake.
business = Business(dataset_uris_dict)
business.process()
business.write_to_s3(processed_uri, partitioned=False)
business.apply_partitioning()
business.write_to_s3(data_lake_uri, partitioned=True)

Loading file: s3://yelp-customer-reviews/raw-test/yelp_academic_dataset_business.json
Switching to slow S3a loading method
s3://yelp-customer-reviews/processed/business


s3://yelp-customer-reviews/processed/business/part-00000-b4c4f34a-4ab9-40a8-a91e-b11e15a930b2-c000.snappy.parquet


s3://yelp-customer-reviews/data-lake/business


s3://yelp-customer-reviews/data-lake/business/pstate=SC/pcity=Rock Hill/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=SC/pcity=Fort Mill/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=QC/pcity=Westmount/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=QC/pcity=Outremont/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=QC/pcity=Longueuil/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=QC/pcity=Montreal-Nord/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=QC/pcity=Saint-Laurent/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-cu

s3://yelp-customer-reviews/data-lake/business/pstate=OH/pcity=Hinckley/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=OH/pcity=Lorain/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=NC/pcity=Charlotte/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=NC/pcity=Pineville/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=NC/pcity=Waxhaw/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=NC/pcity=Concord/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=NC/pcity=Mint Hill/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/da

s3://yelp-customer-reviews/data-lake/business/pstate=WI/pcity=Cottage Grove/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=WI/pcity=Stoughton/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=WI/pcity=Fitchburg/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=WI/pcity=Black Earth/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=WI/pcity=Monona/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=WI/pcity=Mc Farland/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/business/pstate=WI/pcity=De Forest/part-00000-fe4b0038-cbfe-480e-ba28-2943eba1b170.c000.snappy.parquet
s3://yelp-custom

In [255]:
# Output roots inside the bucket: one for the unpartitioned processed data and
# one for the partitioned data lake.
processed_uri, data_lake_uri = (
    f's3://{bucket_name}/{stage}' for stage in ('processed', 'data-lake')
)

In [233]:
class User(SparkDF):
    """
    Spark dataframe wrapper for the 'user' dataset
    """

    def __init__(self, dataset_uris_dict: dict):
        source_uri = dataset_uris_dict[self.name]
        super().__init__(source_uri)

    @property
    def name(self):
        """Dataset key, also used as the output sub-folder name"""
        return 'user'

    def get_partitions(self):
        """Names of the partition columns added by apply_partitioning"""
        return ['pyear', 'pmonth', 'pday']

    def process(self):
        """
        Replace the 'yelping_since' string column with its timestamp version

        The parsed column is appended, the original is dropped and the parsed
        column then takes over the original name.
        """
        parsed_since = to_timestamp(
            col('yelping_since'), 'yyyy-MM-dd HH:mm:ss'
        ).alias('yelping_since_dt')
        self.df = self.df.select('*', parsed_since)
        self.subset_df(['yelping_since'], option='drop')
        self.df = self.df.withColumnRenamed("yelping_since_dt", "yelping_since")

    def apply_partitioning(self):
        """Append month, year and day-of-month columns derived from 'yelping_since'"""
        derived_columns = (
            ("pmonth", month),
            ("pyear", year),
            ("pday", dayofmonth),
        )
        frame = self.df
        for column_name, extract in derived_columns:
            frame = frame.withColumn(column_name, extract("yelping_since"))
        self.df = frame.select('*')

    def write_to_s3(self, s3_path: str, partitioned: bool = False):
        """
        Write the dataframe as parquet under '<s3_path>/<name>'

        Arguments:
            s3_path {str} -- Root output path in S3

        Keyword Arguments:
            partitioned {bool} -- Partition by get_partitions() when true (default: {False})
        """
        partitions = self.get_partitions() if partitioned else []
        self._write_to_parquet(f"{s3_path}/{self.name}", partitions=partitions)


In [192]:
# Load and process the user dataset, write it unpartitioned under processed_uri,
# then add the partition columns and write the partitioned copy under data_lake_uri.
# The order matters: each step replaces user.df in place.
user = User(dataset_uris_dict)
user.process()
user.write_to_s3(processed_uri, partitioned=False)
user.apply_partitioning()
user.write_to_s3(data_lake_uri, partitioned=True)



Loading file: s3://yelp-customer-reviews/raw-test/yelp_academic_dataset_user.json
Switching to slow S3a loading method
s3://yelp-customer-reviews/processed/user
Switching to slow S3 output method
s3://yelp-customer-reviews/data-lake/user
Switching to slow S3 output method


In [191]:
# Preview five rows of selected user columns together with the derived partition columns.
user.df.select('name', 'average_stars', 'yelping_since', 'pyear', 'pmonth', 'pday').show(5)

+--------+-------------+-------------------+-----+------+----+
|    name|average_stars|      yelping_since|pyear|pmonth|pday|
+--------+-------------+-------------------+-----+------+----+
|  Rafael|         3.57|2007-07-06 03:27:11| 2007|     7|   6|
|Michelle|         3.84|2008-04-28 01:29:25| 2008|     4|  28|
|  Martin|         3.44|2008-08-28 23:40:05| 2008|     8|  28|
|    John|         3.08|2008-09-20 00:08:14| 2008|     9|  20|
|    Anne|         4.37|2008-08-09 00:30:27| 2008|     8|   9|
+--------+-------------+-------------------+-----+------+----+
only showing top 5 rows



In [256]:
class Review(SparkDF):
    """
    Spark dataframe wrapper for the 'review' dataset
    """

    def __init__(self, dataset_uris_dict:dict):
        source_uri = dataset_uris_dict[self.name]
        super().__init__(source_uri)

    @property
    def name(self):
        """Dataset key, also used as the output sub-folder name"""
        return 'review'

    def get_partitions(self):
        """Names of the partition columns added by apply_partitioning"""
        return ['pyear','pmonth', 'pday']

    def process(self):
        """Append 'dt', the 'date' string parsed as a timestamp, and drop 'date'"""
        parsed_date = to_timestamp(col('date'), 'yyyy-MM-dd HH:mm:ss').alias('dt')
        self.df = self.df.select('*', parsed_date)
        self.subset_df(['date'], option='drop')

    def apply_partitioning(self):
        """Append month, year and day-of-month columns derived from 'dt'"""
        derived_columns = (
            ("pmonth", month),
            ("pyear", year),
            ("pday", dayofmonth),
        )
        frame = self.df
        for column_name, extract in derived_columns:
            frame = frame.withColumn(column_name, extract("dt"))
        self.df = frame.select('*')

    def write_to_s3(self, s3_path:str, partitioned:bool=False):
        """
        Write the dataframe as parquet under '<s3_path>/<name>'

        Arguments:
            s3_path {str} -- Root output path in S3

        Keyword Arguments:
            partitioned {bool} -- Partition by get_partitions() when true (default: {False})
        """
        partitions = self.get_partitions() if partitioned else []
        self._write_to_parquet(f"{s3_path}/{self.name}", partitions=partitions)


In [257]:
# Load and process the review dataset, write it unpartitioned under processed_uri,
# then add the partition columns and write the partitioned copy under data_lake_uri.
# The order matters: each step replaces review.df in place.
review = Review(dataset_uris_dict)
review.process()
review.write_to_s3(processed_uri, partitioned=False)
review.apply_partitioning()
review.write_to_s3(data_lake_uri, partitioned=True)

Loading file: s3://yelp-customer-reviews/raw-test/yelp_academic_dataset_review.json
Switching to slow S3a loading method
s3://yelp-customer-reviews/processed/review


s3://yelp-customer-reviews/processed/review/part-00000-43025d86-7d48-4b67-99c3-39a257a9b67e-c000.snappy.parquet


s3://yelp-customer-reviews/data-lake/review


s3://yelp-customer-reviews/data-lake/review/pyear=2013/pmonth=11/pday=14/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2013/pmonth=5/pday=7/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2013/pmonth=5/pday=8/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2013/pmonth=5/pday=28/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2013/pmonth=12/pday=7/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2013/pmonth=12/pday=29/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2013/pmonth=12/pday=28/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-

s3://yelp-customer-reviews/data-lake/review/pyear=2017/pmonth=7/pday=18/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2017/pmonth=7/pday=28/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2017/pmonth=2/pday=19/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2017/pmonth=2/pday=9/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2010/pmonth=11/pday=22/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2010/pmonth=5/pday=14/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2010/pmonth=3/pday=12/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-r

s3://yelp-customer-reviews/data-lake/review/pyear=2014/pmonth=2/pday=23/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2014/pmonth=2/pday=5/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2012/pmonth=11/pday=6/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2012/pmonth=5/pday=28/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2012/pmonth=12/pday=2/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2012/pmonth=12/pday=4/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2012/pmonth=3/pday=1/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-rev

s3://yelp-customer-reviews/data-lake/review/pyear=2006/pmonth=4/pday=21/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2016/pmonth=11/pday=10/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2016/pmonth=11/pday=21/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2016/pmonth=11/pday=9/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2016/pmonth=5/pday=7/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2016/pmonth=5/pday=31/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2016/pmonth=5/pday=19/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-

s3://yelp-customer-reviews/data-lake/review/pyear=2018/pmonth=8/pday=4/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2018/pmonth=6/pday=8/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2018/pmonth=6/pday=19/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2018/pmonth=9/pday=14/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2018/pmonth=9/pday=25/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2018/pmonth=9/pday=3/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/review/pyear=2018/pmonth=4/pday=15/part-00000-1a105bca-657b-4051-b38e-4faaf8ba43f8.c000.snappy.parquet
s3://yelp-customer-revi

In [258]:
# Preview five rows of the review stars and text together with the derived partition columns.
review.df.select('stars','text', 'pyear', 'pmonth', 'pday').show(5)

+-----+--------------------+-----+------+----+
|stars|                text|pyear|pmonth|pday|
+-----+--------------------+-----+------+----+
|  2.0|As someone who ha...| 2015|     4|  15|
|  1.0|I am actually hor...| 2013|    12|   7|
|  5.0|I love Deagan's. ...| 2015|    12|   5|
|  1.0|Dismal, lukewarm,...| 2011|     5|  27|
|  4.0|Oh happy day, fin...| 2017|     1|  14|
+-----+--------------------+-----+------+----+
only showing top 5 rows



In [259]:
class Tip(SparkDF):
    """
    Spark dataframe wrapper for the 'tip' dataset
    """

    def __init__(self, dataset_uris_dict:dict):
        source_uri = dataset_uris_dict[self.name]
        super().__init__(source_uri)

    @property
    def name(self):
        """Dataset key, also used as the output sub-folder name"""
        return 'tip'

    def get_partitions(self):
        """Names of the partition columns added by apply_partitioning"""
        return ['pyear','pmonth', 'pday']

    def process(self):
        """Append 'dt', the 'date' string parsed as a timestamp, and drop 'date'"""
        parsed_date = to_timestamp(col('date'), 'yyyy-MM-dd HH:mm:ss').alias('dt')
        self.df = self.df.select('*', parsed_date)
        self.subset_df(['date'], option='drop')

    def apply_partitioning(self):
        """Append month, year and day-of-month columns derived from 'dt'"""
        derived_columns = (
            ("pmonth", month),
            ("pyear", year),
            ("pday", dayofmonth),
        )
        frame = self.df
        for column_name, extract in derived_columns:
            frame = frame.withColumn(column_name, extract("dt"))
        self.df = frame.select('*')

    def write_to_s3(self, s3_path:str, partitioned:bool=False):
        """
        Write the dataframe as parquet under '<s3_path>/<name>'

        Arguments:
            s3_path {str} -- Root output path in S3

        Keyword Arguments:
            partitioned {bool} -- Partition by get_partitions() when true (default: {False})
        """
        partitions = self.get_partitions() if partitioned else []
        self._write_to_parquet(f"{s3_path}/{self.name}", partitions=partitions)


In [260]:
# Load and process the tip dataset, write it unpartitioned under processed_uri,
# then add the partition columns and write the partitioned copy under data_lake_uri.
# The order matters: each step replaces tip.df in place.
tip = Tip(dataset_uris_dict)
tip.process()
tip.write_to_s3(processed_uri, partitioned=False)
tip.apply_partitioning()
tip.write_to_s3(data_lake_uri, partitioned=True)

Loading file: s3://yelp-customer-reviews/raw-test/yelp_academic_dataset_tip.json
Switching to slow S3a loading method
s3://yelp-customer-reviews/processed/tip


s3://yelp-customer-reviews/processed/tip/part-00000-7b4f0554-74e3-41e4-858f-a239c48ec9e7-c000.snappy.parquet


s3://yelp-customer-reviews/data-lake/tip


s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=11/pday=26/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=11/pday=28/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=11/pday=30/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=5/pday=20/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=5/pday=17/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=5/pday=25/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=5/pday=2/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/ti

s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=6/pday=5/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=6/pday=3/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=6/pday=1/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=6/pday=9/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=6/pday=12/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=6/pday=28/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2013/pmonth=9/pday=17/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyea

s3://yelp-customer-reviews/data-lake/tip/pyear=2017/pmonth=6/pday=17/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2017/pmonth=6/pday=14/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2017/pmonth=6/pday=25/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2017/pmonth=6/pday=24/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2017/pmonth=6/pday=18/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2017/pmonth=6/pday=26/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2017/pmonth=6/pday=13/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/

s3://yelp-customer-reviews/data-lake/tip/pyear=2010/pmonth=1/pday=23/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2010/pmonth=1/pday=30/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2010/pmonth=8/pday=21/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2010/pmonth=6/pday=7/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2010/pmonth=6/pday=10/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2010/pmonth=6/pday=18/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2010/pmonth=6/pday=13/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/p

s3://yelp-customer-reviews/data-lake/tip/pyear=2011/pmonth=1/pday=10/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2011/pmonth=1/pday=8/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2011/pmonth=1/pday=18/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2011/pmonth=1/pday=27/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2011/pmonth=1/pday=9/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2011/pmonth=8/pday=20/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2011/pmonth=8/pday=25/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/py

s3://yelp-customer-reviews/data-lake/tip/pyear=2014/pmonth=3/pday=24/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2014/pmonth=3/pday=26/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2014/pmonth=3/pday=29/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2014/pmonth=3/pday=13/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2014/pmonth=3/pday=19/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2014/pmonth=3/pday=27/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2014/pmonth=3/pday=21/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/

s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=11/pday=17/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=11/pday=14/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=11/pday=2/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=11/pday=8/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=11/pday=5/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=11/pday=6/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=11/pday=9/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/ti

s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=1/pday=10/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=1/pday=8/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=1/pday=4/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=1/pday=3/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=1/pday=11/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=1/pday=6/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2012/pmonth=1/pday=1/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear

s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=5/pday=29/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=5/pday=19/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=5/pday=21/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=5/pday=22/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=5/pday=30/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=12/pday=23/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=12/pday=31/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/ti

s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=7/pday=8/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=2/pday=20/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=2/pday=17/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=2/pday=16/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=2/pday=7/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=2/pday=23/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2015/pmonth=2/pday=8/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pye

s3://yelp-customer-reviews/data-lake/tip/pyear=2016/pmonth=1/pday=1/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2016/pmonth=1/pday=29/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2016/pmonth=1/pday=13/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2016/pmonth=1/pday=9/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2016/pmonth=1/pday=12/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2016/pmonth=8/pday=17/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2016/pmonth=8/pday=14/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/py

s3://yelp-customer-reviews/data-lake/tip/pyear=2018/pmonth=4/pday=6/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2018/pmonth=4/pday=27/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2018/pmonth=4/pday=9/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet
s3://yelp-customer-reviews/data-lake/tip/pyear=2018/pmonth=2/pday=9/part-00000-1520a270-2f65-4039-aa1c-a2939ffb6ec0.c000.snappy.parquet


In [184]:
# Print the schema of the tip dataframe after processing and partitioning.
tip.df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- compliment_count: long (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- dt: timestamp (nullable = true)
 |-- pmonth: integer (nullable = true)
 |-- pyear: integer (nullable = true)
 |-- pday: integer (nullable = true)



In [185]:
# Preview the first five rows of the tip dataframe.
tip.df.show(5)

+--------------------+----------------+--------------------+--------------------+-------------------+------+-----+----+
|         business_id|compliment_count|                text|             user_id|                 dt|pmonth|pyear|pday|
+--------------------+----------------+--------------------+--------------------+-------------------+------+-----+----+
|UYX5zL_Xj9WEc_Wp-...|               0|Here for a quick mtg|hf27xTME3EiCp6NL6...|2013-11-26 18:20:08|    11| 2013|  26|
|Ch3HkwQYv1YKw_FO0...|               0|Cucumber strawber...|uEvusDwoSymbJJ0au...|2014-06-15 22:26:45|     6| 2014|  15|
|rDoT-MgxGRiYqCmi0...|               0|Very nice good se...|AY-laIws3S7YXNl_f...|2016-07-18 22:03:42|     7| 2016|  18|
|OHXnDV01gLokiX1EL...|               0|It's a small plac...|Ue_7yUlkEbX4AhnYd...|2014-06-06 01:10:34|     6| 2014|   6|
|GMrwDXRlAZU2zj5nH...|               0|8 sandwiches, $24...|LltbT_fUMqZ-ZJP-v...|2011-04-08 18:12:01|     4| 2011|   8|
+--------------------+----------------+-