In [1]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import split, explode,to_date
import shutil
import gc

# Build the Spark session for this job, or reuse one that is already running.
# The driver-memory setting is only applied when this call actually starts the
# session: if one already exists it is reused and Spark warns that only runtime
# SQL configurations take effect.
spark = (
    SparkSession.builder
    .appName("refine_gkg_counts")
    .config("spark.driver.memory", "20g")
    .getOrCreate()
)
# Alternative memory settings that are currently disabled:
#   .config("spark.memory.offHeap.enabled", True)
#   .config("spark.executor.memory", "10g")
#   .config("spark.memory.offHeap.size", "8g")

# Root of the raw GKG counts CSV tree, and the directory the parquet dataset is written to.
base_dir = '/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts'
parquet_path = '/home/oscar/budasbi-repos/factored-datathon-2024-voyager/parquet/'

# Always start from an empty output directory: wipe any previous result, then
# (re)create the directory whether or not it existed before, so a first run and
# a re-run begin from the same state.
if os.path.exists(parquet_path):
    shutil.rmtree(parquet_path)
os.makedirs(parquet_path, exist_ok=True)


24/08/19 18:09:30 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Evaluate the session's SparkContext as the last expression of the cell so the
# notebook displays it (a quick check that the session is up).
spark.sparkContext

In [3]:
def cleaned_to_parquet(csv_filepath):
    """Clean one GKG counts file and append it to the parquet dataset.

    The file is read as tab-separated text with a header row and inferred
    column types. Its columns are then renamed positionally, the ``date``
    column is parsed from ``yyyyMMdd`` into a date, the comma-separated
    ``cameo_event_ids`` value is expanded into one row per id, exact duplicate
    rows are removed, and the result is appended to ``parquet_path`` as
    snappy-compressed parquet partitioned by ``date``.

    Args:
        csv_filepath: Path of the file to load; handed to ``spark.read.csv``.

    Returns:
        None. The function prints ``csv_filepath`` and writes to ``parquet_path``.
    """
    gkg_columns = [
        'date', 'numarts', 'count_type', 'number', 'object_type',
        'geo_type', 'geo_fullname', 'geo_country_code', 'geo_adm1_code',
        'geo_lat', 'geo_long', 'geo_feature_id', 'cameo_event_ids',
        'sources', 'source_urls',
    ]

    # NOTE(review): the schema is inferred separately for every file while all
    # files are appended to the same dataset; if two files infer different
    # types for a column the partitions may disagree — consider an explicit schema.
    raw = spark.read.csv(csv_filepath, header=True, sep='\t', inferSchema=True)

    # Positional rename: the file must contain exactly these 15 columns, in this order.
    renamed = raw.toDF(*gkg_columns)

    # Cast to string explicitly before parsing so the result does not depend on
    # the type that schema inference picked for the yyyyMMdd value.
    dated = renamed.withColumn(
        'date', to_date(renamed['date'].cast('string'), 'yyyyMMdd')
    )

    # One output row per CAMEO event id.
    # NOTE(review): explode() emits no row when the id list is null, so records
    # without event ids are dropped here; switch to explode_outer() if those
    # records should be kept — confirm the intended behavior.
    exploded = dated.withColumn(
        'cameo_event_ids', explode(split(dated['cameo_event_ids'], ','))
    )

    deduplicated = exploded.dropDuplicates()

    print(csv_filepath)

    # The write is the only action in this function. None of the DataFrames
    # above is cached or persisted, so there is nothing to unpersist.
    (
        deduplicated.write
        .mode('append')
        .format('parquet')
        .option('compression', 'snappy')
        .partitionBy('date')
        .save(parquet_path)
    )

    # Presumably meant to release driver-side references to the intermediate
    # DataFrames between files — NOTE(review): confirm this is still needed.
    gc.collect()

In [4]:
# Collect the full path of every ".csv" file found anywhere under base_dir,
# in the order os.walk yields them.
gkg_files = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(base_dir)
    for filename in filenames
    if filename.endswith(".csv")
]


In [5]:
if gkg_files:
    # Convert every discovered file, not just the first one; each call appends
    # its rows to the shared parquet dataset.
    for csv_file in gkg_files:
        cleaned_to_parquet(csv_file)
else:
    print("We didn't find any CSV file")


/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=05/20240805.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=15/20240815.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=14/20240814.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=10/20240810.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=11/20240811.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=07/20240807.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=12/20240812.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=02/20240802.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=16/20240816.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=06/20240806.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=03/20240803.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=08/20240808.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=09/20240809.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=17/20240817.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=04/20240804.gkgcounts.csv


                                                                                

/home/oscar/budasbi-repos/factored-datathon-2024-voyager/bucket_contents/raw/gkg_counts/year=2024/month=08/day=13/20240813.gkgcounts.csv


                                                                                