In [18]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext # https://spark.apache.org/docs/1.6.1/sql-programming-guide.html
from os.path import join, abspath
import os

In [21]:
# Stop a session left over from a previous run of this notebook so the next
# cell can build a fresh one. On a brand-new kernel `spark` does not exist yet,
# so only call stop() when the name is already bound (avoids a NameError on
# Restart & Run All).
if "spark" in globals():
    spark.stop()

In [22]:
# Background reading for the settings below:
# https://stackoverflow.com/questions/21138751/spark-java-lang-outofmemoryerror-java-heap-space/22742982#22742982
# https://luminousmen.com/post/spark-partitions
# https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html

# Directory passed to Spark as its SQL warehouse location.
warehouse_location = abspath('../data/spark-warehouse')

# Build the configuration. The application name is set here (not only on the
# SparkSession builder) because the SparkContext is created first and its name
# cannot be changed afterwards.
conf = (SparkConf()
    .setAppName('Spark Practice')
    .set("spark.ui.port", "4041")
    .set('spark.sql.warehouse.dir', warehouse_location)
# Optional tuning knobs, currently disabled:
#    .set('spark.executor.memory', '6G')
#    .set('spark.driver.memory', '6G')
#    .set('spark.storage.memoryFraction', '.5')
#    .set('spark.driver.maxResultSize', '2G')
#    .set("spark.dynamicAllocation.enabled", "true")
#    .set("spark.executor.cores", "4")
#    .set("spark.dynamicAllocation.minExecutors","1")
#    .set("spark.dynamicAllocation.maxExecutors","5")
#    .set("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation","true")
       )

# Create the context, or reuse the active one. getOrCreate() keeps this cell
# re-runnable: constructing a second SparkContext directly raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create the session on top of the context above.
spark = (SparkSession.builder
    .appName('Spark Practice')
    .getOrCreate())

# Legacy entry point (see the Spark 1.6 guide linked in the import cell); kept
# so any code that still references `sqlContext` keeps working.
sqlContext = SQLContext(sc)

In [2]:
# Report the Spark version of the session and the Hadoop version reported by
# the JVM that backs the SparkContext.
print(f"Spark version = {spark.version}")
hadoop_version_info = sc._jvm.org.apache.hadoop.util.VersionInfo
print(f"Hadoop version = {hadoop_version_info.getVersion()}")


Spark version = 3.0.1
Hadoop version = 3.2.0


In [3]:
# List the databases currently known to the session catalog.
databases = spark.sql('SHOW DATABASES;')
databases.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [4]:
# Display every (key, value) pair of the running SparkContext's configuration.
active_conf = sc.getConf()
active_conf.getAll()

[('spark.app.id', 'local-1606608100151'),
 ('spark.driver.host', '6ef172d4d6e2'),
 ('spark.rdd.compress', 'True'),
 ('spark.sql.warehouse.dir', '/home/jovyan/cse451/spark-warehouse/test'),
 ('spark.ui.port', '4041'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '37455'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell')]

In [24]:
# Load every CSV of the IRS master file into one DataFrame.
# header    -> the first row holds the column names
# inferSchema -> let Spark derive the column types from the data, see
# https://stackoverflow.com/questions/56927329/spark-option-inferschema-vs-header-true/56933052
df_master = spark.read.csv(
    '../data/Open990/IRS_Master_File/*.csv',
    header=True,
    inferSchema=True,
)

# Rename all columns to their lowercase form.
df_master = df_master.toDF(*(name.lower() for name in df_master.columns))

In [25]:
# Contractor compensation snack set (header row, inferred column types).
df_contractor = spark.read.csv(
    '../data/Open990/Open990_Contractor_Compensation_Snack_Set_Public/Open990_Contractor_Compensation_Snack_Set_Public_2019-01-24.csv',
    header=True,
    inferSchema=True,
)

# Governance snack set (header row, inferred column types).
# Unlike the other loads, these two frames keep their original column names.
df_governance = spark.read.csv(
    '../data/Open990/Open990_Governance_Snack_Set_Public/Open990_Governance_Snack_Set_Public_2019-01-15.csv',
    header=True,
    inferSchema=True,
)



In [26]:
# Executive compensation for charities (header row, inferred column types).
df_executive_charity = spark.read.csv(
    '../data/Open990/Open990_SnackSet_Executive_Compensation/Compensation_charities_Open990.csv',
    header=True,
    inferSchema=True,
)

# Normalise column names: lowercase, with spaces and hyphens turned into "_".
df_executive_charity = df_executive_charity.toDF(
    *(
        name.lower().replace(' ', "_").replace('-', '_')
        for name in df_executive_charity.columns
    )
)

In [27]:
# Executive compensation for foundations (header row, inferred column types).
df_executive_foundations = spark.read.csv(
    '../data/Open990/Open990_SnackSet_Executive_Compensation/Compensation_foundations_Open990.csv',
    header=True,
    inferSchema=True,
)

# Normalise column names: lowercase, with spaces and hyphens turned into "_".
df_executive_foundations = df_executive_foundations.toDF(
    *(
        name.lower().replace(' ', "_").replace('-', '_')
        for name in df_executive_foundations.columns
    )
)

In [28]:
# Foundation grants (header row, inferred column types).
# comment="F" makes the CSV reader skip every line that starts with "F".
# NOTE(review): presumably this skips a non-data line in the file; confirm
# that no real record begins with "F", because such rows would be dropped too.
df_grants = spark.read.csv(
    '../data/Open990/Open990_SnackSet_Foundations_Grants/Grants.csv',
    header=True,
    inferSchema=True,
    comment="F",
)

# Normalise column names: lowercase, with spaces and hyphens turned into "_".
df_grants = df_grants.toDF(
    *(
        name.lower().replace(' ', "_").replace('-', '_')
        for name in df_grants.columns
    )
)

In [29]:
# Foundations list (header row, inferred column types).
# comment="F" makes the CSV reader skip every line that starts with "F".
# NOTE(review): confirm that no real record begins with "F".
# NOTE(review): `df_executive` is bound to the same, not-yet-renamed frame; it
# is not used in the visible cells and looks like a leftover - confirm before
# removing.
df_foundations = df_executive = spark.read.csv(
    '../data/Open990/Open990_SnackSet_Foundations_Grants/Foundations.csv',
    header=True,
    inferSchema=True,
    comment="F",
)

# Normalise column names: lowercase, with spaces and hyphens turned into "_".
df_foundations = df_foundations.toDF(
    *(
        name.lower().replace(' ', "_").replace('-', '_')
        for name in df_foundations.columns
    )
)

In [30]:
# Print the row count of every loaded DataFrame, one number per line, in the
# order listed here. Each count() triggers a full read of its source.
for frame in (
    df_foundations,
    df_executive_foundations,
    df_contractor,
    df_grants,
    df_governance,
    df_master,
    df_executive_charity,
):
    print(frame.count())

123581
234492
237448
913386
1453208
1757712
3990749


In [31]:
# Redistribute the four smaller frames (see the row counts printed earlier)
# into 20 partitions each.
df_foundations, df_executive_foundations, df_contractor, df_grants = (
    frame.repartition(20)
    for frame in (
        df_foundations,
        df_executive_foundations,
        df_contractor,
        df_grants,
    )
)

In [32]:
# Redistribute the three larger frames: 200 partitions for governance and
# master, 250 for executive_charity (the frame with the most rows).
df_governance, df_master, df_executive_charity = (
    frame.repartition(partition_count)
    for frame, partition_count in (
        (df_governance, 200),
        (df_master, 200),
        (df_executive_charity, 250),
    )
)

In [33]:
# Create the target database and make it the current one.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE DATABASE raises once
# the database is already registered in the session catalog.
spark.sql("CREATE DATABASE IF NOT EXISTS Open990")
spark.sql("USE Open990")

DataFrame[]

In [None]:
# df_executive_foundations.columns
# df_grants.show(3)

### Problems

_Docker Desktop's memory limit needs to be set higher than 4 GB._

In [16]:
# List the tables currently registered in the Open990 database.
open990_tables = spark.sql('SHOW TABLES IN Open990')
open990_tables.show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [34]:
# Persist every DataFrame as a table in the Open990 database.
#
# * The table name is qualified with the database so the tables cannot land in
#   another database when an earlier "USE Open990" was skipped or run out of
#   order.
# * mode("overwrite") replaces a table that is already registered instead of
#   raising, so the cell can be re-run within the same session.
#   NOTE(review): files left in the warehouse directory by a previous kernel
#   may still block table creation - confirm and clean the directory if needed.
tables_to_save = {
    "foundations": df_foundations,
    "executive_foundations": df_executive_foundations,
    "contractor": df_contractor,
    "grants": df_grants,
    "governance": df_governance,
    "master": df_master,
    "executive_charity": df_executive_charity,
}
for table_name, frame in tables_to_save.items():
    frame.write.mode("overwrite").saveAsTable(f"Open990.{table_name}")