In [1]:
from pyspark.sql.session import SparkSession
import pandas as pd

path = "/opt/conda/miniconda3/lib/python3.8/site-packages/irsx/CSV/index_2021.csv"
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
df21 = pd.read_csv(path, index_col=False, dtype=str) # read all as string, not beautiful but we only need object id anyways
df21.head()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID
0,17606342,EFILE,452772761,201906,1/21/2021 10:02:51 AM,CAMDENS CHARTER SCHOOL NETWORK INC,990,93493065013010,202010659349301301
1,17606343,EFILE,237061115,201906,1/21/2021 10:02:51 AM,JACKSON STATE UNIVERSITY DEVELOPMENT FOUNDATIO...,990,93493072000410,202010729349300041
2,17606347,EFILE,344427516,201904,1/21/2021 10:02:52 AM,TIFFIN UNIVERSITY,990,93493072000210,202010729349300021
3,17606350,EFILE,840865247,201912,1/21/2021 10:03:29 AM,NETWORK MINISTRIES INC,990,93493072008360,202010729349300836
4,17606351,EFILE,205647589,201912,1/21/2021 10:03:29 AM,FAMILY PROMISE OF BEAUFORT COUNTY,990,93493066001350,202000669349300135


In [4]:
from pyspark.sql.types import StringType
sdf = spark.createDataFrame(df21["OBJECT_ID"], StringType())

In [3]:
from irsx.xmlrunner import XMLRunner
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
from pyspark.sql.functions import udf

xml_runner = XMLRunner()
def transform_data(e):
    filing = xml_runner.run_filing(e)
    schedules = filing.list_schedules()
    
    ein = 0
    state = 0
    name = 0
    revenue = 0
    revenueEZ = 0
    
    if "ReturnHeader990x" in schedules:
        header = filing.get_parsed_sked("ReturnHeader990x")
        header_part_i = header[0]["schedule_parts"]["returnheader990x_part_i"]
        ein = header_part_i["ein"]
        try:
            state = header_part_i["USAddrss_SttAbbrvtnCd"]
        except KeyError:
            state = XX
        name = header_part_i["BsnssNm_BsnssNmLn1Txt"]
        
    if "IRS990EZ" in schedules:
        irs990ez = filing.get_parsed_sked("IRS990EZ")
        irs990ez_part_i = irs990ez[0]["schedule_parts"]["ez_part_i"]
        revenueEZ = irs990ez_part_i["TtlRvnAmt"]        
    
    if "IRS990" in schedules:
        irs990 = filing.get_parsed_sked("IRS990")
        irs990_part_i = irs990[0]["schedule_parts"]["part_i"]
        revenue = irs990_part_i["CYTtlRvnAmt"]
    
    revenue = int(revenue) + int(revenueEZ)
    return [e, ein, state, name, revenue]
     
    
my_schema = StructType([
    StructField("ObjectID", StringType(), nullable=False),
    StructField("EIN", StringType(), nullable=False),
    StructField("State", StringType(), nullable=False),
    StructField("Name", StringType(), nullable=False),
    StructField("Revenue", IntegerType(), nullable=False),
])

spark_transform_data = udf(lambda z: transform_data(z), my_schema)
spark.udf.register("spark_transform_data", spark_transform_data)

<function __main__.<lambda>(z)>

In [5]:
small_sdf = sdf.sample(0.0002).repartition(10) #get 72 entries
small_sdf2 = small_sdf.withColumn('valuelist', spark_transform_data('value')).select("valuelist.*")
small_sdf2.show()
#TODO write somewhere measure time

22/01/14 11:18:02 WARN org.apache.spark.scheduler.TaskSetManager: Stage 0 contains a task of very large size (10391 KiB). The maximum recommended task size is 1000 KiB.

+------------------+---------+-----+--------------------+--------+
|          ObjectID|      EIN|State|                Name| Revenue|
+------------------+---------+-----+--------------------+--------+
|202120119349100127|566036060|   NV|EMMA MILLER CRUTE...|       0|
|202021969349305247|850365466|   NM|FARMINGTON PUBLIC...|   55568|
|202023049349300237|237108470|   MN|SOUTHWEST MINNESO...| 4232311|
|202021959349100107|471578294|   IA|CENTRAL STATES ST...|       0|
|202013189349308441|262465531|   WA|NEW FAMILY TRADIT...|  573523|
|202012659349300621|952232881|   CA|    MATURANGO MUSEUM|  381985|
|202042869349301609|251740106|   PA|TURKEYTOWN SOUTH ...|  349390|
|202003159349300515|510209843|   DE|THE MINISTRY OF C...|11446827|
|202032869349301538|352550450|   IN|ALPHA XI DELTA FR...|  381964|
|202043219349208074|592737710|   TN|GOLD COAST PROMOT...|   85793|
|202033429349300618|741690314|   CA|PINE MOUNTAIN CLU...| 5558885|
|202041969349202244|420672550|   IA|VETERANS OF FOREI...|   10

                                                                                

## Start a Sparkession

In [7]:
from pyspark.sql.session import SparkSession
from pyspark import SparkContext
sc = SparkContext(appName="BigDataIRS3") #TODO use getOrCreate
spark = SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=SparkByExamples.com, master=local[1]) created by getOrCreate at /tmp/ipykernel_2197/4270203480.py:5 

## Einlesen des Files  
Das File kann entweder lokal oder mittels hdfs eingelesen werden  
Falls das File lokal eingelesen wird muss es auf jedem Node vorhanden sein, deswegen empfiehlt sich die Verwendung von HDFS 

In [None]:
from pyspark.sql.session import SparkSession
spark2 = SparkSession.builder.appName("Test1").getOrCreate()

In [None]:
fs_path = "file:///revenue_2021_100.csv" #local_fs
hdfs_path = "hdfs://spark-jupyter-m/user/hdfs/spark_csv/revenue_2021_100.csv" # hdfs
df = spark2.read.csv(fs_path, header=True)

type(df), df.printSchema(), df.head(5)

## Simple Aggregation
Das DF kann entweder direkt aggregiert werden, oder vorher in ein RDD umgewandelt werden

In [None]:
!less spark_csv/revenue_2021_100.csv

In [None]:
# Aggregation als DF
grouped_df = df.groupby("State").sum("Revenue").sort("Sum(Revenue)", ascending = False)

# Aggregation als RDD
rdd = df.rdd.drop("ObjectID", "EIN", "Name", "RevenueEZ") # U
from operator import add
reduced_rdd = rdd.reduceByKey(add).sortBy(lambda x: x[1], ascending = False)
reduced_rdd.collect()

In [None]:
#from google.cloud import storage
#client = storage.Client()
# https://console.cloud.google.com/storage/browser/[bucket-id]/
#bucket = client.get_bucket('sparkbucket02')
# Then do other things...
#blob = bucket.get_blob("revenue_2021_100.csv") #('remote/path/to/file.txt')

#df = pd.read_csv(blob.download_as_string())
#df

In [None]:
#spark = SparkSession \
#    .builder \
#    .appName("Protob Conversion to Parquet") \
#    .config("spark.some.config.option", "some-value") \
#    .getOrCreate()\

#df = spark.read.csv('/home/hadoop/observations_temp.csv, header=True)

sudo su - hdfs  
hdfs dfsadmin -safemode leave  

hdfs dfs -mkdir spark_csv  
hdfs dfs -put /spark_csv/revenue_2021_100.csv spark_csv/revenue_2021_100.csv  
hdfs dfs -ls spark_csv  

https://stackoverflow.com/questions/42091575/pyspark-load-file-path-does-not-exist
https://stackoverflow.com/questions/33055403/how-to-navigate-directories-in-hadoop-hdfs
https://stackoverflow.com/questions/28213116/hadoop-copy-a-local-file-system-folder-to-hdfs
https://stackoverflow.com/questions/61197811/can-i-read-csv-files-from-google-storage-using-spark-in-more-than-one-executor
https://groups.google.com/g/cloud-dataproc-discuss/c/cubkWrjkk2g?pli=1
https://stackoverflow.com/questions/56448009/storing-source-file-in-google-dataproc-hdfs-vs-google-cloud-storagegoogle-bucke

hdfs dfs -put /spark_csv/revenue_2021_100.csv /user/root/spark_csv/revenue_2021_100.csv