In [1]:
#Download Index file
!irsx_index --year=2021 --verbose

Getting index file for year: 2021 remote=https://s3.amazonaws.com/irs-form-990/index_2021.csv local=/opt/conda/miniconda3/lib/python3.8/site-packages/irsx/CSV/index_2021.csv
Beginning streaming download of https://s3.amazonaws.com/irs-form-990/index_2021.csv
Total file size: 55.76 MB
Download completed to /opt/conda/miniconda3/lib/python3.8/site-packages/irsx/CSV/index_2021.csv in 0:00:07.584222


In [1]:
# To have time runtime for cells
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime

time: 422 µs (started: 2022-01-25 12:34:33 +00:00)


In [2]:
from pyspark.sql.session import SparkSession
import pandas as pd

path = "/opt/conda/miniconda3/lib/python3.8/site-packages/irsx/CSV/index_2021.csv"

df21 = pd.read_csv(path, index_col=False, dtype=str) # read all as string, not beautiful but we only need object id anyways
df21.head()
spark = SparkSession.builder.getOrCreate()

time: 2.83 s (started: 2022-01-25 12:34:37 +00:00)


In [3]:
from pyspark.sql.types import StringType
sdf = spark.createDataFrame(df21["OBJECT_ID"], StringType())

time: 3.22 s (started: 2022-01-25 12:35:13 +00:00)


In [4]:
from irsx.xmlrunner import XMLRunner
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
from pyspark.sql.functions import udf

xml_runner = XMLRunner()
def transform_data(e):
    try:
        filing = xml_runner.run_filing(e)
        schedules = filing.list_schedules()
    except:
        print(f"Transform error for id {e}")
        return ["","","","",0]
    
    ein = 0
    state = 0
    name = 0
    revenue = 0
    revenueEZ = 0
    
    if "ReturnHeader990x" in schedules:
        header = filing.get_parsed_sked("ReturnHeader990x")
        header_part_i = header[0]["schedule_parts"]["returnheader990x_part_i"]
        ein = header_part_i["ein"]
        state = header_part_i.get("USAddrss_SttAbbrvtnCd", "XX")
        name = header_part_i["BsnssNm_BsnssNmLn1Txt"]
        
    if "IRS990EZ" in schedules:
        irs990ez = filing.get_parsed_sked("IRS990EZ")
        irs990ez_part_i = irs990ez[0]["schedule_parts"].get("ez_part_i", None)
        if irs990ez_part_i:
            revenueEZ = irs990ez_part_i.get("TtlRvnAmt", 0)        
    
    if "IRS990" in schedules:
        irs990 = filing.get_parsed_sked("IRS990")
        irs990_part_i = irs990[0]["schedule_parts"]["part_i"]
        revenue = irs990_part_i["CYTtlRvnAmt"]
    
    revenue = int(revenue) + int(revenueEZ)
    return [e, ein, state, name, revenue]
     
    
my_schema = StructType([
    StructField("ObjectID", StringType(), nullable=False),
    StructField("EIN", StringType(), nullable=False),
    StructField("State", StringType(), nullable=False),
    StructField("Name", StringType(), nullable=False),
    StructField("Revenue", IntegerType(), nullable=False),
])

spark_transform_data = udf(lambda z: transform_data(z), my_schema)
spark.udf.register("spark_transform_data", spark_transform_data)

<function __main__.<lambda>(z)>

time: 518 ms (started: 2022-01-25 12:35:20 +00:00)


In [None]:
# Sample
small_sdf = sdf.sample(0.4, seed=43).repartition(10)
anz = small_sdf.count()
print(anz)
small_sdf2 = small_sdf.withColumn('valuelist', spark_transform_data('value')).select("valuelist.*")
#small_sdf2.show()
small_sdf.explain()
small_sdf2.toPandas().to_csv(f"BIGData/{anz}.csv", index=None)

22/01/25 12:37:15 WARN org.apache.spark.scheduler.TaskSetManager: Stage 6 contains a task of very large size (5208 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

184781
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(10), REPARTITION_WITH_NUM, [id=#154]
   +- Sample 0.0, 0.4, false, 43
      +- Scan ExistingRDD[value#0]




22/01/25 12:37:17 WARN org.apache.spark.scheduler.TaskSetManager: Stage 12 contains a task of very large size (5208 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

time: 1h 14min 7s (started: 2022-01-25 12:37:15 +00:00)


In [None]:
# full file
anz = sdf.count()
print(anz)
sdf2 = sdf.withColumn('valuelist', spark_transform_data('value')).select("valuelist.*")
sdf2.explain()
sdf2.toPandas().to_csv(f"hdfs://big-spark-cluster-m/user/root/{anz}.csv", index=False)

22/01/25 14:49:49 WARN org.apache.spark.scheduler.TaskSetManager: Stage 18 contains a task of very large size (5208 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

461887
== Physical Plan ==
*(2) Project [pythonUDF0#69.ObjectID AS ObjectID#59, pythonUDF0#69.EIN AS EIN#60, pythonUDF0#69.State AS State#61, pythonUDF0#69.Name AS Name#62, pythonUDF0#69.Revenue AS Revenue#63]
+- BatchEvalPython [<lambda>(value#0)], [pythonUDF0#69]
   +- *(1) Scan ExistingRDD[value#0]




22/01/25 14:49:51 WARN org.apache.spark.scheduler.TaskSetManager: Stage 21 contains a task of very large size (5208 KiB). The maximum recommended task size is 1000 KiB.

### Zeiten von erfolgreichen Läufen

|Anzahl | Zeit| Kommentar |  
-------|--------|---------------
|4710 | time: 2min 44s (started: 2022-01-17 18:46:15 +00:00) | erster Versuch|
|46099| time: 12min 22s (started: 2022-01-17 20:42:31 +00:00) | erster Versuch mit ErrorHandling|
|184781| time: 1h 14min 7s (started: 2022-01-25 12:37:15 +00:00) | - |

## Start a Sparkession

## Einlesen des Files  
Das File kann entweder lokal oder mittels hdfs eingelesen werden  
Falls das File lokal eingelesen wird muss es auf jedem Node vorhanden sein, deswegen empfiehlt sich die Verwendung von HDFS 

In [9]:
from pyspark.sql.session import SparkSession
spark2 = SparkSession.builder.appName("Test1").getOrCreate()

In [40]:
fs_path = "file:///revenue_2021_100.csv" #local_fs
hdfs_path = "hdfs://big-spark-cluster-m/user/root/46099.csv" # hdfs
df = spark2.read.csv(hdfs_path, header=True, inferSchema=True)

type(df), df.printSchema(), df.head(5)

                                                                                

root
 |-- _c0: integer (nullable = true)
 |-- ObjectID: long (nullable = true)
 |-- EIN: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Revenue: integer (nullable = true)



(pyspark.sql.dataframe.DataFrame,
 None,
 [Row(_c0=0, ObjectID=202023169349201962, EIN=113463183, State='NY', Name='77th PRECINT COMMUNITY COUNCIL INC', Revenue=3800),
  Row(_c0=1, ObjectID=202013049349200436, EIN=202347170, State='MD', Name='SOUTHERN MARYLAND FAST PITCH ORGANIZATION', Revenue=25259),
  Row(_c0=2, ObjectID=202120609349300112, EIN=475378165, State='CA', Name='OAKLAND WINE FESTIVAL AND FOUNDATION', Revenue=225),
  Row(_c0=3, ObjectID=202100489349300620, EIN=582003159, State='AR', Name='TOTAL LIFE COMMUNITY EDUC FOUNDATION', Revenue=363307),
  Row(_c0=4, ObjectID=202023189349306407, EIN=760536563, State='CT', Name='BOZRAH INTERNATIONAL MINISTRIES INC', Revenue=927063)])

## Simple Aggregation
Das DF kann entweder direkt aggregiert werden, oder vorher in ein RDD umgewandelt werden

In [1]:
!head BIGData/46099.csv

,ObjectID,EIN,State,Name,Revenue
0,202023169349201962,113463183,NY,77th PRECINT COMMUNITY COUNCIL INC,3800
1,202013049349200436,202347170,MD,SOUTHERN MARYLAND FAST PITCH ORGANIZATION,25259
2,202120609349300112,475378165,CA,OAKLAND WINE FESTIVAL AND FOUNDATION,225
3,202100489349300620,582003159,AR,TOTAL LIFE COMMUNITY EDUC FOUNDATION,363307
4,202023189349306407,760536563,CT,BOZRAH INTERNATIONAL MINISTRIES INC,927063
5,202043219349317174,822282576,OH,ONECITY FOR RECOVERY INC,51184
6,202013219349211076,470873877,LA,Grace Place Ministries Inc,164849
7,202110539349200126,870420899,UT,UTAH ACADEMY OF GENERAL DENTISTRY,12617
8,202033229349300123,450255772,ND,UNITED WAY OF GRAND FORKS EAST GRAND,565469


In [38]:
# Aggregation als RDD
two_col_df = df.drop("ObjectID", "EIN", "Name", "RevenueEZ", "_c0") #col _c0 may (not) exist so drop or not drop it
two_col_df.printSchema()
rdd = two_col_df.rdd

from operator import add
reduced_rdd = rdd.reduceByKey(add).sortBy(lambda x: x[1], ascending = False)
reduced_rdd.collect()

root
 |-- State: string (nullable = true)
 |-- Revenue: integer (nullable = true)



[('NY', 26282974544),
 ('CA', 22599757286),
 ('PA', 14021197075),
 ('MO', 9767605990),
 ('TX', 8408306891),
 ('FL', 6441797903),
 ('OH', 6364770882),
 ('TN', 6352786890),
 ('GA', 6155584013),
 ('MD', 6043965992),
 ('VA', 5411817304),
 ('NC', 5311445382),
 ('WA', 4838719241),
 ('IL', 4763859142),
 ('MN', 4430065258),
 ('IN', 4276565192),
 ('NJ', 3866619086),
 ('MA', 3838411645),
 ('DC', 3286803190),
 ('MI', 3224375440),
 ('KY', 3165665585),
 ('IA', 2725559896),
 ('NH', 2717014995),
 ('WI', 2610280298),
 ('AZ', 2317167743),
 ('OR', 2296702694),
 ('SD', 2251631584),
 ('NE', 2024525833),
 ('CO', 1986296862),
 ('ME', 1911403229),
 ('CT', 1671984299),
 ('LA', 1642874943),
 ('DE', 1548520157),
 ('MT', 1419515866),
 ('OK', 1255335798),
 ('XX', 1113835035),
 ('ID', 1033109314),
 ('SC', 985934691),
 ('KS', 956909342),
 ('AL', 816325971),
 ('NV', 575109238),
 ('RI', 569357888),
 ('VT', 518079533),
 ('UT', 499304361),
 ('ND', 488519360),
 ('WY', 453860239),
 ('MS', 438280172),
 ('NM', 434322177),
