In [None]:
# Casey Reyes & Joaquin Feria

In [3]:
# Copy the CSV files matching data/*.csv into the root of hdfs://nn:9000/.
# `-D dfs.replication=1` sets the replication factor to 1 for the copied files,
# and `-f` overwrites any copies that already exist so the cell can be re-run.
!hdfs dfs -D dfs.replication=1 -cp -f data/*.csv hdfs://nn:9000/

## Part 1: Filtering with RDDs, DataFrames, and Spark SQL

In [32]:
from pyspark.sql import SparkSession
# Configure the session one setting at a time, then create it (or reuse the
# one that already exists) with Hive support so saveAsTable() persists tables
# in the warehouse directory on HDFS.
builder = SparkSession.builder.appName("cs544")
builder = builder.master("spark://boss:7077")
builder = builder.config("spark.executor.memory", "512M")
builder = builder.config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
spark = builder.enableHiveSupport().getOrCreate()

In [33]:
#q1: how many banks contain the word "first" in their name, ignoring case? Use an RDD to answer.

# Load the bank cross-reference CSV. The first row is treated as the header
# and column types are inferred from the data.
banks_df = (spark.read.format("csv")
            .option("header", True)
            .option("inferSchema", True)
            .load("hdfs://nn:9000/arid2017_to_lei_xref_csv.csv"))
rdd = banks_df.rdd

# Look the name up by field instead of by position so the filter does not
# depend on column order, and skip rows whose name is null: calling .lower()
# on None would fail the job, while the DataFrame/SQL versions of this
# question silently exclude null names.
filtered_banks = rdd.filter(
    lambda row: row["respondent_name"] is not None
    and "first" in row["respondent_name"].lower()
)
filtered_banks.count()

                                                                                

525

In [34]:
#q2 how many banks contain the word "first" in their name, ignoring case? Use a DataFrame to answer.
from pyspark.sql.functions import expr, col, lower

# Lower-case the name column so the LIKE pattern matches regardless of case.
filtered_df = banks_df.filter(lower(col("respondent_name")).like("%first%"))
# NOTE: despite its name this is still a Spark DataFrame (nothing is converted
# to pandas); the name is kept so any later reference keeps working.
filtered_pandas_df = filtered_df.select("respondent_name")
filtered_pandas_df.count()

525

In [35]:
#q3 how many banks contain the word "first" in their name, ignoring case? Use Spark SQL to answer.

# Persist the banks as a table in the Hive warehouse; overwrite makes the
# cell safe to re-run.
banks_df.write.saveAsTable("banks", mode="overwrite")

# Register a single temp view that exposes respondent_name under the shorter
# column name `name`, which is what the SQL query below filters on.
banks_df.withColumnRenamed("respondent_name", "name").createOrReplaceTempView("names")
filtered_df = spark.sql("SELECT * FROM names WHERE LOWER(name) LIKE '%first%'")
filtered_df.count()

                                                                                

525

## Part 2: Hive Data Warehouse

In [36]:
# Load the loan applications; the first row is the header and column types
# are inferred from the data.
loans_df = (spark.read
            .format("csv")
            .option("header", True)
            .option("inferSchema", True)
            .load("hdfs://nn:9000/hdma-wi-2021.csv"))

# Persist the loans as a warehouse table split into 8 buckets on county_code.
(loans_df.write.format("csv")
            .bucketBy(8, 'county_code')
            .mode("overwrite")
            .saveAsTable('loans'))

# Register one temp view per lookup table, each backed by its OWN CSV.
# Previously every view name was bound to loans_df, so e.g. `counties` and
# `race` both returned the loans rows instead of the lookup data.
# NOTE(review): assumes each view name has a matching <name>.csv in the HDFS
# root (copied there from data/*.csv) -- confirm against the data directory.
views_list = ["ethnicity", "race", "sex", "states", "counties", "tracts", "action_taken", "denial_reason", "loan_type", "loan_purpose", "preapproval", "property_type"]
for view in views_list:
    (spark.read
          .format("csv")
          .option("header", True)
          .option("inferSchema", True)
          .load(f"hdfs://nn:9000/{view}.csv")
          .createOrReplaceTempView(view))

23/11/07 06:02:20 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `spark_catalog`.`default`.`loans` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


In [37]:
#q4 what tables are in our warehouse?

# Ask the catalog for every table and view, then map each name to the flag
# saying whether it is a temporary view (True) or a persisted table (False).
tables_df = spark.sql("SHOW TABLES")
table_list = tables_df.collect()
table_dict = dict((row.tableName, row.isTemporary) for row in table_list)
table_dict

{'banks': False,
 'loans': False,
 'action_taken': True,
 'codes': True,
 'counties': True,
 'denial_reason': True,
 'ethnicity': True,
 'loan_purpose': True,
 'loan_type': True,
 'names': True,
 'preapproval': True,
 'property_type': True,
 'race': True,
 'sex': True,
 'states': True,
 'tracts': True}

In [38]:
#q5 how many loan applications has the bank "University of Wisconsin Credit Union" received in 2020 in this dataset?

bank_name = "University of Wisconsin Credit Union"

# A loan belongs to a bank when the loan's LEI equals the bank's 2020 LEI.
lei_matches = loans_df["lei"] == banks_df["lei_2020"]
# Keep only the joined rows for the bank we are asked about.
is_target_bank = banks_df["respondent_name"] == bank_name

total_df = banks_df.join(loans_df, on=lei_matches, how="inner").where(is_target_bank)
total_df.count()

                                                                                

19739

In [40]:
#q6 what does .explain("formatted") tell us about how Spark executes Q5?

# Print the physical plan for the Q5 query in the "formatted" layout
# (a plan outline followed by per-node details).
total_df.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (7)
+- BroadcastHashJoin Inner BuildLeft (6)
   :- BroadcastExchange (3)
   :  +- Filter (2)
   :     +- Scan csv  (1)
   +- Filter (5)
      +- Scan csv  (4)


(1) Scan csv 
Output [5]: [respondent_name#6459, arid_2017#6460, lei_2018#6461, lei_2019#6462, lei_2020#6463]
Batched: false
Location: InMemoryFileIndex [hdfs://nn:9000/arid2017_to_lei_xref_csv.csv]
PushedFilters: [IsNotNull(respondent_name), EqualTo(respondent_name,University of Wisconsin Credit Union), IsNotNull(lei_2020)]
ReadSchema: struct<respondent_name:string,arid_2017:string,lei_2018:string,lei_2019:string,lei_2020:string>

(2) Filter
Input [5]: [respondent_name#6459, arid_2017#6460, lei_2018#6461, lei_2019#6462, lei_2020#6463]
Condition : ((isnotnull(respondent_name#6459) AND (respondent_name#6459 = University of Wisconsin Credit Union)) AND isnotnull(lei_2020#6463))

(3) BroadcastExchange
Input [5]: [respondent_name#6459, arid_2017#6460, lei_2018#6461, lei_2019#6462, lei_2020#6463