# A3 Section B - Working with DataFrames and SQL

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
# Seed NumPy's global random generator so the notebook is reproducible.
# np.random.seed has to be *called*: assigning to it (np.random.seed = 23)
# replaces the function with an integer, leaves the generator unseeded and
# breaks every later np.random.seed(...) call in this kernel.
rnd_seed = 23
np.random.seed(rnd_seed)

In [9]:
# Spark properties for this application, applied to the builder in listed order.
# Dynamic allocation is switched on with shuffle tracking, the external shuffle
# service is switched off, the application is capped at 8 cores, and the driver
# and block-manager ports are pinned to fixed values.
spark_settings = [
    ("spark.dynamicAllocation.enabled", True),
    ("spark.dynamicAllocation.shuffleTracking.enabled", True),
    ("spark.shuffle.service.enabled", False),
    ("spark.dynamicAllocation.executorIdleTimeout", "30s"),
    ("spark.cores.max", 8),
    ("spark.driver.port", 9999),
    ("spark.blockManager.port", 10005),
]

# Point the builder at the standalone master and name the application.
session_builder = SparkSession.builder.master("spark://192.168.2.156:7077")
session_builder = session_builder.appName("alexander_sundquist_A3_B")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark_session = session_builder.getOrCreate()

# Keep a handle on the underlying SparkContext and only log errors.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

In [5]:
# Build a SQLContext on top of the session's SparkContext; the bare name on the
# last line makes the notebook display the resulting object.
active_context = spark_session.sparkContext
sqlContext = SQLContext(active_context)
sqlContext



<pyspark.sql.context.SQLContext at 0x7f4e801fb820>

In [6]:
# Load the parking-citation CSV data from HDFS into a DataFrame.
# The reader is told that the files have a header row and is asked to infer
# column types; the resulting DataFrame is marked for caching so that later
# cells can reuse it.
citations_path = "hdfs://192.168.2.156:9000/data/los-angeles-parking-citations"
df = (
    sqlContext.read
    .options(header="true", inferSchema="true")
    .csv(citations_path)
    .cache()
)

                                                                                

In [8]:
# Preview the first 10 rows of the DataFrame as a text table.
df.show(n=10)

+-------------+--------------------+----------+--------+-----------+--------------+-----------------+----+----+----------+-----+------------------+-----+------+--------------+---------------------+-----------+---------+---------+------------------+-----------------+----------------------+
|Ticket number|          Issue Date|Issue time|Meter Id|Marked Time|RP State Plate|Plate Expiry Date| VIN|Make|Body Style|Color|          Location|Route|Agency|Violation code|Violation Description|Fine amount| Latitude|Longitude|Agency Description|Color Description|Body Style Description|
+-------------+--------------------+----------+--------+-----------+--------------+-----------------+----+----+----------+-----+------------------+-----+------+--------------+---------------------+-----------+---------+---------+------------------+-----------------+----------------------+
|   1103341116|2015-12-21T00:00:...|      1251|    NULL|       NULL|            CA|           200304|NULL|HOND|        PA|   GY|  

In [10]:
# Print the DataFrame schema: one line per column with its name, type and nullability.
# NOTE(review): the printed schema reports every column as string even though
# inferSchema="true" was passed when reading the CSV -- worth checking the input
# directory contents and casting numeric/date columns explicitly before analysis.
df.printSchema()

root
 |-- Ticket number: string (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Issue time: string (nullable = true)
 |-- Meter Id: string (nullable = true)
 |-- Marked Time: string (nullable = true)
 |-- RP State Plate: string (nullable = true)
 |-- Plate Expiry Date: string (nullable = true)
 |-- VIN: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Body Style: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Violation code: string (nullable = true)
 |-- Violation Description: string (nullable = true)
 |-- Fine amount: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Agency Description: string (nullable = true)
 |-- Color Description: string (nullable = true)
 |-- Body Style Description: string (nullable = true)



In [11]:
# Report the size of the dataset: number of rows first, then number of columns.
row_total = df.count()
print(f"Total Rows: {row_total}")

# For scale, an Excel worksheet holds at most 1,048,576 rows:
# https://support.office.com/en-us/article/excel-specifications-and-limits-1672b34d-7043-467e-8e27-269d656771c3

column_total = len(df.columns)
print(f"Total Cols: {column_total}")



Total Rows: 13079582
Total Cols: 22


                                                                                

In [12]:
# Show how many partitions the RDD underlying the DataFrame is split into.
df.rdd.getNumPartitions()

16