## NYC_Parking_Violations_IBM_Capstone
### Notebook 2

In [1]:
import os
# Environment PySpark reads at start-up: the interpreter for workers and the
# driver, and the JDK location.
# NOTE(review): JAVA_HOME is an absolute, machine-specific path and is set
# unconditionally; adjust it when running on another machine.
os.environ.update({
    "PYSPARK_PYTHON": "python3",
    "PYSPARK_DRIVER_PYTHON": "python3",
    "JAVA_HOME": "/Library/java/JavaVirtualMachines/adoptopenjdk-8.jdk/contents/Home/",
})

In [2]:
import findspark
# Runs before the pyspark imports in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# `pyspark` importable - confirm against the findspark documentation.
findspark.init()

In [3]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import (
    dayofmonth, dayofyear, month, year, weekofyear, datediff, to_date, col)

In [4]:
# Obtain the Spark session (created on first use) for the 'NYC_Parking' app;
# every later cell reads data through this handle.
spark = (
    SparkSession.builder
    .appName('NYC_Parking')
    .getOrCreate()
)

In [5]:
# Load the raw violations CSV from the working directory. The first row is
# treated as the header and Spark infers the column types.
df = spark.read.csv(
    'Open_Parking_and_Camera_Violations.csv',
    header=True,
    inferSchema=True,
)

## Feature Engineering

In [6]:
# Columns removed from the raw data by ignore_columns().
ignore_list = [
    'Plate',
    'Summons Number',
    'Violation Time',
    'Judgment Entry Date',
    'Interest Amount',
    'Violation Status',
    'Summons Image',
]


def ignore_columns(df, columns=None):
    """Drop unwanted columns, then discard every row that still has a null.

    Args:
        df: Spark DataFrame to trim.
        columns: Iterable of column names to drop. When omitted, the
            notebook-level ``ignore_list`` is used.

    Returns:
        A new DataFrame without the given columns and without any row that
        contains a null in one of the remaining columns.
    """
    if columns is None:
        columns = ignore_list

    # Columns are dropped first so that nulls in the ignored columns do not
    # cause otherwise complete rows to be discarded by na.drop().
    df_new = df.drop(*columns).na.drop()

    return df_new


def rename_columns(df):
    """Replace the space in two column names with an underscore.

    'License Type' becomes 'License_Type' and 'Issuing Agency' becomes
    'Issuing_Agency'. Returns the renamed DataFrame.
    """
    renames = (
        ('License Type', 'License_Type'),
        ('Issuing Agency', 'Issuing_Agency'),
    )

    renamed = df
    for old_name, new_name in renames:
        renamed = renamed.withColumnRenamed(old_name, new_name)

    return renamed


# Default whitelist per column, in the order the filters are applied.
# Per the original docstring: top 12 states, top 4 violation categories,
# top 10 counties, top 10 license types and top 5 issuing agencies.
_DEFAULT_FILTERS = {
    'State': ("NY", "NJ", "PA", "FL", "CT", "IN",
              "MA", "VA", "NC", "MD", "TX", "GA"),
    'Violation': ("NO PARKING-STREET CLEANING",
                  "PHTO SCHOOL ZN SPEED VIOLATION",
                  "FAIL TO DSPLY MUNI METER RECPT",
                  "NO STANDING-DAY/TIME LIMITS"),
    'County': ("NY", "K", "Q", "BX", "BK", "QN", "ST", "R", "MN", "QUEEN"),
    'License_Type': ("PAS", "COM", "OMT", "OMS", "SRF",
                     "999", "APP", "IRP", "MOT", "TRC"),
    'Issuing_Agency': ("TRAFFIC", "DEPARTMENT OF TRANSPORTATION",
                       "DEPARTMENT OF SANITATION", "POLICE DEPARTMENT",
                       "OTHER/UNKNOWN AGENCIES"),
}


def filter_data(df, allowed_values=None):
    """Keep only the rows whose categorical columns hold whitelisted values.

    By default this selects the top 12 states, top 4 violation categories,
    top 10 counties, top 10 license types, and top 5 issuing agencies.

    Args:
        df: Spark DataFrame that already has the 'License_Type' and
            'Issuing_Agency' column names (see rename_columns).
        allowed_values: Optional mapping of column name to an iterable of
            permitted values (a list or tuple, not a bare string). When
            omitted, the defaults described above are used.

    Returns:
        A new DataFrame containing only rows that satisfy every whitelist.
    """
    criteria = _DEFAULT_FILTERS if allowed_values is None else allowed_values

    df_new = df
    for column, values in criteria.items():
        # Column.isin avoids hand-built SQL text, so there is no quoting or
        # line-continuation whitespace to get wrong.
        df_new = df_new.filter(df_new[column].isin(*values))

    return df_new


def add_dates(df):
    """Parse 'Issue Date' and derive Year, Month and Day columns from it.

    The 'Issue Date' column is overwritten with the result of
    to_date(..., "MM/dd/yyyy"); the three calendar-part columns are then
    computed from that parsed column. Returns the extended DataFrame.
    """
    parsed = df.withColumn("Issue Date", to_date("Issue Date", "MM/dd/yyyy"))
    issue_date = parsed['Issue Date']

    with_year = parsed.withColumn('Year', year(issue_date))
    with_month = with_year.withColumn('Month', month(issue_date))
    with_day = with_month.withColumn('Day', dayofmonth(issue_date))

    return with_day


def take_sample(df, fraction=0.05, seed=None):
    """Take a random sample (without replacement) of the data.

    Args:
        df: Spark DataFrame to sample from.
        fraction: Approximate fraction of rows to keep. Coerced to float
            because DataFrame.sample rejects an int fraction.
        seed: Optional integer seed. Pass a fixed value to obtain the same
            sample on every run; the default (None) lets Spark pick one, so
            the sample differs between runs.

    Returns:
        A new DataFrame holding the sampled rows.
    """
    # Keyword arguments keep `fraction` from being mistaken for the
    # positional `withReplacement` parameter of DataFrame.sample.
    df_new = df.sample(fraction=float(fraction), seed=seed)

    return df_new


def clean_dates(df, max_year=2020):
    """Remove rows with implausible dates and drop the 'Issue Date' column.

    A row is kept only when Year <= max_year, Month <= 12 and Day <= 31.

    Args:
        df: Spark DataFrame with 'Year', 'Month', 'Day' and 'Issue Date'
            columns (as produced by add_dates).
        max_year: Latest year considered valid. Defaults to 2020, the value
            originally hard-coded here.

    Returns:
        A new DataFrame with the offending rows and 'Issue Date' removed.
    """
    # NOTE(review): rows whose date could not be parsed presumably have null
    # Year/Month/Day and are filtered out as well, since a comparison with
    # null is not true - confirm.
    valid_date = (
        (df['Year'] <= max_year)
        & (df['Month'] <= 12)
        & (df['Day'] <= 31)
    )
    df_new = df.filter(valid_date).drop('Issue Date')

    return df_new

def preprocess_data(df):
    """Run the raw DataFrame through every preprocessing stage, in order.

    Stages:
        1. ignore_columns - remove selected columns.
        2. rename_columns - rename columns.
        3. filter_data    - filter data of interest.
        4. add_dates      - add dates.
        5. take_sample    - take a sample from the full dataset.
        6. clean_dates    - clean dates.

    Returns the output of the final stage.
    """
    stages = (
        ignore_columns,
        rename_columns,
        filter_data,
        add_dates,
        take_sample,
        clean_dates,
    )

    df_sampled = df
    for stage in stages:
        # Each stage takes the previous stage's DataFrame and returns a new one.
        df_sampled = stage(df_sampled)

    return df_sampled

In [7]:
# Run the preprocessing pipeline defined above on the raw DataFrame.
df_preprocessed = preprocess_data(df)

In [8]:
# Convert the preprocessed Spark DataFrame into a pandas DataFrame.
# NOTE(review): toPandas() presumably collects every row onto the driver, so
# this relies on the sampling step keeping the data small - confirm.
df_pd = df_preprocessed.toPandas()

In [9]:
# Preview the first rows of the pandas DataFrame.
df_pd.head()

Unnamed: 0,State,License_Type,Violation,Fine Amount,Penalty Amount,Reduction Amount,Payment Amount,Amount Due,Precinct,County,Issuing_Agency,Year,Month,Day
0,NY,OMT,NO PARKING-STREET CLEANING,45.0,60.0,0.0,0.0,116.39,114,Q,TRAFFIC,2018,7,18
1,NY,PAS,FAIL TO DSPLY MUNI METER RECPT,35.0,60.0,0.0,0.0,110.73,61,K,TRAFFIC,2017,11,14
2,NY,PAS,PHTO SCHOOL ZN SPEED VIOLATION,50.0,25.0,0.0,83.03,0.0,0,BK,DEPARTMENT OF TRANSPORTATION,2018,5,11
3,PA,PAS,NO PARKING-STREET CLEANING,45.0,10.0,0.0,0.0,55.0,68,K,TRAFFIC,2018,2,9
4,TX,PAS,NO STANDING-DAY/TIME LIMITS,115.0,10.0,0.0,0.0,125.0,69,K,TRAFFIC,2018,3,23


In [10]:
# Persist the sampled pandas DataFrame (without its index column) so that
# downstream analytics processes can load it.
df_pd.to_csv('nyc_sampled.csv', index=False)