In [1]:
# import dependencies
import json
import os
import csv
from pandas import DataFrame
import pandas as pd
import numpy as np
from datetime import datetime
import datetime as dt
import time
import shutil
import fnmatch
from pyproj import Proj, transform

In [2]:
# create working directories for your data 
def work_dir():
    working_directory = os.getcwd()
    folders = ["data/missed_lat_long_db_ready", "data/citations_db_ready", "data/api_responses/processed"]
    for folder in folders:
        try:
            os.makedirs(os.path.join(working_directory, folder))
        except OSError:
            print ("Creation of working directories failed/directory exists")
        else:
            print ("Successfully created "f"{folder}")
        
    print("========================================================")

work_dir()

Successfully created data/missed_lat_long_db_ready
Successfully created data/citations_db_ready
Creation of working directories failed/directory exists


In [2]:
# create a list of json files in api_resonses folder for processing
files=[]

def json_finder():
    
    for file in os.listdir('data/api_responses/'):
        if fnmatch.fnmatch(file, '*.json'):
            files.append(file)
    print(files)

json_finder()

['0000000001.json', '0000000002.json', '0000000003.json', '0000000004.json']


In [3]:
# iterate through selected json files and move processed files to another folder 
data_list = []

def json_loader():
    for f in files[0:4]:
        filename = 'data/api_responses/'+f
        with open(filename) as json_file:
            data = json.load(json_file)
            for s in data:
                data_list.append(s)
        shutil.move('data/api_responses/'+f, 'data/api_responses/processed/'+f)
        
    return print("Files moved to processed directory")
    
json_loader()

Files moved to processed directory


In [4]:
# load iteration results into dataframe for data transormation 
transformed_df = pd.DataFrame(data_list)
transformed_df

Unnamed: 0,agency,body_style,color,fine_amount,issue_date,issue_time,latitude,location,longitude,make,marked_time,meter_id,plate_expiry_date,route,rp_state_plate,ticket_number,vin,violation_code,violation_description
0,1,PA,GY,50,2015-12-21T00:00:00.000,1251,99999,13147 WELBY WAY,99999,HOND,,,200304,01521,CA,1103341116,,4000A1,NO EVIDENCE OF REG
1,1,VN,WH,50,2015-12-21T00:00:00.000,1435,99999,525 S MAIN ST,99999,GMC,,,201512,1C51,CA,1103700150,,4000A1,NO EVIDENCE OF REG
2,2,PA,BK,58,2015-12-21T00:00:00.000,2055,6439997.9,200 WORLD WAY,1802686.4,NISS,,,201503,2R2,CA,1104803000,,8939,WHITE CURB
3,2,PA,WH,,2015-12-26T00:00:00.000,1515,6440041.1,100 WORLD WAY,1802686.2,ACUR,,,,2F11,CA,1104820732,,000,17104h
4,1,PA,BK,93,2015-09-15T00:00:00.000,115,99999,GEORGIA ST/OLYMPIC,99999,CHEV,,,200316,1FB70,CA,1105461453,,8069A,NO STOPPING/STANDING
5,1,VN,GY,50,2015-09-15T00:00:00.000,19,99999,SAN PEDRO S/O BOYD,99999,CHEV,,,201507,1A35W,CA,1106226590,,4000A1,NO EVIDENCE OF REG
6,1,PA,BL,163,2015-12-17T00:00:00.000,1710,99999,SUNSET/ALVARADO,99999,MAZD,,,201605,00217,CA,1106500452,,8070,PARK IN GRID LOCK ZN
7,1,PA,BK,163,2015-12-17T00:00:00.000,1710,99999,SUNSET/ALVARADO,99999,TOYO,,,201602,00217,CA,1106500463,,8070,PARK IN GRID LOCK ZN
8,1,PA,BR,93,2015-12-22T00:00:00.000,945,99999,721 S WESTLAKE,99999,CHEV,,,201605,2A75,CA,1106506402,,8069AA,NO STOP/STAND AM
9,1,PA,SI,93,2015-12-22T00:00:00.000,1100,99999,1159 HUNTLEY DR,99999,NISS,,,201701,2A75,CA,1106506413,,8069AA,NO STOP/STAND AM


In [5]:
# preview the missing values and the % of missing values in each column
df = transformed_df
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns

missing_values_table(df)

Selected dataframe has 19 columns.
There are 15 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
vin,1983691,99.2
marked_time,1926673,96.3
meter_id,1446476,72.3
plate_expiry_date,174710,8.7
route,14545,0.7
make,2392,0.1
body_style,2297,0.1
fine_amount,2072,0.1
color,1177,0.1
issue_time,765,0.0


In [6]:
# drop unnessesary columns. 
# for the first 2 million records drop "vin" number too.
def drop_columns():
    transformed_df.drop(["vin", "issue_time", "marked_time", "plate_expiry_date", "rp_state_plate"], axis = 1, inplace = True)
    return print("Columns dropped")

drop_columns()

Columns dropped


In [7]:
def date_transformation():
    
    # transform "issue_date" to a date/time format
    transformed_df['issue_date'] = pd.to_datetime(transformed_df['issue_date'], dayfirst=True)
    
    return transformed_df

date_transformation().head()

Unnamed: 0,agency,body_style,color,fine_amount,issue_date,latitude,location,longitude,make,meter_id,route,ticket_number,violation_code,violation_description
0,1,PA,GY,50.0,2015-12-21,99999.0,13147 WELBY WAY,99999.0,HOND,,01521,1103341116,4000A1,NO EVIDENCE OF REG
1,1,VN,WH,50.0,2015-12-21,99999.0,525 S MAIN ST,99999.0,GMC,,1C51,1103700150,4000A1,NO EVIDENCE OF REG
2,2,PA,BK,58.0,2015-12-21,6439997.9,200 WORLD WAY,1802686.4,NISS,,2R2,1104803000,8939,WHITE CURB
3,2,PA,WH,,2015-12-26,6440041.1,100 WORLD WAY,1802686.2,ACUR,,2F11,1104820732,000,17104h
4,1,PA,BK,93.0,2015-09-15,99999.0,GEORGIA ST/OLYMPIC,99999.0,CHEV,,1FB70,1105461453,8069A,NO STOPPING/STANDING


In [8]:
# create unique number for csv file name based on the api_responses list
files_list = files[0:1]
z = [i.split('.', 1)[0] for i in files_list]
csv_number = z[0]

In [42]:
# create unique number for csv file name based on the api_responses list
files_list = files[0:1]
z = [i.split('.', 1)[0] for i in files_list]
csv_number = z[0]

def missing_lat_long_processing():
    
    # save latitude with 99999 values into a separate csv file for later processing    
    parking_citations_lat_long_na = transformed_df[transformed_df["latitude"]=="99999"]
    parking_citations_lat_long_na.to_csv(f"missed_lat_long_{csv_number}.csv", encoding='utf-8', index=False)
    print ("Missing latitude/longitude data extracted")
    print("========================================================")


    shutil.move(f"missed_lat_long_{csv_number}.csv", 'data/missed_lat_long_db_ready')
    print("CSV file "f"missed_lat_long_{csv_number}.csv moved")
    
missing_lat_long_processing()

Missing latitude/longitude data extracted
CSV file missed_lat_long_0000000017.csv moved


In [9]:
#
def lat_long_transformation():
    # drop parking tickets with latitude / longitude values of 99999.0
    db_ready_df = transformed_df[transformed_df["latitude"] != "99999"]
    
    # assign coordinate transformation engine
    pm = '+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 ' \
     '+y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs'

    # convert latitude and longitude to geographic coordinates
    x_in,y_in = db_ready_df['latitude'].values, db_ready_df['longitude'].values
    db_ready_df['longitude'],db_ready_df['latitude'] = transform(Proj(pm, preserve_units = True), Proj("+init=epsg:4326"), x_in,y_in)
    
    db_ready_df.to_csv(f"citations_{csv_number}.csv", encoding = 'utf-8', index = False)
    shutil.move(f"citations_{csv_number}.csv", 'data/citations_db_ready')
    
    return print("Coordinates conversion completed and csv file created")

lat_long_transformation()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Coordinates conversion completed and csv file created
