In [1]:
#lots of imports
import pandas as pd
import numpy as np
from scipy import stats
from dateutil.parser import parse
import warnings, os, time, requests
from pathlib import Path

warnings.filterwarnings('ignore')
%matplotlib inline

## New York Crime Data Scraper.
* <b>Data Sources</b> -
    * The data for this analysis is taken from two sources namely - 
        - [NYPD Complaint Data Historic](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i)
           : This dataset includes all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD) from 2006 to the end of 2017 
        - [NYPD Complaint Data Year To Date](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243)
           : This dataset includes all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD) for all complete quarters so far from 2018 to 2019
        
    * Both datasets are published by the New York City Police Department (NYPD) and are publicly available via the Socrata Open Data API. More about the Socrata Open Data API can be found [here](https://dev.socrata.com/)
    

* <b>Note on limitations</b> - 
    1. Historic data has been collected from 2013 to 2017. For each year, six months of data (January to June) were collected.
    2. Year To Date data has been collected from 2018 to 2019. For each year, six months of data (January to June) were collected.
    3. The dates in the scraping utility have to be changed manually to fetch the data for each period.

### Set default configuration variables that can be used globally

1. `setDefaults()` : acts as setter function that initialises global variables which can be referenced throughout the notebook 
2. `isValid()` : runs validation checks on the request parameters before they are used to call the API  
3. `isValidAndSetDefaults()` : wrapper that runs `isValid()` and, when validation passes, calls `setDefaults()`

In [357]:
# Global configuration store shared by the notebook; every entry starts out as None.
vars_ = dict.fromkeys([
    'APP_TOKEN', 'LIMIT', 'DATASET', 'BASE_URL', 'URI_IDENTIFIERS',
    'SAVE_TO_DIRECTORY', 'START_DATE', 'END_DATE', 'RES_PER_RESPONSE',
])

def setDefaults(dataset, start_date, end_date, save_to_directory, results_per_response=10000, limit=100000):
    '''
    Setters for default configuration vars required in data collection process.
    Fills the global vars_ dictionary in place.

    @params:
    dataset := dataset identifier; 'qgea-i56i' selects the 'Historic' file ending,
               any other value selects 'YTD'
    start_date := start of the date range to scrape
    end_date := end of the date range to scrape
    save_to_directory := directory name prefix; stored as '<prefix>_<file ending>'
    results_per_response := number of records requested per API call
    limit := overall record limit, stored under 'LIMIT'

    @return: None. Prints a log line stating whether every entry of vars_ is set.
    '''
    # The app token may be supplied through the SOCRATA_APP_TOKEN environment variable so
    # that it does not have to live in the notebook; the original token stays as fallback.
    app_token = os.environ.get('SOCRATA_APP_TOKEN') or "jlyZazYtAq9hSQNf0Ow4pNySj"
    file_ending = 'Historic' if dataset == 'qgea-i56i' else 'YTD'

    # vars_ is only mutated (never rebound), so no ``global`` statement is needed.
    vars_.update({
        'APP_TOKEN': app_token,
        'LIMIT': limit,
        'DATASET': dataset,
        'BASE_URL': "https://data.cityofnewyork.us/resource/",
        'URI_IDENTIFIERS': ['5uac-w243', 'qgea-i56i'],
        'START_DATE': start_date,
        'END_DATE': end_date,
        'RES_PER_RESPONSE': results_per_response,
        'SAVE_FILE_ENDING': file_ending,
        'SAVE_TO_DIRECTORY': save_to_directory + '_' + file_ending,
    })

    if all(value is not None for value in vars_.values()):
        print("Log: Defaults successfully loaded.")
    else:
        print("Log: Some defaults weren't loaded correctly.")

def isValid(dataset, start_date, end_date, save_to_directory, results_per_response=10000, limit=100000):
    '''
    Performs validation checks on the input from user
    @params:
    dataset := dataset identifier of the form 'xxxx-xxxx'; must be '5uac-w243' or 'qgea-i56i'
    start_date := date string marking the start of the range to scrape
    end_date := date string marking the end of the range to scrape
    save_to_directory := directory name prefix for the csv files (must not be None)
    results_per_response := number of records per API request; None means "use the default"
    limit := overall record limit; None means "use the default"

    @return:
        -1 := a required value is missing or any check failed (the reason is printed)
         0 := every check passed, but results_per_response and/or limit was None
         1 := every check passed with explicit values
    '''
    def validDate(date): #util for date validation
        try:
            parse(date, fuzzy=True)
        except (ValueError, TypeError, OverflowError):
            # TypeError covers non-string input, OverflowError covers out-of-range numbers
            print("Error: Incorrect date.")
            return False
        print("Correct date")
        return True

    # Required parameters: a missing one is always an error.
    required = (
        (dataset, "Error: Invalid URI."),
        (start_date, "Error: Start date is empty."),
        (end_date, "Error: End date is empty."),
        (save_to_directory, "Error: Directory to write data is not present."),
    )
    for value, message in required:
        if value is None:
            print(message)
            return -1

    # Optional sizes: note the fallback but keep validating everything else, so that a
    # missing size can no longer hide an invalid dataset or date.
    used_defaults = False
    if results_per_response is None:
        print("Log: Results per response is empty. Setting default as 10000.")
        results_per_response = 10000
        used_defaults = True
    if limit is None:
        print("Log: Default limit is empty. Setting default as 100000.")
        limit = 100000
        used_defaults = True

    try:
        if int(results_per_response) <= 0:
            print("Error: Specifid results per response is not > 0.")
            return -1
    except (TypeError, ValueError):
        print("Error: Not an integer.")
        return -1

    try:
        if int(limit) <= 0:
            print("Error: Specified limit is not > 0.")
            return -1
    except (TypeError, ValueError):
        print("Error: Not an integer.")
        return -1

    # The identifier must consist of exactly two 4-character parts joined by '-'.
    parts = str(dataset).split('-')
    if len(parts) != 2 or len(parts[0]) != 4 or len(parts[1]) != 4:
        print("Error: Invalid URI.")
        return -1

    #Some cheating here... hardcoding values for validation checks
    if dataset not in {'5uac-w243', 'qgea-i56i'}: #check if its the right dataset_uri
        print("Error: Invalid Dataset URI.")
        return -1

    if not validDate(start_date): #start_date is invalid
        return -1

    if not validDate(end_date): #end_date is invalid
        return -1

    # Both target directories must already exist next to the notebook. They are built with
    # pathlib so the check works regardless of the platform's path separator.
    required_dirs = (
        (Path.cwd() / 'data_Historic', "Error: Path to save csv files for Historic data doesn't exist."),
        (Path.cwd() / 'data_YTD', "Error: Path to save csv files for Year To Date data doesn't exist."),
    )
    for directory, message in required_dirs:
        if not directory.exists():
            print(str(directory))
            print(message)
            return -1

    print("Log: Validations passed.")
    return 0 if used_defaults else 1

def isValidAndSetDefaults(dataset_uri, start_date, end_date, save_to_directory, results_per_response=10000, limit=100000):
    '''
    check if everything isvalid and defaults are loaded
    @return: boolean variable True or False
    @params:
    dataset_uri := uniform resource identifier associated with the data to scrape from the API
    start_date := date to start scraping the dataset from
    end_date := date to end scraping the dataset from
    save_to_directory := target location where the csvs are to be stored
    results_per_responses := number of records to scrape per API request
    limit := max limit on number of records to scrape globally
    '''
    # Forward the two size parameters as well, so they are validated before being stored.
    ret_value = isValid(dataset_uri, start_date, end_date, save_to_directory, results_per_response, limit)

    if ret_value == -1: #something isn't working correctly. Check logs
        print("Error: Unable to set defaults.")
        return False

    if ret_value in (0, 1):
        # 0 signals that a size parameter was None; only the missing one falls back to
        # its default, an explicitly supplied value is kept.
        if results_per_response is None:
            results_per_response = 10000
        if limit is None:
            limit = 100000
        # isValid accepts numeric strings; store real integers so later arithmetic works.
        setDefaults(dataset_uri, start_date, end_date, save_to_directory,
                    int(results_per_response), int(limit))
        return True

    print("Error: Unknown error.")
    return False

### Perform API Request and store the response in csv file

1. `getData()`: helper function to support the below operations  
    1.1 `buildURL()` : constructs URL from base URL and global vars_  
    1.2 `callAPI()` :  helper function that performs data fetch operation  
    1.3 `getDataByChunks()` : main function that builds the data by fetching it in chunks and dumping into the csv files  
2. `mergeCSV()`: merges the resultant csv files obtained from getData() to a single resultant csv file  

In [358]:
def getData():
    '''
    Fetch the configured dataset page by page and write every page to its own csv file.

    Reads BASE_URL, DATASET, START_DATE, END_DATE, RES_PER_RESPONSE, APP_TOKEN and
    SAVE_TO_DIRECTORY from the global vars_. Paging stops at the first empty page or the
    first response that is not HTTP 200. vars_['LIMIT'] is not consulted here.
    @return: None
    '''
    #------------------------#Helpers for the function getData----------------------#
    def buildURL():
        '''
        Builds complete URL to be used for scraping
        @return: returns complete url to use for scraping
        '''
        # NOTE: the lower date bound is inclusive (>=) and the upper bound exclusive (<).
        url = str(vars_['BASE_URL'] + vars_['DATASET']) + ".json?$where=cmplnt_fr_dt >='{}' and cmplnt_fr_dt < '{}'&$limit={}&$order=cmplnt_fr_dt".format(vars_['START_DATE'], vars_['END_DATE'], vars_['RES_PER_RESPONSE'])
        print(url)
        return url

    def callAPI(url):
        '''
        Performs API request to get data with appropriate token and receives response
        @params: complete URL
        @return: If response_code == 200 := parsed json of received response, else None
        '''
        header = {"X-App-Token": vars_['APP_TOKEN']} #pass the APP_Token as the header in the request
        # TLS certificate verification stays enabled (requests' default) and a timeout
        # prevents a stalled connection from hanging the notebook forever.
        res = requests.get(url, headers=header, timeout=120)
        print(f"HTTP response code: {res.status_code}")
        if res.status_code == 200: #check for status code
            return res.json() #parse the request.response to json
        return None

    def getDataByChunks():
        '''
        main function that performs data fetch operation in chunks as the data is very huge
        '''
        page_size = vars_['RES_PER_RESPONSE']
        # Build the target directory with pathlib so it works with any path separator.
        target_dir = Path.cwd() / vars_['SAVE_TO_DIRECTORY']
        # Prefer the top-level pandas.json_normalize; fall back to the older location.
        normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
        offset = 0

        while True:
            url = buildURL()
            url += "&$offset={}".format(offset) #Appending offset to set start of records in dataset

            print(f"Calling offset {offset} : {url}")
            results = callAPI(url)
            if results is None: #the response was not HTTP 200
                print("Log: No valid response received. Stopping data fetch.")
                break

            found = len(results)
            print("Results Found:", found)
            if found == 0: #an empty page marks the end of the data
                break

            df = normalize(results) #normalise the results from json to dataframe
            firstDate = df["cmplnt_fr_dt"].head(1)
            lastDate = df["cmplnt_fr_dt"].tail(1)
            print("First Date:", firstDate)
            print("Last Date:", lastDate)
            #complete filename to store each datachunk; the zero-padded offset keeps files in fetch order
            fileName = "NewYork_Crime_" + vars_['START_DATE'] + "_" + vars_['END_DATE'] + "_" + str(offset).zfill(20) + ".csv"
            print(f"Saving {found} records to fileName: {fileName}")
            df.to_csv(str(target_dir / fileName), index=False) #write the chunk to csv
            offset += page_size
            time.sleep(5) # to pause for 5 seconds before starting next round of data fetch operation for each datachunk
            print("--------------------------------------------------------------------------------------------------------")

    getDataByChunks()
    #-------------------------------------######------------------------------------#

def mergeCSV():
    '''
    Merge multiple csv's generated by getData() from the vars_['SAVE_TO_DIRECTORY'] into single csv.

    The merged file is written to the current working directory as
    "NYPD_Complaint_Data_<SAVE_FILE_ENDING>.csv".
    @return: None
    '''
    merge_header = "NYPD_Complaint_Data_" #string to use in naming csv filenames while performing writes
    # pathlib builds the directory portably and also provides glob(), so no separate
    # glob import is required.
    path = Path.cwd() / vars_['SAVE_TO_DIRECTORY']
    print(f"Current path - {path}")
    all_files = sorted(path.glob("*.csv")) #find all csv files in the path, in a stable order
    total_files = len(all_files)
    print(f"Total files - {total_files}") #check total files length and then loop over all the files in the csv

    if total_files == 0: #nothing to merge
        print("Error.")
        return

    print("Merge started.")
    concatenated = []
    for filename in all_files:
        concatenated.append(pd.read_csv(filename, index_col=None, header=0)) #parse the file and keep its df
        print(f"Log: loaded file- {filename} in dataframe") #logging for each filename
    frame = pd.concat(concatenated, axis=0, sort=True) #expensive operation for larger datasets

    #set frame header to be used as csv filename
    frame_header = merge_header + vars_['SAVE_FILE_ENDING'] + ".csv"
    print(frame_header) #logging for combined dataframe file in the end
    frame.to_csv(frame_header) #write the final concatenated df to csv (index kept as first column)

### Define entry-point helper function to call getData() on two API endpoints

1. `preprocessData()`: preprocesses data and returns two df respectively for historic and yearToDate  
    1.1 `preprocessHistoric()`:  preprocess data and returns df for historic data  
    1.2 `preprocessYearToDate()`:  preprocess data and returns df for yearToDate data  

<b>Note</b> - 

<b><i>To trigger this API call for 6 years, you have to manually change the dates in the `preprocessHistoric()` and `preprocessYearToDate()`</i></b>

* For `preprocessHistoric`, a sample setup for API request will be as:  

`def preprocessHistoric(dataURI):
        isValidAndSetDefaults(dataURI, <start_date_as_string>, <end_date_as_string>, "data") #This is the line that has to be modified
`
<hr></hr>

* For `preprocessYearToDate`, a sample setup for API request will be as:  

`def preprocessYearToDate(dataURI):
        isValidAndSetDefaults(dataURI, <start_date_as_string>, <end_date_as_string>, "data") #This is the line that has to be modified
`

<b><i>Also, empty directories named exactly `data_Historic` and `data_YTD` must exist in the same directory as this notebook for this code to run successfully. Without these directories, the user will not be able to send the API request.</i></b>

In [360]:
def preprocessData():
    '''
    Scrape both datasets, merge their csv chunks and load the two merged csv files.

    Here, I manually changed the dates on every run to get data for 6 months for 6 years from 2013 - 2019.
    If the configuration of a dataset cannot be validated, its fetch and merge steps are
    skipped; the merged csv files are still read afterwards.
    @return: two dataframes := historic and yearToDate
    '''
    def fetchAndMerge(is_configured):
        '''
        Run getData() and mergeCSV(), but only when the configuration was loaded successfully.
        '''
        if not is_configured:
            # Never scrape with stale or unset configuration values.
            print("Error: Skipping data fetch because the configuration could not be validated.")
            return
        print(f"Default configurations -\n {vars_}")
        getData() # -- comment this code out once its done scraping data
        mergeCSV() # -- comment this code out once a final merged csv is formed in the root folder

    def preprocessHistoric(dataURI):
        '''
        @params:dataURI == 'qgea-i56i'
        Here, the API base endpoint for historic data is https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i 
        '''
        if dataURI is None:
            print("Error: DataURI is missing.")
            return
        print("------------Pre-processing NYPD_Complaint_Data Historic records---------------")
        is_configured = isValidAndSetDefaults(dataURI, "2018-01-01", "2018-06-30", "data")
        fetchAndMerge(is_configured)

    def preprocessYearToDate(dataURI):
        '''
        @params:dataURI == '5uac-w243'
        Here, the API base endpoint for yearToDate data is https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243 
        '''
        if dataURI is None:
            print("Error: DataURI is missing.")
            return
        print("------------Pre-processing NYPD_Complaint_Data Year-To-Date records---------------")
        is_configured = isValidAndSetDefaults(dataURI, "2019-01-01", "2019-06-30", "data")
        fetchAndMerge(is_configured)

    #comment the next two lines after pulling data from source
    preprocessHistoric("qgea-i56i")
    preprocessYearToDate("5uac-w243")

    df_historic = pd.read_csv("NYPD_Complaint_Data_Historic.csv", index_col=0) #read historic
    df_ytd = pd.read_csv("NYPD_Complaint_Data_YTD.csv", index_col=0) #read ytd
    return df_historic, df_ytd
    
# Load the historic and year-to-date frames, then stack them into one combined frame.
dfH, dfY = preprocessData()
dfHplusY = pd.concat(objs=[dfH, dfY], sort=True)

------------Pre-processing NYPD_Complaint_Data Historic records---------------
Correct date
Correct date
Log: Validations passed.
Log: Defaults successfully loaded.
Default configurations -
 {'APP_TOKEN': 'jlyZazYtAq9hSQNf0Ow4pNySj', 'LIMIT': 100000, 'DATASET': 'qgea-i56i', 'BASE_URL': 'https://data.cityofnewyork.us/resource/', 'URI_IDENTIFIERS': ['5uac-w243', 'qgea-i56i'], 'SAVE_TO_DIRECTORY': 'data_Historic', 'START_DATE': '2018-01-01', 'END_DATE': '2018-01-30', 'RES_PER_RESPONSE': 10000, 'SAVE_FILE_ENDING': 'Historic'}
https://data.cityofnewyork.us/resource/qgea-i56i.json?$where=cmplnt_fr_dt >='2018-01-01' and cmplnt_fr_dt < '2018-01-30'&$limit=10000&$order=cmplnt_fr_dt
Calling offset 0 : https://data.cityofnewyork.us/resource/qgea-i56i.json?$where=cmplnt_fr_dt >='2018-01-01' and cmplnt_fr_dt < '2018-01-30'&$limit=10000&$order=cmplnt_fr_dt&$offset=0
HTTP response code: 200
Results Found: 10000
First Date: 0    2018-01-01T00:00:00.000
Name: cmplnt_fr_dt, dtype: object
Last Date: 9999

Note, the above cell produces a sample API Call for 2 datasets.
1. Historic dataset between date range => ("2018-01-01", "2018-01-30")  
2. Year to date dataset between date range => ("2019-01-01", "2019-01-30")

However, further analysis has been performed on the complete six-year dataset.

After scraping data of 6 years, following are the dataframe shapes obtained:
1. Historic data shape (dfH) = (1393340, 36)
2. Year To Date data shape (dfY) = (218610, 36)
3. Combined Data shape (dfHplusY) = (1611950, 36)