## Predictive Analytics for 311 Service Request Resolution
---
Methodology:
- Data from over 45 cities was revied from Andew Friedman's 311 Dataset repository (**https://andrew-friedman.github.io/jkan/datasets/**).
- We decided to include the target cites in our study as they showed avalible data which contained a start data, end date, and department from 2014/1/1 through 2023/12/31.
- Target Cities: Baltimore, MD, Boston, MA, Buffalo, NY, Miami, FL, Oakland, CA, and Washington, DC
- Target Date Range: 2014/1/1 through 2023/12/31 (10 Years)
- Raw data from the target cities was downloaded and pre-processed using the clean_data function. While processing the function limits the file size to 1000mb as that is the GitHub file Size limit with our current plan.

Functions:
- split_csv(): Splits a CSV file into smaller chunks if it exceeds a given size in MB.
- clean_city_data(): Clean city data from CSV files, filtering by date range, and sorting by open date. If the file is larger than 100 MB, it is split into smaller chunks before processing.


In [8]:
# Import Libraries
import os
import pandas as pd
import numpy as np

# Split CSV Function
def split_csv(input_path, max_size_mb=1000):
    """
    Splits a CSV file into smaller chunks if it exceeds a given size in MB.

    Args:
        input_path (str): Path to the input CSV file.
        max_size_mb (int): Maximum allowed size in MB. Default is 1000 MB.

    Returns:
        List of paths to the split files.
    """
    # Query File Size & Convert to Megabites
    file_size_mb = os.path.getsize(input_path) / (1024 * 1024)

    # Set File Size Limit from Function Call
    if file_size_mb <= max_size_mb:
        return [input_path]

    # If file is larger than 100 MB, split it into smaller chunks
    df = pd.read_csv(input_path)
    chunk_size = int(np.ceil(len(df) * (max_size_mb / file_size_mb)))  # Calculate chunk size based on file size
    split_files = []
    base_name = os.path.splitext(input_path)[0]

    for i, chunk in enumerate(np.array_split(df, len(df) // chunk_size + 1)):
        split_file_path = f"{base_name}_part{i+1}.csv"
        chunk.to_csv(split_file_path, index=False)
        split_files.append(split_file_path)
        print(f"File '{input_path}' split into '{split_file_path}'")  # Print message when file is split

    return split_files

# Clean City Data Function
def clean_city_data(input_paths, output_path, city_names, date_open_col, date_closed_col, department_col):
    """
    Clean city data from CSV files, filtering by date range, and sorting by open date.
    If the file is larger than 100 MB, it is split into smaller chunks before processing.

    Args:
        input_paths (list of str): List of paths to the input CSV files.
        output_path (str): Path to save the cleaned CSV file.
        city_names (list of str): List of city names corresponding to input files.
        date_open_col (str): Name of the column containing open date.
        date_closed_col (str): Name of the column containing closed date.
        department_col (str): Name of the column containing department information.
    
    Returns:
        pandas.DataFrame: Merged and cleaned DataFrame.
    """
    # Initialize an empty DataFrame to store cleaned data from all parts
    cleaned_data = pd.DataFrame()

    for input_path, city_name in zip(input_paths, city_names):
        # Split the input file if it's larger than 1000 MB
        split_files = split_csv(input_path, max_size_mb=1000)

        for file in split_files:
            try:
                # Try loading data with default encoding
                try:
                    df = pd.read_csv(file)
                except UnicodeDecodeError:
                    # If default encoding fails, try loading with 'cp1252' encoding
                    df = pd.read_csv(file, encoding='cp1252')

                # Streamline Data Transformation
                df = df[[date_open_col, date_closed_col, department_col]]
                df.rename(columns={date_open_col: 'open_date', date_closed_col: 'close_date', department_col: 'department'}, inplace=True)
                df['open_date'] = pd.to_datetime(df['open_date'], errors='coerce') # coerce removes unconvertible data
                df['close_date'] = pd.to_datetime(df['close_date'], errors='coerce') # coerce removes unconvertible data

                # Handling Missing Data (Leaving Missing close dates to account for recent cases that are unclosed)
                df.dropna(subset=['open_date','department'], inplace=True)
                df['close_date'] = df['close_date'].replace('', pd.NaT)

                # Filter data by date range (2014/01/01 -> 2023/12/31)
                if "washington" or "miami" in input_path.lower():
                    # Filter data with timezone attached
                    df = df[(df['open_date'] >= pd.Timestamp("2014-01-01 23:59:59'").tz_localize(None).tz_localize('UTC')) & (df['open_date'] <= pd.Timestamp("2023-12-31 23:59:59'").tz_localize(None).tz_localize('UTC'))]
                    # Drop Timezone (+00:00)
                    df['open_date'] = pd.to_datetime(df['open_date']).dt.strftime('%Y-%m-%d %H:%M:%S')
                    df['close_date'] = pd.to_datetime(df['close_date']).dt.strftime('%Y-%m-%d %H:%M:%S')

                else:
                    # Non UTC
                    df = df[(df['open_date'] >= pd.Timestamp("2014-01-01 23:59:59'")) & (df['open_date'] <= pd.Timestamp("2023-12-31 23:59:59'"))]

                # Add constant column for city name
                df['city'] = city_name

                # Sort by open_date
                df.sort_values(by='open_date', inplace=True)

                # Append to cleaned_data DataFrame
                cleaned_data = pd.concat([cleaned_data, df])
            
            except Exception as e:
                print(f"An error occurred while processing {file}: {e}")

    # Save the final cleaned data to the specified output file
    cleaned_data.to_csv(output_path, index=False)
    
    # Show dataframe on output
    return cleaned_data

# These are the function calls run through the clean_city_data function. 
#clean_city_data([f'../data/raw/boston_{i}.csv' for i in range(2014,2020)],'../data/raw/boston_cleaned_1.csv',        'boston',        'open_dt',                  'closed_dt',               'subject')          
#clean_city_data([f'../data/raw/boston_{i}.csv' for i in range(2020,2024)],'../data/raw/boston_cleaned_2.csv',        'boston',        'open_dt',                  'closed_dt',               'subject')
#clean_city_data(['../data/raw/buffalo.csv'],       '../data/raw/buffalo_cleaned.csv',      'buffalo',       'Open Date',                'Closed Date',             'Subject') 
#clean_city_data(['../data/raw/oakland.csv'],      '../data/raw/oakland_cleaned.csv',       'oakland',       'DATETIMEINIT',             'DATETIMECLOSED',          'REQCATEGORY')
#clean_city_data(['../data/raw/washington_dc.csv'], '../data/raw/washington_dc_cleaned.csv', 'washington_dc', 'INITIATEDDATE',            'CLOSEDDATE',              'REQUESTCATEGORY')         
#clean_city_data([f'../data/new_miami/311_Service_Requests_-_Miami-Dade_County_-_{i}.csv' for i in range(2014,2019)],'../data/raw/miami_cleaned_1.csv','miami','ticket_created_date_time','ticket_closed_date_time','case_owner_description')
clean_city_data([f'../data/new_miami/311_Service_Requests_-_Miami-Dade_County_-_{i}.csv' for i in range(2019,2024)],'../data/raw/miami_cleaned_2.csv','miami','ticket_created_date_time','ticket_closed_date_time','case_owner_description')


Unnamed: 0,open_date,close_date,department,city
55,2019-01-01 06:23:06,,Enforcement Section,m
65,2019-01-01 08:02:18,,Enforcement Section,m
133,2019-01-01 14:31:24,,Enforcement Section,m
211,2019-01-01 15:47:28,,Enforcement Section,m
280,2019-01-01 16:07:25,,Enforcement Section,m
...,...,...,...,...
343664,2023-12-31 22:50:21,2024-01-19 14:21:41,Solid Waste Management,i
343665,2023-12-31 23:02:40,2024-01-10 14:18:19,Solid Waste Management,i
343666,2023-12-31 23:27:03,2024-01-30 21:18:17,Solid Waste Management,i
343667,2023-12-31 23:37:38,2024-01-19 18:29:21,Solid Waste Management,i
