In [313]:
import os

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

print("Setup complete.")

Setup complete.


In [314]:
# Location and name of the raw incident export, relative to the notebook's working directory.
path = os.path.abspath("../raw")
file = "raw_reduced_7_years"
ext = "csv"

# `infer_datetime_format` is intentionally not passed: it is deprecated in pandas 2.x and
# never had any effect here because no `parse_dates` columns were requested. The "Date" and
# "Time" columns are therefore loaded as plain strings and converted explicitly afterwards.
raw_data = pd.read_csv(os.path.join(path, ".".join([file, ext])))
print("Raw data loaded.")

Raw data loaded.


In [315]:
# Show the freshly loaded frame to eyeball its columns and a sample of its values.
raw_data

Unnamed: 0,Address,Type,Date,Time,Latitude,Longitude,Report_Location,Incident_Number
0,3513 Rainier Av S,Aid Response,2015-02-02,12:24:27,47.571349,-122.292809,POINT (-122.292809 47.571349),F150012179
1,4811 California Av Sw,Aid Response Yellow,2015-02-02,12:26:59,47.559223,-122.386833,POINT (-122.386833 47.559223),F150012180
2,9999 Holman Rd Nw,Aid Response,2015-02-02,12:31:22,47.701698,-122.362125,POINT (-122.362125 47.701698),F150012183
3,2237 Nw 57th St,Aid Response,2015-02-02,12:32:16,47.670184,-122.385260,POINT (-122.38526 47.670184),F150012184
4,5804 52nd Av S,Aid Response Yellow,2015-02-02,12:35:42,47.550352,-122.267264,POINT (-122.267264 47.550352),F150012188
...,...,...,...,...,...,...,...,...
709947,12740 30th Ave Ne,Medic Response,2022-02-02,01:54:00,47.721329,-122.296339,POINT (-122.296339 47.721329),F220013627
709948,10th Ave / E Pike St,Aid Response,2022-02-02,01:59:00,47.614085,-122.319447,POINT (-122.319447 47.614085),F220013628
709949,1612 Ferry Ave Sw,Automatic Fire Alarm Resd,2022-02-02,01:59:00,47.588415,-122.382743,POINT (-122.382743 47.588415),F220013629
709950,2233 Nw 58th St,Auto Fire Alarm,2022-02-02,02:03:00,47.670913,-122.385203,POINT (-122.385203 47.670913),F220013630


In [316]:
# Parse the textual "Date" and "Time" columns once, then derive the calendar
# features (day of the year, hour of the day) that are later used for grouping.
# `raw_data` is updated in place because the following cells read these columns from it.
call_dates = pd.to_datetime(raw_data["Date"])
call_times = pd.to_datetime(raw_data["Time"])

raw_data["Date"] = call_dates
raw_data["day_of_year"] = call_dates.dt.day_of_year
raw_data["hour"] = call_times.dt.hour

In [317]:
# Remove the columns that are not used from here on; "Time" has already been
# reduced to the "hour" feature. `raw_data` itself is left untouched.
columns_to_drop = [
    "Time",
    "Incident_Number",
    "Report_Location",
    "Address",
    "Type",
]
data = raw_data.drop(columns_to_drop, axis = 1)

In [318]:
# Strategy for handling missing values in `data`:
#   "all"          - drop every row that contains at least one missing value;
#   "impute"       - fill the columns that contain missing values with IterativeImputer;
#   "mean replace" - fill missing numeric values with the mean of their column.
action = "impute"

if action == "all":
    data = data.dropna()
elif action == "impute":
    # Only the columns that actually contain missing values are passed to the imputer.
    columns_to_impute = data.columns[data.isna().any()].tolist()
    # Guard: fitting the imputer on zero columns would raise, and there is nothing to fill.
    if columns_to_impute:
        imp = IterativeImputer(max_iter = 20, random_state = 26)
        imp.fit(data[columns_to_impute])
        # Reuse the index of `data` so the imputed values are written back to the rows
        # they were computed from, whatever index `data` carries.
        new_data = pd.DataFrame(
            imp.transform(data[columns_to_impute]),
            columns = columns_to_impute,
            index = data.index,
        )
        data[columns_to_impute] = new_data
elif action == "mean replace":
    # `numeric_only` keeps the datetime column out of the mean computation.
    # The result is stored back into `data` so the following cells actually use it;
    # `data_filled` is kept as an alias for any code that refers to that name.
    data_filled = data.fillna(data.mean(numeric_only = True))
    data = data_filled
else:
    print("Select specific action.")
	

In [319]:
def origin_haversine(coord: tuple, degrees = True) -> float:
    """
    Calculates the Haversine (great-circle) distance between the point
    `(latitude, longitude)` and the origin `(0, 0)`.

    Parameters
    ----------
    coord:
        The coordinates specified as `(latitude, longitude)`, either in degrees or radians.
    degrees:
        If true, the coordinates are treated as degrees and converted to radians first.

    Returns
    -------
    float:
        The distance in kilometres, computed with an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    latitude, longitude = coord

    # The trigonometric functions below work in radians.
    if degrees:
        latitude = np.radians(latitude)
        longitude = np.radians(longitude)

    # Haversine formula specialised to a start point of (0, 0).
    latitude_term = np.sin(latitude / 2) ** 2
    longitude_term = np.cos(latitude) * np.sin(longitude / 2) ** 2
    haversine = latitude_term + longitude_term

    return 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

# Switch: when enabled, latitude and longitude are collapsed into a single
# distance-from-origin feature and a copy of the data without the two
# coordinate columns is created.
HAVERSINE_FEATURE = False

if HAVERSINE_FEATURE:
    coordinate_pairs = zip(data["Latitude"], data["Longitude"])
    data["latlong_combined"] = list(map(origin_haversine, coordinate_pairs))
    data_haversine = data.drop(["Latitude", "Longitude"], axis = 1)

In [320]:
def create_main_data(data: pd.DataFrame, date: str, time_groups: list):
    """
    Splits the dataframe `data` by calendar year using the date column `date`. Within each
    year the rows are grouped by `time_groups` (eg: `day_of_year` and `hour`); every group
    yields the number of rows (`calls`) and the mean `Latitude` and `Longitude`.

    The years found are printed and the frames are returned in ascending year order, so the
    last frame always belongs to the most recent year. Rows whose date is missing belong to
    no year and are left out.

    Parameters
    ----------
    data:
        The data to be processed. Must contain the column named by `date` (datetime dtype),
        the columns listed in `time_groups`, and `Latitude` and `Longitude`.

    date:
        Column name of the date column in `data`.

    time_groups:
        Column names to group by within each year.

    Returns
    -------
    list:
        A list of dataframes, one per year, each with the columns of `time_groups`
        followed by `calls`, `Latitude` and `Longitude`, sorted by `time_groups`.
    """
    year_of_row = data[date].dt.year
    # Missing dates give a NaN year; dropping them avoids an empty frame for "year NaN".
    # Sorting makes the order chronological regardless of the row order of `data`.
    years = np.sort(year_of_row.dropna().astype(int).unique())
    print(years)

    main_frames = list()
    for year in years:
        year_frame = data[year_of_row == year]
        # Named aggregation produces flat, final column names directly.
        aggregated = (
            year_frame.groupby(time_groups)
            .agg(
                calls = (date, "count"),
                Latitude = ("Latitude", "mean"),
                Longitude = ("Longitude", "mean"),
            )
            .reset_index()
            .sort_values(time_groups)
        )
        main_frames.append(aggregated)
    return main_frames

In [321]:
# One aggregated frame per calendar year, grouped by day of the year and hour.
main_frames = create_main_data(
    data = data,
    date = "Date",
    time_groups = ["day_of_year", "hour"],
)

[2015 2016 2017 2018 2019 2020 2021 2022]


In [322]:
# The last yearly frame is held out as the test set; all earlier ones are
# stacked into a single training frame with a fresh index.
*train_frames, final_data_test_mean = main_frames
final_data_mean = pd.concat(train_frames, ignore_index = True)

In [323]:
# Write the training and test frames as CSV files without the index column.
path_to_save = os.path.abspath("../train-test")
# Create the output directory if it is missing; `to_csv` does not create directories.
os.makedirs(path_to_save, exist_ok = True)

final_data_mean.to_csv(os.path.join(path_to_save, "final_data_mean.csv"), index = False)
final_data_test_mean.to_csv(os.path.join(path_to_save, "final_data_mean_test.csv"), index = False)