In [144]:
import os

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

print("Setup complete.")

Setup complete.


In [170]:
# Directory, base name and extension of the raw CSV file.
path = os.path.relpath("./raw")
file = "raw_reduced_7_years"
ext = "csv"

# `Date` and `Time` are requested as datetime columns at read time.
raw_data = pd.read_csv(
    os.path.join(path, f"{file}.{ext}"),
    parse_dates = ["Date", "Time"],
)
print("Raw data loaded.")

Raw data loaded.


In [147]:
# Make sure `Date` and `Time` are datetime columns, then derive the two calendar
# features (`day_of_year`, `hour`) that are later used as grouping keys.
raw_data["Date"] = pd.to_datetime(raw_data["Date"])
# The format carries a clock time only, so the parsed values get a placeholder
# date (rendered as 1900-01-01 in the frame preview); only the hour is used below.
raw_data["Time"] = pd.to_datetime(raw_data["Time"], format = "%H:%M:%S")
raw_data["day_of_year"] = raw_data["Date"].dt.day_of_year
raw_data["hour"] = raw_data["Time"].dt.hour 

In [148]:
# Columns that are not carried over into the working frame.
columns_to_drop = [
    "Incident_Number",
    "Report_Location",
    "Address",
    "Type",
]
# `drop` returns a new frame; `raw_data` itself is left untouched.
data = raw_data.drop(columns_to_drop, axis = "columns")

In [177]:
# Add a constant `calls` column set to 1 on every row.
# NOTE(review): presumably each row is one call, so the column can be summed — confirm.
data["calls"] = 1

In [182]:
# Display the working frame for a visual check (pandas truncates the rendering of long frames).
data

Unnamed: 0,Date,Time,Latitude,Longitude,day_of_year,hour,calls
0,2015-02-02,1900-01-01 12:24:27,47.571349,-122.292809,33,12,1
1,2015-02-02,1900-01-01 12:26:59,47.559223,-122.386833,33,12,1
2,2015-02-02,1900-01-01 12:31:22,47.701698,-122.362125,33,12,1
3,2015-02-02,1900-01-01 12:32:16,47.670184,-122.385260,33,12,1
4,2015-02-02,1900-01-01 12:35:42,47.550352,-122.267264,33,12,1
...,...,...,...,...,...,...,...
709947,2022-02-02,1900-01-01 01:54:00,47.721329,-122.296339,33,1,1
709948,2022-02-02,1900-01-01 01:59:00,47.614085,-122.319447,33,1,1
709949,2022-02-02,1900-01-01 01:59:00,47.588415,-122.382743,33,1,1
709950,2022-02-02,1900-01-01 02:03:00,47.670913,-122.385203,33,2,1


In [149]:
# Strategy for handling missing values in `data`:
#   "all"          - drop every row that contains a missing value
#   "impute"       - estimate missing values with IterativeImputer
#   "mean replace" - fill missing numeric values with the column mean
action = "impute"

if action == "all":
	data.dropna(inplace = True)
elif action == "impute":
	has_missing = data.isna().any()
	columns_to_impute = list(has_missing[has_missing].index)
	# Skip when the frame is already complete: the imputer cannot be fitted on zero columns.
	if columns_to_impute:
		imp = IterativeImputer(max_iter = 20, random_state = 26)
		imp.fit(data[columns_to_impute])
		# Reuse `data`'s index so the assignment below aligns row for row,
		# even when the index is not a default RangeIndex.
		new_data = pd.DataFrame(
			imp.transform(data[columns_to_impute]),
			columns = columns_to_impute,
			index = data.index,
		)
		# Assignment overwrites the existing columns, so no prior drop is needed.
		data[columns_to_impute] = new_data
elif action == "mean replace":
	# `numeric_only` keeps the datetime columns out of the mean computation.
	data_filled = data.fillna(data.mean(numeric_only = True))
	# Write the result back so the following cells work on the filled frame.
	data = data_filled
else:
	print("Select specific action.")
	

In [150]:
def origin_haversine(coord: tuple, degrees = True) -> float:
    """
    Computes the Haversine distance between the point `(latitude, longitude)`
    and the origin `(0, 0)`.

    Parameters
    ----------
    coord:
        The point as `(latitude, longitude)`, given in degrees or in radians.
    degrees:
        If true, the coordinates are treated as degrees and converted to
        radians before the distance is computed.

    Returns
    -------
    float:
        The distance in kilometres (Earth radius taken as 6371 km).
    """
    earth_radius_km = 6371

    latitude, longitude = coord
    if degrees:
        latitude = np.radians(latitude)
        longitude = np.radians(longitude)

    # With the second point fixed at (0, 0), the coordinate differences in the
    # haversine formula are the coordinates themselves and cos(0) = 1.
    half_lat_sine = np.sin(latitude / 2)
    half_lng_sine = np.sin(longitude / 2)
    hav = half_lat_sine ** 2 + np.cos(latitude) * half_lng_sine ** 2

    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

# Toggle: replace latitude/longitude by a single distance-from-(0, 0) feature.
HAVERSINE_FEATURE = False

if HAVERSINE_FEATURE:
    # `origin_haversine` is built from NumPy ufuncs, so both columns can be passed
    # as whole arrays instead of looping over the rows in Python.
    data["latlong_combined"] = origin_haversine((data.Latitude.to_numpy(), data.Longitude.to_numpy()))
    # Separate frame without the raw coordinates; `data` keeps them.
    data_haversine = data.drop(columns = ["Latitude", "Longitude"])

In [184]:
def create_main_data(data: pd.DataFrame, date: str, time_groups: list) -> list:
	"""
	Splits the dataframe `data` by calendar year using the date column `date`. Within each year, groups by the specified time groups (eg: `day_of_year` and `hour`), counts the rows and averages the latitudes and longitudes.

	Parameters
	----------
	data:
		The data to be processed. Must contain the columns `date`, `Latitude`, `Longitude` and every column named in `time_groups`.

	date:
		Column name of the date column in `data`. The column must have a datetime dtype.

	time_groups:
		Column names to group by within each year.

	Returns
	-------
	list:
		A list of dataframes, one per year, in ascending year order. Each frame has the `time_groups` columns followed by `calls` (number of rows with a non-null `date` in the group), `Latitude` and `Longitude` (group means), sorted by `time_groups`.
	"""
	year_of_row = data[date].dt.year
	# Ascending order regardless of how the rows are ordered: the list is sliced
	# by position afterwards (last frame = latest year). Rows without a date have
	# no year and are left out instead of producing an empty frame.
	years = year_of_row.dropna().sort_values().unique()
	print(years)
	main_frames = list()
	for y in years:
		# Named aggregation yields flat column names directly.
		d_temp = (
			data[year_of_row == y]
			.groupby(time_groups)
			.agg(
				calls = (date, "count"),
				Latitude = ("Latitude", "mean"),
				Longitude = ("Longitude", "mean"),
			)
			.reset_index()
			.sort_values(time_groups)
		)
		main_frames.append(d_temp)
	return main_frames

In [189]:
# Build the list of per-year frames, aggregated by day of year and hour.
main_frames = create_main_data(data, "Date", ["day_of_year", "hour"])

[2015 2016 2017 2018 2019 2020 2021 2022]


In [82]:
# Every frame except the last one is stacked into the training set; the last
# frame in the list is held out as the test set. The split is positional, so it
# relies on the order in which `create_main_data` returns the frames.
final_data_mean = pd.concat(main_frames[:-1], ignore_index = True)
final_data_test_mean = main_frames[-1]

In [20]:
# Write the train and test sets as CSV files without the index column.
path_to_save = os.path.relpath("./train-test")
# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs(path_to_save, exist_ok = True)
final_data_mean.to_csv(os.path.join(path_to_save, ".".join(["data_yearly_hourly_train", "csv"])), index = False)
final_data_test_mean.to_csv(os.path.join(path_to_save, ".".join(["data_yearly_hourly_test", "csv"])), index = False)

In [35]:
# Inspect the second frame in the list.
# NOTE(review): the saved output of this cell shows a `Time` column and one row
# per call, which `create_main_data` as defined in this notebook does not
# produce — the output looks stale; re-run to refresh.
main_frames[1]

Unnamed: 0,day_of_year,hour,Time,calls,Latitude,Longitude
0,1,0,00:05:37,1,47.553433,-122.387000
1,1,0,00:10:49,1,47.648106,-122.332616
2,1,0,00:18:36,1,47.618554,-122.343368
3,1,0,00:19:11,1,47.600883,-122.333530
4,1,0,00:24:02,1,47.540295,-122.286233
...,...,...,...,...,...,...
101825,366,23,23:44:26,1,47.544400,-122.369147
101826,366,23,23:46:23,1,47.621487,-122.347597
101827,366,23,23:55:46,1,47.550189,-122.291555
101828,366,23,23:55:56,1,47.595874,-122.311530
