In [18]:
import os

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

print("Setup complete.")

Setup complete.


In [19]:
# Pieces of the raw CSV location: directory, file stem and extension.
path = os.path.relpath("./raw")
file = "raw_reduced_7_years"
ext = "csv"

raw_csv_path = os.path.join(path, f"{file}.{ext}")
# Request datetime parsing for both columns; later cells use the `.dt` accessor on them.
raw_data = pd.read_csv(raw_csv_path, parse_dates = ["Date", "Time"])
print("Raw data loaded.")

Raw data loaded.


In [3]:
# Derive the two calendar features used for grouping: day of the year from
# "Date" and hour of the day from "Time". No explicit pd.to_datetime call is
# made here; the `.dt` accessor needs both columns to be datetime already.
incident_dates = raw_data["Date"]
incident_times = raw_data["Time"]
raw_data["day_of_year"] = incident_dates.dt.day_of_year
raw_data["hour"] = incident_times.dt.hour

In [4]:
# Columns that are not needed downstream; `drop` returns a new frame, so
# `raw_data` itself keeps them.
columns_to_drop = ["Incident_Number", "Report_Location", "Address", "Type"]
data = raw_data.drop(labels = columns_to_drop, axis = "columns")

In [5]:
# Strategy for missing values:
#   "all"          - drop every row that contains a missing value
#   "impute"       - fill the affected columns with IterativeImputer
#   "mean replace" - fill missing values with the column means
action = "impute"

if action == "all":
    # Rebind instead of mutating in place so re-running the cell is harmless.
    data = data.dropna()
elif action == "impute":
    has_missing = data.isna().any()
    columns_to_impute = list(has_missing[has_missing].index)
    # Guard: when nothing is missing (e.g. the cell is run a second time) the
    # imputer would otherwise be given an empty column selection.
    if columns_to_impute:
        imp = IterativeImputer(max_iter = 20, random_state = 26)
        imp.fit(data[columns_to_impute])
        # Reuse data's index: column assignment aligns on index labels, so a
        # fresh 0..n-1 index would misalign rows whenever data's index differs.
        new_data = pd.DataFrame(
            imp.transform(data[columns_to_impute]),
            columns = columns_to_impute,
            index = data.index,
        )
        # Overwrites the columns directly; the previous `data.drop(...)` call
        # discarded its result and had no effect, so it was removed.
        data[columns_to_impute] = new_data
elif action == "mean replace":
    # numeric_only keeps the Date/Time columns out of the mean regardless of
    # the pandas version's default.
    data_filled = data.fillna(data.mean(numeric_only = True))
    # Later cells read `data`, so publish the filled frame under that name too
    # (`data_filled` is kept for anything that may still refer to it).
    data = data_filled
else:
    print("Select specific action.")
	

In [6]:
def origin_haversine(coord: tuple, degrees = True) -> float:
    """
    Haversine (great-circle) distance between `(0, 0)` and the point `coord`.

    Parameters
    ----------
    coord:
        The point as `(latitude, longitude)`, expressed in degrees or radians.
    degrees:
        If true, `coord` is taken to be in degrees and converted to radians
        before the distance is computed.

    Returns
    -------
    float:
        The distance in kilometres, based on an Earth radius of 6371 km.
    """
    earth_radius_km = 6371

    latitude, longitude = coord
    if degrees:
        latitude = np.radians(latitude)
        longitude = np.radians(longitude)

    # The reference point is (0, 0), so the latitude/longitude differences are
    # the coordinates themselves and the cos(0) factor of the general formula
    # is 1.
    half_chord_sq = np.sin(latitude / 2) ** 2 + np.cos(latitude) * np.sin(longitude / 2) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(half_chord_sq))

# Switch for replacing the two coordinate columns with a single
# distance-from-(0, 0) feature.
HAVERSINE_FEATURE = False

if HAVERSINE_FEATURE:
    coordinate_pairs = zip(data.Latitude, data.Longitude)
    # One distance per row; each pair is passed as (latitude, longitude).
    data["latlong_combined"] = list(map(origin_haversine, coordinate_pairs))
    # `data` keeps the original coordinates; only the copy loses them.
    data_haversine = data.drop(columns = ["Latitude", "Longitude"])

In [7]:
def create_main_data(data: pd.DataFrame, date: str, time_groups: list) -> list:
    """
    Splits the dataframe `data` by year using the date column `date`. Within each year, groups by
    the specified time groups (eg: `day_of_year` and `hour`), counts the rows per group and
    averages the latitudes and longitudes.

    Parameters
    ----------
    data:
        The data to be processed. Must contain the columns `date`, `Latitude`, `Longitude` and
        every column named in `time_groups`.

    date:
        Column name of the date column in `data` (must be a datetime column).

    time_groups:
        Column names to group by within each year.

    Returns
    -------
    list:
        A list of dataframes, one per distinct year in order of first appearance. Each has the
        `time_groups` columns followed by `calls` (row count), `Latitude` and `Longitude` (means),
        sorted by `time_groups`.
    """
    # Extract the year once instead of once per distinct year.
    years = data[date].dt.year
    unique_years = years.unique()
    # Kept from the original: reports which years were found.
    print(unique_years)

    main_frames = []
    for year in unique_years:
        year_frame = data[years == year]
        # Named aggregation yields flat column names directly, so no MultiIndex
        # renaming or level dropping is needed afterwards.
        calls_per_group = (
            year_frame.groupby(time_groups)
            .agg(
                calls = (date, "count"),
                Latitude = ("Latitude", "mean"),
                Longitude = ("Longitude", "mean"),
            )
            .reset_index()
            .sort_values(time_groups)
        )
        main_frames.append(calls_per_group)
    return main_frames

In [8]:
# One aggregated frame per year, grouped by day of the year and hour of the day.
main_frames = create_main_data(
    data = data,
    date = "Date",
    time_groups = ["day_of_year", "hour"],
)

[2015 2016 2017 2018 2019 2020 2021 2022]


In [10]:
# All yearly frames except the last are stacked into the training set (with a
# fresh 0..n-1 index); the last frame in the list is held out as the test set.
training_frames = main_frames[:-1]
final_data_mean = pd.concat(training_frames, ignore_index = True)
final_data_test_mean = main_frames[-1]

In [24]:
# Write the train and test frames as CSV files (without the index column).
path_to_save = os.path.relpath("./train-test")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(path_to_save, exist_ok = True)

final_data_mean.to_csv(os.path.join(path_to_save, "data_yearly_hourly_train.csv"), index = False)
final_data_test_mean.to_csv(os.path.join(path_to_save, "data_yearly_hourly_test.csv"), index = False)

In [23]:
# Number of distinct values in the "calls" column of the training frame.
np.unique(final_data_mean["calls"]).size

46

In [26]:
# Display the combined training frame (the rendered output is abridged to its
# first and last rows).
final_data_mean

Unnamed: 0,day_of_year,hour,calls,Latitude,Longitude
0,33,12,19,47.602161,-122.331446
1,33,13,20,47.614256,-122.341655
2,33,14,15,47.640749,-122.328970
3,33,15,9,47.620303,-122.331749
4,33,16,15,47.619608,-122.323381
...,...,...,...,...,...
60465,365,19,11,47.602736,-122.327494
60466,365,20,14,47.617627,-122.331832
60467,365,21,10,47.636225,-122.337357
60468,365,22,19,47.595027,-122.340481
