In [2]:
import pandas as pd

# Read the cleaned table and tally how many rows share each
# (NAME, weekday, week_of_year, time_segment) combination.
calls_by_district_and_time_segment = pd.read_csv(
    '../data/clean/calls_by_district_and_time_segment.csv'
)
calls_by_district_and_time_segment.value_counts(
    subset=['NAME', 'weekday', 'week_of_year', 'time_segment']
)

NAME       weekday    week_of_year  time_segment
SOUTH END  Friday     18            AFTERNOON_3     26
SHAWNEE    Friday     50            AFTERNOON_3     20
SOUTH END  Friday     18            AFTERNOON_2     20
SHAWNEE    Friday     17            AFTERNOON_2     19
           Tuesday    51            MORNING_4       17
                                                    ..
SOUTH END  Sunday     32            MORNING_4        1
                                    MORNING_3        1
                                    MORNING_2        1
           Friday     38            MORNING_3        1
           Wednesday  53            MORNING_1        1
Name: count, Length: 10710, dtype: int64

Treating `week_of_year` as a plain number might be a problem: week 53 and week 1 are adjacent in time but far apart numerically.

We might need to put the year back into the data.

We could average the count over all the years, or check whether it has been steadily rising or falling over the years (or following some other pattern).

In [2]:
# Same table as before but with the `year` column retained, so each group
# is now a (NAME, weekday, week_of_year, time_segment, year) combination.
calls_by_district_and_time_segment_with_year = pd.read_csv(
    '../data/clean/calls_by_district_and_time_segment_with_year.csv'
)
calls_by_district_and_time_segment_with_year.value_counts(
    subset=['NAME', 'weekday', 'week_of_year', 'time_segment', 'year']
)

NAME        weekday    week_of_year  time_segment  year
SOUTH END   Friday     18            AFTERNOON_3   2018    12
FRIENDSHIP  Thursday   29            MORNING_4     2017     8
SOUTH END   Saturday   35            AFTERNOON_1   2024     8
            Friday     43            AFTERNOON_2   2024     8
FRIENDSHIP  Friday     37            AFTERNOON_3   2023     7
                                                           ..
ROUSS       Sunday     51            AFTERNOON_4   2020     1
                                                   2016     1
                                     AFTERNOON_3   2023     1
                                     AFTERNOON_2   2024     1
SOUTH END   Wednesday  53            MORNING_1     2020     1
Name: count, Length: 32400, dtype: int64

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Collapse the rows into one record per
# (NAME, weekday, week_of_year, time_segment, year) group; `call_count`
# is the number of rows that fell into that group.
counts = (
    calls_by_district_and_time_segment_with_year
    .value_counts(subset=['NAME', 'weekday', 'week_of_year', 'time_segment', 'year'])
    .reset_index(name='call_count')
)

# Expand the three categorical columns into indicator columns; the
# remaining columns are passed through untouched.
counts_encoded = pd.get_dummies(counts, columns=['NAME', 'weekday', 'time_segment'])

# Target is the per-group count; features are every other column.
y = counts_encoded['call_count']
X = counts_encoded.drop(columns='call_count')

# Hold out 20% for testing, then take 25% of the remaining 80% for
# validation (0.25 * 0.8 = 20% overall), leaving 60% for training.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, random_state=42, test_size=0.25)

# Ordinary least-squares baseline fitted on the training rows only.
model = LinearRegression().fit(X_train, y_train)

# Root-mean-squared error on each split, in train / validation / test order.
train_rmse, val_rmse, test_rmse = (
    np.sqrt(mean_squared_error(target, model.predict(features)))
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

for label, score in (
    ('Train RMSE:', train_rmse),
    ('Validation RMSE:', val_rmse),
    ('Test RMSE:', test_rmse),
):
    print(label, score)


Train RMSE: 0.6236700776423358
Validation RMSE: 0.6130973044386522
Test RMSE: 0.6029743568761035


In [33]:
df = calls_by_district_and_time_segment_with_year.copy()

# Keep only the rows for Friday, AFTERNOON_1, week 2 (all years, all NAMEs).
slot_mask = (
    (df['weekday'] == 'Friday')
    & (df['time_segment'] == 'AFTERNOON_1')
    & (df['week_of_year'] == 2)
)
filtered = df[slot_mask]

# Rows per year for that slot, ordered chronologically rather than by count.
rows_per_year = filtered['year'].value_counts()
call_counts = rows_per_year.sort_index()

print(call_counts)


year
2017    1
2018    1
2019    2
2020    5
2021    6
2023    4
2024    2
2025    6
Name: count, dtype: int64


In [None]:
df = calls_by_district_and_time_segment_with_year.copy()

# value_counts() gives the number of rows for each distinct time_segment;
# the mean of those tallies is the average number of rows per segment.
rows_per_segment = df['time_segment'].value_counts()
avg_calls = rows_per_segment.mean()

print("Average number of calls per time segment:", avg_calls)


In [36]:
# One record per (NAME, weekday, week_of_year, time_segment, year) group,
# with the number of rows in that group stored as `call_count`.
counts = (
    calls_by_district_and_time_segment_with_year
    .value_counts(subset=['NAME', 'weekday', 'week_of_year', 'time_segment', 'year'])
    .reset_index(name='call_count')
)

# Indicator-encoded copy of the grouped table (not used further in this cell).
counts_encoded = pd.get_dummies(counts, columns=['NAME', 'weekday', 'time_segment'])

# Mean count across the groups that appear in the data. Groups with no
# rows at all are absent from `counts`, so they do not pull this mean down.
avg_calls = counts['call_count'].mean()

print(avg_calls)

1.3173765432098765


Could the issue here be that there are no entries with a count of 0 for any station, weekday, week of year, time segment, and year combination that had no calls?

In [37]:
# Preview the first five grouped rows (value_counts orders them by count, largest first).
counts.head(n=5)



Unnamed: 0,NAME,weekday,week_of_year,time_segment,year,call_count
0,SOUTH END,Friday,18,AFTERNOON_3,2018,12
1,FRIENDSHIP,Thursday,29,MORNING_4,2017,8
2,SOUTH END,Saturday,35,AFTERNOON_1,2024,8
3,SOUTH END,Friday,43,AFTERNOON_2,2024,8
4,FRIENDSHIP,Friday,37,AFTERNOON_3,2023,7


In [39]:
df = calls_by_district_and_time_segment_with_year.copy()

# Pull the underlying rows for Friday, week 18, AFTERNOON_3, 2018 so the
# individual records behind that slot can be inspected.
in_slot = (
    df['weekday'].eq('Friday')
    & df['week_of_year'].eq(18)
    & df['time_segment'].eq('AFTERNOON_3')
    & df['year'].eq(2018)
)
filtered = df.loc[in_slot]

print(filtered)


                         CallDescription        NAME weekday  week_of_year  \
7017                             Z-Other   SOUTH END  Friday            18   
7023                             Z-Other   SOUTH END  Friday            18   
7030                             Z-Other  FRIENDSHIP  Friday            18   
7032                             Assault  FRIENDSHIP  Friday            18   
7033                         Sick Person   SOUTH END  Friday            18   
7034          Chest Pain (Non-Traumatic)   SOUTH END  Friday            18   
7035  Unconscious/Fainting/Near-Fainting   SOUTH END  Friday            18   
7040                   Breathing Problem   SOUTH END  Friday            18   
7143            Allergic Reaction/Stings   SOUTH END  Friday            18   
7232            Allergic Reaction/Stings   SOUTH END  Friday            18   
7245                   Breathing Problem   SOUTH END  Friday            18   
7264            Allergic Reaction/Stings   SOUTH END  Friday    

In [12]:
# df['week_sin'] = np.sin(2 * np.pi * df['week'] / 52)
# df['week_cos'] = np.cos(2 * np.pi * df['week'] / 52)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# One record per (NAME, weekday, week_of_year, time_segment, year) group,
# with the number of rows in that group stored as `call_count`.
counts = (
    calls_by_district_and_time_segment_with_year
    .value_counts(subset=['NAME', 'weekday', 'week_of_year', 'time_segment', 'year'])
    .reset_index(name='call_count')
)

# Expand the categorical columns into indicator columns.
counts_encoded = pd.get_dummies(counts, columns=['NAME', 'weekday', 'time_segment'])

# Map the week number onto a circle so the end of one year sits next to
# the start of the next. The period is 53 because week_of_year reaches 53
# in this data.
week_angle = 2 * np.pi * counts_encoded['week_of_year'] / 53.0
counts_encoded['week_of_year_sin'] = np.sin(week_angle)
counts_encoded['week_of_year_cos'] = np.cos(week_angle)

counts_encoded.head()




Unnamed: 0,week_of_year,year,call_count,NAME_FRIENDSHIP,NAME_ROUSS,NAME_SHAWNEE,NAME_SOUTH END,weekday_Friday,weekday_Monday,weekday_Saturday,...,time_segment_AFTERNOON_1,time_segment_AFTERNOON_2,time_segment_AFTERNOON_3,time_segment_AFTERNOON_4,time_segment_MORNING_1,time_segment_MORNING_2,time_segment_MORNING_3,time_segment_MORNING_4,week_of_year_sin,week_of_year_cos
0,18,2018,12,False,False,False,True,True,False,False,...,False,False,True,False,False,False,False,False,0.845596,-0.533823
1,29,2017,8,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,-0.292057,-0.956401
2,35,2024,8,False,False,False,True,False,False,True,...,True,False,False,False,False,False,False,False,-0.845596,-0.533823
3,43,2024,8,False,False,False,True,True,False,False,...,False,True,False,False,False,False,False,False,-0.92669,0.375828
4,37,2023,7,True,False,False,False,True,False,False,...,False,False,True,False,False,False,False,False,-0.947326,-0.32027


In [15]:

# Target is the per-group count. The raw week number is dropped from the
# features; this assumes counts_encoded already carries the
# week_of_year_sin / week_of_year_cos columns that stand in for it.
y = counts_encoded['call_count']
X = counts_encoded.drop(columns=['call_count', 'week_of_year'])

# Hold out 20% for testing, then take 25% of the remaining 80% for
# validation (0.25 * 0.8 = 20% overall), leaving 60% for training.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, random_state=42, test_size=0.25)

# Ordinary least-squares model fitted on the training rows only.
model = LinearRegression().fit(X_train, y_train)

# Root-mean-squared error on each split, in train / validation / test order.
train_rmse, val_rmse, test_rmse = (
    np.sqrt(mean_squared_error(target, model.predict(features)))
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Number of fitted coefficients.
print(len(model.coef_))

for label, score in (
    ('Train RMSE:', train_rmse),
    ('Validation RMSE:', val_rmse),
    ('Test RMSE:', test_rmse),
):
    print(label, score)

22
Train RMSE: 0.6236388328882084
Validation RMSE: 0.6129868314495341
Test RMSE: 0.6031189492035769
