### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# 1. Prepare Time Series Dataset

In [None]:
# Read the incidents table from the CSV file and take a first look at it
dataset = pd.read_csv("data/final_dataset.csv")
dataset_shape = dataset.shape
print("Shape of dataset:", dataset_shape)
dataset.head()

## 1.1 Select cities

In [None]:
# Number of rows recorded for each year, to see which years are covered
incidents_per_year = dataset['year'].value_counts()
incidents_per_year

Keep only the incidents from the years 2014, 2015, 2016 and 2017, as required by the project assignment instructions.

In [None]:
# Keep only the years 2014-2017 (strictly after 2013 and strictly before 2018).
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so the column assignments performed in the following cells do
# not trigger pandas' SettingWithCopyWarning.
dataset = dataset[(dataset['year'] > 2013) & (dataset['year'] < 2018)].copy()

Inspecting the city names reveals that many cities appear under several different spellings, which makes the per-city value counts incorrect.

In [None]:
city_names = dataset['city_or_county']

# Dump every distinct city name, one per line, for manual inspection
with open('debugging/cities.txt', 'w') as names_file:
    names_file.writelines("%s\n" % name for name in city_names.unique())

# Dump the number of rows recorded for each city name
with open('debugging/city_counts.txt', 'w') as counts_file:
    counts_file.write(city_names.value_counts().to_string())

print('There are {} unique cities in the dataset'.format(len(city_names.unique())))
        

Remove the parenthesized text (county names or other extra information) from the city names.

In [None]:
# Strip everything from the first "(" to the last ")" in each city name
parenthesized = re.compile(r"\(.*\)")
dataset['city_or_county'] = dataset['city_or_county'].map(lambda name: parenthesized.sub("", name))
n_unique_cities = dataset['city_or_county'].unique().size
print('There are {} unique cities in the dataset'.format(n_unique_cities))

Sort cities alphabetically to see if there are still duplicates and how relevant they are.

In [None]:
# Alphabetically ordered distinct city names, written to a file one per line
cities = np.sort(dataset['city_or_county'].unique())
with open('debugging/cities2.txt', 'w') as sorted_file:
    sorted_file.writelines("%s\n" % name for name in cities)

We can see that many city names differ only by a trailing space. To avoid such mismatches, we remove all spaces (including internal ones) from the city names.

In [None]:
# Drop every space character (leading, trailing and internal) from the city names
dataset['city_or_county'] = dataset['city_or_county'].map(lambda name: "".join(name.split(" ")))
n_unique_cities = dataset['city_or_county'].unique().size
print('There are {} unique cities in the dataset'.format(n_unique_cities))

In [None]:
# Upper-case every name so that spellings differing only by letter case coincide
uppercased = dataset['city_or_county'].str.upper()
dataset['city_or_county'] = uppercased
print('There are {} unique cities in the dataset'.format(uppercased.unique().size))

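We introduce a `week` attribute and keep only the cities whose number of weeks with at least one incident is greater than 15% of the total number of weeks in the 4 years. (Note: the code below currently sets `dropping_threshold = 0.01`, i.e. 1%, not 15% — the text and the code should be reconciled.)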

In [None]:
# The date attribute is a progressive integer day number; shift it so that the
# earliest remaining incident falls on day 0.
dataset['date'] = dataset['date'] - dataset['date'].min()

# Week index of each incident: days 0-6 -> week 0, days 7-13 -> week 1, ...
# Floor division is vectorised and avoids the float round-trip of int(x / 7).
# (The former " " placeholder column was overwritten immediately, so it is gone.)
dataset['week'] = (dataset['date'] // 7).astype(int)

# NOTE(review): the surrounding text speaks of a 15% threshold, while the value
# used here is 0.01 (1%) — confirm which one is intended.
dropping_threshold = 0.01

# Week indices start at 0, so the number of weeks is the largest index plus one.
n_weeks = int(dataset['week'].max()) + 1
n_weeks

In [None]:
# Distinct city names before filtering (kept under the same name as before).
cities = dataset['city_or_county'].unique()

# Number of distinct weeks with at least one incident, per city, computed in a
# single pass instead of re-scanning and re-slicing the whole frame per city.
weeks_with_incidents = dataset.groupby('city_or_county')['week'].nunique()

# A city is kept only if it has incidents in at least
# n_weeks * dropping_threshold distinct weeks; all other cities are dropped.
min_weeks_required = n_weeks * dropping_threshold
cities_to_keep = weeks_with_incidents[weeks_with_incidents >= min_weeks_required].index
dataset = dataset[dataset['city_or_county'].isin(cities_to_keep)]

print('Number of cities for which time series will be generated:', dataset['city_or_county'].nunique())
dataset.head()

## 1.2 Score functions for subtasks

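Define the functions that compute the weekly score for each of the two subtasks and that build the per-city time series from those scores. (Only `task1` is implemented in the cell below.)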

In [None]:
def compute_week_score(week_data, task):
    """Compute the score of a single week, to be used in the time series.

    Parameters
    ----------
    week_data : pandas.DataFrame
        Rows belonging to one week. For 'task1' it must have an 'n_killed' column.
    task : str
        Task identifier. Only 'task1' is implemented.

    Returns
    -------
    For 'task1', the sum of the 'n_killed' column over the given rows.

    Raises
    ------
    ValueError
        If `task` is not a recognized identifier.
    """
    if task == 'task1':
        return week_data['n_killed'].sum()
    raise ValueError('Task not recognized')


def generate_time_series(city_data, n_weeks, task):
    """Generate the weekly time series for a single city.

    Parameters
    ----------
    city_data : pandas.DataFrame
        Rows of one city; must have a 'week' column holding week indices.
    n_weeks : int
        Length of the series; valid week indices are 0 .. n_weeks - 1.
    task : str
        Task identifier forwarded to `compute_week_score`.

    Returns
    -------
    numpy.ndarray
        Array of length `n_weeks`. Weeks without rows keep the value 0; rows
        whose week index falls outside the valid range are ignored.
    """
    time_series = np.zeros(n_weeks)
    # One pass over the rows instead of one full scan of city_data per week.
    for week, week_data in city_data.groupby('week'):
        index = int(week)
        # Only whole-number indices inside the series are filled in.
        if index == week and 0 <= index < n_weeks:
            time_series[index] = compute_week_score(week_data, task)
    return time_series


def generate_time_series_dataset(dataset, task, city_column='city_or_county'):
    """Generate the weekly time series of every city in the dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have a 'week' column, the column named by `city_column`, and the
        columns required by `compute_week_score` for the chosen task.
    task : str
        Task identifier forwarded to `compute_week_score`.
    city_column : str, optional
        Name of the column identifying the city. Defaults to 'city_or_county',
        the column that the rest of the notebook cleans and filters.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per city (in order of first appearance in
        `dataset`) and one column per week index 0 .. max(week). An empty
        dataset gives an array of shape (0, 0).
    """
    if dataset.empty:
        return np.zeros((0, 0))

    # Week indices start at 0, so the series needs max + 1 slots; using max
    # alone would silently drop the last week.
    n_weeks = int(dataset['week'].max()) + 1

    # sort=False keeps the cities in order of first appearance.
    time_series = [
        generate_time_series(city_data, n_weeks, task)
        for _, city_data in dataset.groupby(city_column, sort=False)
    ]
    return np.array(time_series)

# 2. Clustering and Motif/Anomalies Extraction 

## 2.1 Generate time series

In [None]:
# Build the per-city weekly score matrix for the first subtask and check its size
ts_dataset = generate_time_series_dataset(dataset=dataset, task='task1')
ts_dataset.shape

## 2.2 Clustering 