In [46]:
from datetime import datetime, timedelta
import pandas as pd
import random
import itertools as it
import time
import numpy as np
import pickle
import os

In [47]:
# Raw 2019 crime export; every later preparation cell refines the frame loaded here.
data_path = "data_2019.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

In [48]:
# Peek at the first five rows to check which columns were read.
data.head(n=5)

Unnamed: 0,ID,Date,Primary Type,Latitude,Longitude,Community Area
0,11864018,09/24/2019 08:00:00 AM,DECEPTIVE PRACTICE,41.852248,-87.623786,33.0
1,11859805,10/13/2019 08:30:00 PM,THEFT,41.895732,-87.687784,24.0
2,11863808,10/05/2019 06:30:00 PM,THEFT,41.882002,-87.662287,28.0
3,11859727,10/13/2019 07:00:00 PM,CRIMINAL DAMAGE,41.946987,-87.669164,6.0
4,11859656,10/13/2019 02:10:00 PM,ASSAULT,41.975838,-87.659854,3.0


In [49]:
# The record identifier "ID" is not used by any later cell, so remove that column.
data = data.drop(labels='ID', axis='columns')
data.head(n=5)

Unnamed: 0,Date,Primary Type,Latitude,Longitude,Community Area
0,09/24/2019 08:00:00 AM,DECEPTIVE PRACTICE,41.852248,-87.623786,33.0
1,10/13/2019 08:30:00 PM,THEFT,41.895732,-87.687784,24.0
2,10/05/2019 06:30:00 PM,THEFT,41.882002,-87.662287,28.0
3,10/13/2019 07:00:00 PM,CRIMINAL DAMAGE,41.946987,-87.669164,6.0
4,10/13/2019 02:10:00 PM,ASSAULT,41.975838,-87.659854,3.0


In [50]:
# Give the columns the names the rest of the notebook expects:
#   Date           -> time
#   Community Area -> neighborhood
#   Primary Type   -> crime_type
data = data.rename(
    mapper={
        'Primary Type': 'crime_type',
        'Community Area': 'neighborhood',
        'Date': 'time',
    },
    axis='columns',
)
data.head(n=5)

Unnamed: 0,time,crime_type,Latitude,Longitude,neighborhood
0,09/24/2019 08:00:00 AM,DECEPTIVE PRACTICE,41.852248,-87.623786,33.0
1,10/13/2019 08:30:00 PM,THEFT,41.895732,-87.687784,24.0
2,10/05/2019 06:30:00 PM,THEFT,41.882002,-87.662287,28.0
3,10/13/2019 07:00:00 PM,CRIMINAL DAMAGE,41.946987,-87.669164,6.0
4,10/13/2019 02:10:00 PM,ASSAULT,41.975838,-87.659854,3.0


In [51]:
# Distinct crime categories present in the raw data, in order of first appearance.
pd.unique(data["crime_type"])

array(['DECEPTIVE PRACTICE', 'THEFT', 'CRIMINAL DAMAGE', 'ASSAULT',
       'BATTERY', 'OTHER OFFENSE', 'NARCOTICS', 'WEAPONS VIOLATION',
       'MOTOR VEHICLE THEFT', 'CRIMINAL TRESPASS', 'BURGLARY',
       'INTERFERENCE WITH PUBLIC OFFICER', 'ARSON', 'ROBBERY',
       'CRIM SEXUAL ASSAULT', 'CRIMINAL SEXUAL ASSAULT', 'PROSTITUTION',
       'PUBLIC PEACE VIOLATION', 'OFFENSE INVOLVING CHILDREN',
       'LIQUOR LAW VIOLATION', 'CONCEALED CARRY LICENSE VIOLATION',
       'SEX OFFENSE', 'STALKING', 'GAMBLING', 'HOMICIDE', 'OBSCENITY',
       'INTIMIDATION', 'KIDNAPPING', 'HUMAN TRAFFICKING', 'NON-CRIMINAL',
       'OTHER NARCOTIC VIOLATION', 'PUBLIC INDECENCY'], dtype=object)

In [52]:
# Keep only the eight crime categories under study; rows of every other
# category are discarded.
data = data.loc[
    data["crime_type"].isin([
        'ROBBERY',
        'BATTERY',
        'DECEPTIVE PRACTICE',
        'BURGLARY',
        'ASSAULT',
        'THEFT',
        'CRIMINAL DAMAGE',
        'NARCOTICS',
    ])
]
pd.unique(data["crime_type"])

array(['DECEPTIVE PRACTICE', 'THEFT', 'CRIMINAL DAMAGE', 'ASSAULT',
       'BATTERY', 'NARCOTICS', 'BURGLARY', 'ROBBERY'], dtype=object)

In [53]:
crime_types = ['ROBBERY', 'BATTERY', 'DECEPTIVE PRACTICE', 'BURGLARY', 'ASSAULT', 'THEFT', 'CRIMINAL DAMAGE', 'NARCOTICS']

# map crime types to integers starting from 1
crime_dict = {name: number for number, name in enumerate(crime_types, start=1)}

# add new column crime_type_id
# `data` is the result of a boolean filter, so assigning a column to it in place
# triggers pandas' SettingWithCopyWarning. assign() builds a new frame with the
# extra column appended last, which is the same result without the ambiguity.
# NOTE(review): GenerateData.mapping_neighborhood_crime later recomputes
# crime_type_id as 0-based ids in order of first appearance, so the 1-based ids
# written here do not reach the generated tensors.
data = data.assign(crime_type_id=data["crime_type"].map(crime_dict))
data.head()

Unnamed: 0,time,crime_type,Latitude,Longitude,neighborhood,crime_type_id
0,09/24/2019 08:00:00 AM,DECEPTIVE PRACTICE,41.852248,-87.623786,33.0,3
1,10/13/2019 08:30:00 PM,THEFT,41.895732,-87.687784,24.0,6
2,10/05/2019 06:30:00 PM,THEFT,41.882002,-87.662287,28.0,6
3,10/13/2019 07:00:00 PM,CRIMINAL DAMAGE,41.946987,-87.669164,6.0,7
4,10/13/2019 02:10:00 PM,ASSAULT,41.975838,-87.659854,3.0,5


In [54]:
# Select the columns in the order used from here on:
#   crime_type, neighborhood, time, Latitude, Longitude, crime_type_id
data = data.loc[
    :,
    ['crime_type', 'neighborhood', 'time', 'Latitude', 'Longitude', 'crime_type_id'],
]
data.head(n=5)

Unnamed: 0,crime_type,neighborhood,time,Latitude,Longitude,crime_type_id
0,DECEPTIVE PRACTICE,33.0,09/24/2019 08:00:00 AM,41.852248,-87.623786,3
1,THEFT,24.0,10/13/2019 08:30:00 PM,41.895732,-87.687784,6
2,THEFT,28.0,10/05/2019 06:30:00 PM,41.882002,-87.662287,6
3,CRIMINAL DAMAGE,6.0,10/13/2019 07:00:00 PM,41.946987,-87.669164,7
4,ASSAULT,3.0,10/13/2019 02:10:00 PM,41.975838,-87.659854,5


In [55]:
# Persist the prepared frame; GenerateData reads this pickle back in its constructor.
data.to_pickle(path="data_2019_for_mist.pkl")

### Load from here

In [56]:
# Reload the prepared frame so the notebook can be resumed from this point.
data = pd.read_pickle(filepath_or_buffer="data_2019_for_mist.pkl")

In [57]:
# Width of one time slot, in minutes. Currently 12 hours.
# Other values that were tried: 60 * 24 (1 day), 60 * 6 (6 hours), 60 * 4 (4 hours).
time_frequency = 12 * 60

# Number of consecutive slot boundaries per sample (the sample length):
# n boundaries give n-1 intervals, of which n-2 are features and 1 is the label.
# (A single-neighborhood variant used `choose_neighborhood_id = 26`; it is disabled.)
chunk_size = 10

# Directory that receives the generated feature/label arrays and the data splits.
store_path = "77_neighbourhood_12h/"


### generate feature and label
def convert24(str1):
    """Convert an ``MM/DD/YYYY hh:mm:ss AM|PM`` timestamp string to 24-hour form.

    The hour is read from fixed character positions 11-12, so the date part
    must be zero-padded exactly as in the source data
    (e.g. ``"09/24/2019 08:00:00 AM"``).

    :param str1: timestamp string, normally ending in ``AM`` or ``PM``
    :return: the same timestamp on a 24-hour clock without the meridiem suffix,
        e.g. ``"09/24/2019 20:00:00"``. A string that does not end in ``AM`` or
        ``PM`` is returned stripped but otherwise unchanged, so applying the
        function twice is harmless.
    """
    text = str1.strip()
    meridiem = text[-2:]

    # Without this guard an already-converted string would fall through to the
    # PM arithmetic below and come out corrupted (hour + 12, seconds cut off).
    if meridiem not in ("AM", "PM"):
        return text

    date_part, hour, rest = text[:11], text[11:13], text[13:-2]

    if meridiem == "AM":
        # 12 AM is midnight; every other AM hour is already correct.
        if hour == "12":
            hour = "00"
    elif hour != "12":
        # 12 PM is noon and stays 12; every other PM hour moves to the afternoon.
        hour = str(int(hour) + 12)

    return (date_part + hour + rest).strip()

def convert(date_time):
    """Parse a 24-hour ``MM/DD/YYYY HH:MM:SS`` string into a ``datetime`` object.

    :param date_time: timestamp string such as ``"09/24/2019 20:00:00"``
    :return: the corresponding ``datetime``
    """
    return datetime.strptime(date_time, '%m/%d/%Y %H:%M:%S')

def datetime_range(start, end, delta):
    """Yield ``start``, ``start + delta``, ... while the value is strictly below ``end``.

    :param start: first value to yield
    :param end: exclusive upper bound
    :param delta: increment added after each yielded value
    """
    moment = start
    while True:
        if not moment < end:
            return
        yield moment
        moment += delta

def generate_time_list(start, end):
    """List the slot start times between ``start`` and ``end`` as formatted strings.

    Slots are ``time_frequency`` minutes apart and ``end`` itself is excluded.

    :param start: first slot start (a ``datetime``)
    :param end: exclusive upper bound (a ``datetime``)
    :return: list of strings formatted as ``YYYY-MM-DDTHH:MMZ``
    """
    slot_width = timedelta(minutes=time_frequency)
    stamps = []
    for moment in datetime_range(start, end, slot_width):
        stamps.append(moment.strftime('%Y-%m-%dT%H:%MZ'))
    return stamps

def moving_window(x, length, step=1):
    """Return a zip iterator over tuples of ``length`` items drawn from ``x``.

    ``x`` is duplicated ``length`` times; the k-th copy skips its first
    ``k * step`` items and then advances ``step`` items at a time. With the
    default ``step=1`` this yields every run of ``length`` consecutive items.

    :param x: any iterable
    :param length: number of items per tuple
    :param step: offset between copies and stride within each copy
    :return: a ``zip`` object; it stops when the shortest shifted copy runs out
    """
    shifted = []
    for position, stream in enumerate(it.tee(x, length)):
        shifted.append(it.islice(stream, position * step, None, step))
    return zip(*shifted)


class GenerateData:
    """Build sliding-window feature/label arrays from the pickled crime frame.

    Intended call order (as used by ``generate_train_test``)::

        generator = GenerateData()
        generator.preprocess_time()
        generator.mapping_neighborhood_crime()
        generator.choose_target_generate_fllist()
    """

    def __init__(self, file_path='data_2019_for_mist.pkl'):
        """Load the prepared crime DataFrame from a pickle file.

        :param file_path: path of the pickle to load; defaults to the file
            written by the preparation cells of this notebook
        """
        # file_path = '/tank/users/jiaosun/crime/crime-prediction/data/8-crime-chicago-2015'
        # NOTE: unpickling can execute arbitrary code -- only load files that
        # this notebook produced itself.
        with open(file_path, 'rb') as handle:   # closes the handle even on error
            crime = pickle.load(handle)

        # crime = pd.read_csv(file_path, sep=",", header=None, index_col=None)
        self.crime = crime

    def preprocess_time(self):
        """Rewrite ``time`` as 24-hour strings and add the parsed ``combine_time`` column.

        Both columns are written onto ``self.crime`` in place.
        """
        crime_useful = self.crime
        crime_useful["time"] = crime_useful["time"].apply(convert24)
        crime_useful["combine_time"] = crime_useful["time"].apply(convert)
        self.crime = crime_useful

    def mapping_neighborhood_crime(self):
        """(Re)assign ``crime_type_id`` as 0-based ids in order of first appearance.

        Any ``crime_type_id`` column already present is overwritten.
        """
        crime_df = self.crime
        crime_dict = {k: v for v, k in enumerate(crime_df["crime_type"].unique())}
        crime_df['crime_type_id'] = crime_df["crime_type"].map(crime_dict)
        self.crime = crime_df

    @staticmethod
    def _slot_matrix(frame, start, end, neighborhood_count, type_column):
        """Return the 0/1 (neighborhood x crime type) matrix for records in [start, end).

        :param frame: crime DataFrame with ``combine_time``, ``neighborhood``
            and ``crime_type_id`` columns
        :param start: inclusive lower time bound
        :param end: exclusive upper time bound
        :param neighborhood_count: number of matrix rows
        :param type_column: dict mapping each crime_type_id to its matrix column
        :return: float array of shape ``(neighborhood_count, len(type_column))``
            holding 1 where at least one record exists and 0 elsewhere
        """
        in_slot = frame.loc[(frame['combine_time'] >= start) & (frame['combine_time'] < end)]
        matrix = np.zeros((neighborhood_count, len(type_column)))
        # Row index is the neighborhood number minus one.
        # NOTE(review): this assumes neighborhood numbers run from 1 up to the
        # number of distinct neighborhoods -- confirm against the data.
        rows = in_slot["neighborhood"].astype(int).to_numpy() - 1
        # astype(int) keeps the index array integer-typed even when the slot is empty.
        cols = in_slot["crime_type_id"].map(type_column).astype(int).to_numpy()
        matrix[rows, cols] = 1
        return matrix

    def choose_target_generate_fllist(self):
        '''
        Slide a window of ``chunk_size`` slot boundaries over the data and store
        the resulting feature and label arrays under ``store_path``.

        Each window of n boundaries gives n-1 intervals: the first n-2 become the
        feature matrices and the last one becomes the label matrix. Every matrix
        is (number of neighborhoods x number of crime types) and holds 1 where a
        crime of that type was recorded in that neighborhood during the interval.

        Reads the module-level ``time_frequency`` (slot width in minutes),
        ``chunk_size`` and ``store_path``. Writes ``feature.npy`` and
        ``label.npy`` (pickled arrays) into ``store_path``, creating the
        directory when it does not exist.

        :return: None
        '''
        # no rows are filtered out here: all neighborhoods are used
        crime = self.crime

        # One matrix column per crime id, shared by features AND labels. The
        # previous code indexed features with the id and labels with id - 1,
        # which shifted every label column and wrapped id 0 to the last column.
        type_ids = sorted(crime["crime_type_id"].unique())
        type_column = {type_id: column for column, type_id in enumerate(type_ids)}
        neighborhood_count = len(crime["neighborhood"].unique())     # e.g. 77

        start_time_so = crime["combine_time"].min()
        end_time_so = crime["combine_time"].max()
        time_list_so = list(datetime_range(start_time_so, end_time_so,
                                           timedelta(minutes=time_frequency)))
        windows = list(moving_window(time_list_so, chunk_size))

        # Consecutive windows share all but one interval, so build each
        # interval's matrix once and reuse it.
        slot_cache = {}

        def slot(start, end):
            key = (start, end)
            if key not in slot_cache:
                slot_cache[key] = self._slot_matrix(crime, start, end,
                                                    neighborhood_count, type_column)
            return slot_cache[key]

        final_list_so = []
        label_list_so = []
        for window in windows:
            # first n-1 boundaries -> n-2 feature intervals
            feature_time_frame = window[:chunk_size - 1]
            final_list_so.append([slot(start_so, end_so) for start_so, end_so
                                  in zip(feature_time_frame, feature_time_frame[1:])])

            # last two boundaries -> the single label interval
            label_time_frame = window[chunk_size - 2:]
            label_list_so.append(slot(label_time_frame[0], label_time_frame[1]))

        print("the shape of feature list is {}, and the shape of label list is {} ".format(np.shape(final_list_so),
                                                                                           np.shape(label_list_so)))

        if store_path:
            os.makedirs(store_path, exist_ok=True)
        # Passing the file name lets numpy open and close the file itself.
        np.array(final_list_so).dump(store_path + '/feature' + '.npy')
        np.array(label_list_so).dump(store_path + '/label' + '.npy')
        print("Successfully stored the data at "+ store_path + " !")
        # return final_list_so, label_list_so

In [58]:
def generate_train_test():
    """Generate the feature/label arrays and write train/val/test splits.

    Runs the full ``GenerateData`` pipeline, reloads the stored ``feature.npy``
    and ``label.npy`` from ``store_path``, adds a length-1 axis to the labels
    and splits the samples in order: the first ~70% for training, the last
    ~20% for testing and whatever lies between for validation. Each split is
    saved as ``<split>.npz`` (keys ``x``, ``y``, ``x_offsets``, ``y_offsets``)
    inside ``store_path``.

    :return: None
    """
    test_object = GenerateData()
    test_object.preprocess_time()
    test_object.mapping_neighborhood_crime()
    test_object.choose_target_generate_fllist()

    feature_file_name = store_path + '/feature' +'.npy'
    label_file_name = store_path + '/label' +'.npy'

    x = np.load(feature_file_name, allow_pickle=True)
    y = np.load(label_file_name, allow_pickle=True)
    y = np.expand_dims(y, axis = 1)

    num_samples = x.shape[0]
    num_test = round(num_samples * 0.2)
    num_train = round(num_samples * 0.7)
    num_val = num_samples - num_test - num_train

    # output_dir = '/home/users/jiaosun/DCRNN/data/CRIME-LA/'
    output_dir = store_path + "/"

    val_end = num_train + num_val
    # Slice from an explicit start index: x[-num_test:] would return ALL
    # samples when num_test is 0 (very small inputs), because -0 == 0.
    test_start = num_samples - num_test
    splits = {
        "train": (x[:num_train], y[:num_train]),
        "val": (x[num_train:val_end], y[num_train:val_end]),
        "test": (x[test_start:], y[test_start:]),
    }

    # NOTE(review): these offsets are fixed 12-step values and are not derived
    # from the actual window length of x / y -- confirm whether any consumer of
    # the .npz files reads them.
    x_offsets = np.array([-11, -10,  -9,  -8,  -7,  -6,  -5,  -4,  -3,  -2,  -1,   0])
    y_offsets = np.array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

    for cat, (_x, _y) in splits.items():
        print(cat, "x: ", _x.shape, "y:", _y.shape)
        np.savez_compressed(
            os.path.join(output_dir, "%s.npz" % cat),
            x=_x,
            y=_y,
            x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]),
            y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]),
        )

In [59]:
# import os 

# # make folders for 77 neighborhoods
# for i in range(1, 78):
#     os.makedirs("data_2019_for_mist_4h/" + str(i))

In [60]:
#for i in range(1, 78):
# Run the pipeline once for all neighborhoods together: build the feature/label
# arrays and write the train/val/test .npz files into store_path.
generate_train_test()

the shape of feature list is (721, 8, 77, 8), and the shape of label list is (721, 77, 8) 
Successfully stored the data at 77_neighbourhood_12h/ !
train x:  (505, 8, 77, 8) y: (505, 1, 77, 8)
val x:  (72, 8, 77, 8) y: (72, 1, 77, 8)
test x:  (144, 8, 77, 8) y: (144, 1, 77, 8)
