In [0]:
import pandas as pd

In [0]:
# Load the two raw inputs: the house table (with centroids) and the incident (intervention) table.
path = 'Data/'
file1, file2 = (path + name for name in ('houseWithCentroid.csv', 'donneesouvertes-interventions-sim.csv'))
# data1 holds the houses, data2 holds the incidents; they are read in that order.
data1, data2 = (pd.read_csv(csv_file) for csv_file in (file1, file2))


In [0]:
# Keep only the house columns used below, then discard every row that has any missing value.
house_centroid_withna = data1.loc[:, ['Unnamed: 0', 'ID_UEV', 'ANNEE_CONSTRUCTION', 'centroid']]
house_centroid = house_centroid_withna.dropna(axis=0, how='any')

In [0]:
# The longitudes and latitudes in the original dataset are stored as text (object dtype). The
#     functions below parse them out of the House With Centroid table.
# function point_to_coordinate: extract the longitude and latitude from a Point string.
import re
def point_to_coordinate(point_string):
    """Return every number found in ``point_string`` as a list of floats.

    A number is an optional minus sign, one or more digits, and an optional
    fractional part. Numbers are returned in order of appearance, so a string
    holding two numbers yields ``[first, second]``; a string without any
    number yields an empty list.
    """
    return list(map(float, re.findall(r"-?\d+\.?\d*", point_string)))

# function geometry_to_coordinates: extract every pair of numbers from a Geometry string.
def geometry_to_coordinates(geometry_string):
    """Return the number pairs found in ``geometry_string`` as a list of float lists.

    A pair is two numbers separated by exactly one whitespace character. Each
    pair is converted with ``point_to_coordinate``, so the result looks like
    ``[[a1, b1], [a2, b2], ...]`` in order of appearance.
    """
    pair_pattern = r"-?\d+\.?\d*\s-?\d+\.?\d*"
    coordinates = []
    for pair_text in re.findall(pair_pattern, geometry_string):
        coordinates.append(point_to_coordinate(pair_text))
    return coordinates


In [0]:

import math
# function radians: convert an angle from degrees to radians (360 degrees == 2 * pi radians).
def radians(degree):
    """Convert the angle ``degree`` from degrees to radians."""
    half_turn_degrees = 180.0
    return degree * math.pi / half_turn_degrees

# function distance: approximate distance between two (longitude, latitude) points.
#      The flat-earth approximation used here is only valid over short distances.
def distance(point1, point2):
    """Return the approximate distance between two points given in degrees.

    Each point is an ``(x, y)`` pair; callers in this notebook pass
    ``(longitude, latitude)``. The y difference and the x difference (scaled
    by the mean cosine of the two y values) are converted to arc lengths on a
    sphere of radius 6371 and combined with Pythagoras, so the result is in
    the unit of that radius (kilometres, assuming 6371 is Earth's radius in km).
    """
    lon_a, lat_a = point1
    lon_b, lat_b = point2
    north_south = radians(lat_b - lat_a) * 6371
    # East-west degrees shrink with latitude; use the average cosine of both latitudes.
    cos_sum = math.cos(radians(lat_a)) + math.cos(radians(lat_b))
    east_west = radians(lon_b - lon_a) * 6371 * cos_sum / 2
    return math.sqrt(north_south * north_south + east_west * east_west)


In [0]:
# Comparing every incident with every house would produce an enormous number of pairs, so the
#      area of interest is split into a 400 * 400 grid and the houses are stored in its cells.
# Bounding box of the area of interest, in degrees.
min_lat, max_lat = 45.40, 45.80
min_long, max_long = -74.00, -73.40

# Number of grid cells along the latitude axis and along the longitude axis.
scale_lat = scale_long = 400

# function getY: compute the Y index of the grid cell for a given latitude.
def getY(latitude):
    """Return the grid row index for ``latitude``.

    The latitude is rescaled from ``[min_lat, max_lat]`` to ``[0, scale_lat]``
    and truncated toward zero by ``int``. No range check is done, so latitudes
    outside the bounding box give indices outside ``0 .. scale_lat - 1``.
    """
    fraction = (latitude - min_lat) / (max_lat - min_lat)
    return int(fraction * scale_lat)

# function getX: compute the X index of the grid cell for a given longitude.
def getX(longitude):
    """Return the grid column index for ``longitude``.

    The longitude is rescaled from ``[min_long, max_long]`` to
    ``[0, scale_long]`` and truncated toward zero by ``int``. No range check is
    done, so longitudes outside the bounding box give indices outside
    ``0 .. scale_long - 1``.
    """
    fraction = (longitude - min_long) / (max_long - min_long)
    return int(fraction * scale_long)


In [0]:
# Grid for storing the house info: house_map[x][y] is the list of houses whose centroid falls in
# grid cell (x, y), where x = getX(longitude) and y = getY(latitude).
house_map = [[[] for y_index in range(scale_lat)] for x_index in range(scale_long)]

# Store the house info into the grid.
n = 0  # number of house records processed (including the ones skipped below)
for record in house_centroid.values:
    # record = (row index, ID_UEV, ANNEE_CONSTRUCTION, centroid string)
    longitude, latitude = point_to_coordinate(record[3])
    i = getX(longitude)
    j = getY(latitude)
    n = n + 1
    # Skip houses that fall outside the grid: an index >= scale would raise IndexError, and a
    # negative index would silently wrap around to a cell on the opposite edge of the map.
    if not (0 <= i < scale_long and 0 <= j < scale_lat):
        continue
    # Stored tuple: (row index, ID_UEV, build year as int, longitude, latitude)
    house_map[i][j].append((record[0], record[1], int(record[2]), longitude, latitude))

In [0]:
# Keep only the incident columns needed for the matching step.
alert_info = data2.loc[:, [
    'incident_nbr', 'creation_date_time', 'incident_type_desc',
    'description_groupe', 'longitude', 'latitude',
]]

In [0]:
# For every incident, find the (house_num: 5) nearest houses that lie within max_distance_m metres
# and already existed when the incident happened; store their distance and house age.
alert_house_va = []

house_num = 5
max_distance_m = 50  # search radius around the incident, in metres

for alert in alert_info.values:
    # alert = (incident_nbr, creation_date_time, incident_type_desc, description_groupe,
    #          longitude, latitude)
    alert_longitude = float(alert[4])
    alert_latitude = float(alert[5])
    # The first four characters of creation_date_time are the year; parse it once per incident.
    alert_year = int(alert[1][0:4])

    # Calculate which grid cell the incident is in.
    i = getX(alert_longitude)
    j = getY(alert_latitude)

    # The houses are stored per grid cell, so only the incident's own cell and its 8 neighbours
    #      need to be scanned (the ranges are clipped at the border of the grid).
    temp = []
    for x in range(max(0, i-1), min(scale_long, i+2)):
        for y in range(max(0, j-1), min(scale_lat, j+2)):
            for house in house_map[x][y]:
                # house = (row index, ID_UEV, build year, longitude, latitude)
                age = alert_year - house[2]
                # A negative age means the house did not exist when the incident happened.
                if age < 0:
                    continue
                # distance() is multiplied by 1000 to compare against a threshold in metres.
                d = distance((alert_longitude, alert_latitude), (house[3], house[4])) * 1000
                if d < max_distance_m:
                    temp.append((alert[0], alert[1], alert[2], alert[3], alert[4], alert[5],
                                 house[0], house[1], house[2], house[3], house[4], age, d))

    # Keep the house_num closest candidates. list.sort is stable, so houses at exactly the same
    # distance stay in the order in which they were scanned.
    temp.sort(key=lambda candidate: candidate[-1])
    alert_house_va.extend(temp[:house_num])

# Number of (incident, house) pairs found.
len(alert_house_va)

1479239

In [0]:
# Turn the matched (incident, house) tuples into a DataFrame. The column names are passed to the
# constructor so that an empty match list still yields a frame with the expected columns instead
# of raising a length-mismatch error.
alert_house = pd.DataFrame(
    alert_house_va,
    columns=[
        'incident_nbr', 'incident_time', 'incident_type_desc', 'description_groupe',
        'incident_long', 'incident_lat',
        'house_idx', 'house_id_uev', 'house_build_year', 'house_long', 'house_lat',
        'house_age', 'distance',
    ],
)


In [0]:
# Summary statistics of the numeric columns of the matched incident/house pairs.
alert_house.describe()

Unnamed: 0,incident_nbr,incident_long,incident_lat,house_idx,house_id_uev,house_build_year,house_long,house_lat,house_age,distance
count,1479239.0,1479239.0,1479239.0,1479239.0,1479239.0,1479239.0,1479239.0,1479239.0,1479239.0,1479239.0
mean,61375.02,-73.61688,45.52851,239374.7,2824104.0,1955.053,-73.61688,45.5285,61.09757,32.38917
std,38579.9,0.07839566,0.05588166,151508.2,1386539.0,32.60101,0.07839026,0.05587846,32.60929,7.472253
min,1.0,-73.96768,45.40269,4.0,1000002.0,1680.0,-73.96755,45.40234,0.0,3.9256
25%,26008.0,-73.63865,45.48979,103935.0,2016234.0,1934.0,-73.63861,45.48974,40.0,26.86708
50%,59881.0,-73.5999,45.52644,232694.0,3003164.0,1958.0,-73.59996,45.5265,58.0,31.05387
75%,95012.0,-73.5689,45.56395,370000.0,4038055.0,1976.0,-73.56895,45.56395,83.0,37.40992
max,131673.0,-73.4794,45.70178,513232.0,5256633.0,2017.0,-73.47949,45.70213,338.0,49.99975


In [0]:
# Prepare the per-incident information; the averaged house age is attached to it later.
alert_house_30_short = alert_house[[
    'incident_nbr', 'incident_time', 'incident_type_desc',
    'description_groupe', 'incident_long', 'incident_lat',
]]
# An incident appears once per matched house, so keep a single row per incident.
alert_house_30_simple = alert_house_30_short.drop_duplicates(subset=['incident_nbr', 'incident_time'])

# Index by (incident_nbr, incident_time) so the mean ages can be aligned on the same key.
alert_house_30_mean = alert_house_30_simple.set_index(keys=['incident_nbr', 'incident_time'])

In [0]:
# For each incident, average the age of its matched houses (at most 5 nearest houses each).
mean_value = (
    alert_house
    .groupby(by=['incident_time', 'incident_nbr'])['house_age']
    .mean()
    .reset_index()
)
# Use the same (incident_nbr, incident_time) index as the per-incident table.
alert_mean_value = mean_value.set_index(keys=['incident_nbr', 'incident_time'])

In [0]:
# Attach the average house age to the incident info; rows are aligned on the shared
# (incident_nbr, incident_time) index.
alert_house_30_mean = alert_house_30_mean.assign(mean_age=alert_mean_value['house_age'])

alert_house_30_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,incident_type_desc,description_groupe,incident_long,incident_lat,mean_age
incident_nbr,incident_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2015-01-01 00:03:22,Inondation,Sans incendie,-73.580575,45.535698,99.800000
2,2015-01-01 00:05:58,Ac.véh./1R/s.v./ext/29B/D,Premier répondant,-73.574247,45.494499,97.666667
3,2015-01-01 00:08:34,Appel de Cie de détection,Alarmes-incendies,-73.665779,45.442005,60.000000
4,2015-01-01 00:11:28,Premier répondant,Premier répondant,-73.489981,45.641294,94.500000
5,2015-01-01 00:14:06,Odeur suspecte - gaz,Sans incendie,-73.615028,45.483883,44.500000
6,2015-01-01 00:20:27,Premier répondant,Premier répondant,-73.580606,45.485622,29.000000
7,2015-01-01 00:21:01,Premier répondant,Premier répondant,-73.594318,45.584117,42.000000
8,2015-01-01 00:22:38,Alarme vérification,Alarmes-incendies,-73.641177,45.597359,42.000000
9,2015-01-01 00:24:18,10-22 sans feu,Sans incendie,-73.637632,45.420985,49.600000
10,2015-01-01 00:25:18,Premier répondant,Premier répondant,-73.574568,45.514799,81.600000


In [0]:
# Summary statistics of the numeric columns of the per-incident table, including mean_age.
alert_house_30_mean.describe()

Unnamed: 0,incident_long,incident_lat,mean_age
count,339365.0,339365.0,339365.0
mean,-73.618531,45.527448,60.080318
std,0.079659,0.055932,27.21983
min,-73.967681,45.402687,0.0
25%,-73.640738,45.489181,43.0
50%,-73.601556,45.524454,57.666667
75%,-73.568947,45.562622,77.0
max,-73.479402,45.701779,217.0


In [0]:
# Save the per-incident table (with the mean house age) as a CSV file under `path`.
alert_house_30_mean.to_csv(path_or_buf=path + 'incident_mean_5_in_50meters.csv')