In [0]:
import pandas as pd

In [0]:
# Load the two input tables from the data directory: the houses with their centroids (data1)
#      and the incident/intervention records (data2).
path = 'Data/'
file1, file2 = (path + csv_name for csv_name in ('houseWithCentroid.csv',
                                                 'donneesouvertes-interventions-sim.csv'))
data1, data2 = (pd.read_csv(csv_file) for csv_file in (file1, file2))


In [0]:
# Keep only the house columns used later on, then discard every row that has a missing value
#      in any of those columns.
house_centroid_withna = data1[[
    'Unnamed: 0',
    'ID_UEV',
    'ANNEE_CONSTRUCTION',
    'centroid',
]]
house_centroid = house_centroid_withna.dropna(how='any', axis=0)

In [0]:
# The longitudes and latitudes in the original dataset are stored as text (dtype object). These
#     functions parse the longitude and latitude out of the House With Centroid table.
# function point_to_coordinate: load longitude and latitude from a Point structure.
import re
def point_to_coordinate(point_string):
    """Extract every number found in ``point_string`` as a list of floats.

    A number is an optional minus sign, one or more digits, and an optional
    decimal part. The numbers are returned in the order they appear in the
    text; an input without any number yields an empty list.
    """
    return list(map(float, re.findall(r"-?\d+\.?\d*", point_string)))

# function geometry_to_coordinates: load multiple longitude/latitude pairs from a Geometry structure
def geometry_to_coordinates(geometry_string):
    """Extract every "number<whitespace>number" pair found in ``geometry_string``.

    Each pair is converted with ``point_to_coordinate``, so the result is a list
    of two-element float lists, in the order the pairs appear in the text.
    """
    pair_texts = re.findall(r"-?\d+\.?\d*\s-?\d+\.?\d*", geometry_string)
    return list(map(point_to_coordinate, pair_texts))


In [0]:
# function radians: Convert an angle from degrees to radians (360 degrees = 2 Pi radians).
import math
def radians(degree):
    """Convert an angle expressed in degrees to radians."""
    half_turn_degrees = 180.0
    return degree * math.pi / half_turn_degrees

# function distance: Compute the distance between two points from their longitudes and latitudes.
#      This approximation only works over short distances.
def distance(point1, point2):
    """Approximate the distance between two (longitude, latitude) points given in degrees.

    The north-south and east-west offsets are each turned into an arc length on
    a sphere of radius 6371 (presumably the Earth's mean radius in km, which
    makes the result kilometres), and combined with Pythagoras. The east-west
    offset is scaled by the mean of the cosines of the two latitudes.
    """
    sphere_radius = 6371
    lon_a, lat_a = point1
    lon_b, lat_b = point2

    north_south = radians(lat_b - lat_a) * sphere_radius
    cosine_sum = math.cos(radians(lat_a)) + math.cos(radians(lat_b))
    east_west = radians(lon_b - lon_a) * sphere_radius * cosine_sum / 2

    return math.sqrt(north_south * north_south + east_west * east_west)


In [0]:
# If we compared every incident with every house, there would be several trillion (T) pairs to check! So we
#      split the area of interest into 400 * 400 grids and store the house data in them.

# Bounding box, in degrees, of the area covered by the grid.
# min_lat used to be missing as code: its assignment had been swallowed by the comment above
#      getY, so getY raised a NameError when the notebook was run on a fresh kernel.
min_lat = 45.40
max_lat = 45.80
min_long = -74.00
max_long = -73.40

# Number of grid cells along the latitude and longitude axes.
scale_lat = 400
scale_long = 400


# function getX: Decide the X coordinate of Grid with the longitude info.
def getX(longitude):
    """Return the grid column index of ``longitude``.

    The longitude is rescaled linearly so that ``min_long`` maps to 0 and
    ``max_long`` maps to ``scale_long``, then truncated toward zero with
    ``int``. Longitudes outside [min_long, max_long) can therefore produce an
    index outside ``range(scale_long)``; callers must check the bounds.
    """
    return int((longitude - min_long) / (max_long - min_long) * scale_long)


# function getY: Decide the Y coordinate of Grid with the latitude info.
def getY(latitude):
    """Return the grid row index of ``latitude``.

    The latitude is rescaled linearly so that ``min_lat`` maps to 0 and
    ``max_lat`` maps to ``scale_lat``, then truncated toward zero with ``int``.
    Latitudes outside [min_lat, max_lat) can therefore produce an index
    outside ``range(scale_lat)``; callers must check the bounds.
    """
    return int((latitude - min_lat) / (max_lat - min_lat) * scale_lat)


In [0]:
# Grids for storing the house info: house_map[i][j] is the list of houses whose centroid falls
#      in grid column i (from the longitude) and grid row j (from the latitude).
house_map = [[[] for _ in range(scale_lat)] for _ in range(scale_long)]

# Store the house info into the grids
for record in house_centroid.values:
    # record holds the selected columns in order: row label, ID_UEV, ANNEE_CONSTRUCTION, centroid.
    longitude, latitude = point_to_coordinate(record[3])
    i = getX(longitude)
    j = getY(latitude)
    # Skip centroids that fall outside the grid: an index >= scale would raise an IndexError,
    #      and a negative index would silently wrap around to the opposite edge of the grid.
    if not (0 <= i < scale_long and 0 <= j < scale_lat):
        continue
    house_map[i][j].append((record[0], record[1], int(record[2]), longitude, latitude))

In [0]:
# Keep only the incident columns that the matching step reads.
alert_info = data2[[
    'incident_nbr',
    'creation_date_time',
    'incident_type_desc',
    'description_groupe',
    'longitude',
    'latitude',
]]

In [0]:
# For every incident, find the (house_num: 1) nearest house(s) and store their distance and
#      house age.
alert_house_va = []

# Number of nearest houses kept per incident.
house_num = 1
# Only houses closer than this many meters to the incident are kept. The search below only
#      looks at the incident's grid cell and its 8 neighbours, so this radius has to stay
#      smaller than one grid cell (roughly 110 m with the current bounds and scales).
max_distance_m = 100

for alert in alert_info.values:
    # alert holds the selected columns in order: incident_nbr, creation_date_time,
    #      incident_type_desc, description_groupe, longitude, latitude.
    alert_longitude = float(alert[4])
    alert_latitude = float(alert[5])
    # The year is the first four characters of the creation timestamp. It only depends on
    #      the incident, so it is parsed once here instead of once per candidate house.
    alert_year = int(alert[1][0:4])

    # Calculate which grid the incident is in.
    grid_x = getX(alert_longitude)
    grid_y = getY(alert_latitude)

    # We had stored the house info into the grids. So we need only compare the houses in the
    #      grid where the incident is and in the 8 nearby grids (clamped to the grid edges).
    candidates = []
    for x in range(max(0, grid_x - 1), min(scale_long, grid_x + 2)):
        for y in range(max(0, grid_y - 1), min(scale_lat, grid_y + 2)):
            for house in house_map[x][y]:
                # house is (row label, ID_UEV, build year, longitude, latitude).
                age = alert_year - house[2]
                # distance() is multiplied by 1000 to compare against a threshold in meters.
                d = distance((alert_longitude, alert_latitude), (house[3], house[4])) * 1000

                # If the age is negative, that means the house did not exist when the
                #      incident happened. Houses beyond max_distance_m are ignored for now.
                if d < max_distance_m and age >= 0:
                    candidates.append((alert[0], alert[1], alert[2], alert[3], alert[4], alert[5],
                                       house[0], house[1], house[2], house[3], house[4], age, d))

    # Keep the house_num closest candidates. list.sort is stable, so houses at exactly the
    #      same distance stay in the order in which they were found.
    candidates.sort(key=lambda candidate: candidate[-1])
    alert_house_va.extend(candidates[:house_num])

len(alert_house_va)

377889

In [0]:
# Turn the matched (incident, house) tuples into a table. The column names are passed to the
#      constructor so that an empty match list still yields an (empty) table with the expected
#      columns, instead of failing when the names are assigned afterwards.
alert_house = pd.DataFrame(
    alert_house_va,
    columns=[
        'incident_nbr', 'incident_time', 'incident_type_desc', 'description_groupe',
        'incident_long', 'incident_lat',
        'house_idx', 'house_id_uev', 'house_build_year', 'house_long', 'house_lat',
        'house_age', 'distance',
    ],
)
alert_house.head(30)

Unnamed: 0,incident_nbr,incident_time,incident_type_desc,description_groupe,incident_long,incident_lat,house_idx,house_id_uev,house_build_year,house_long,house_lat,house_age,distance
0,1,2015-01-01 00:03:22,Inondation,Sans incendie,-73.580575,45.535698,85496,5205852,1924,-73.580235,45.535784,91,28.15256
1,2,2015-01-01 00:05:58,Ac.véh./1R/s.v./ext/29B/D,Premier répondant,-73.574247,45.494499,19042,1002467,1885,-73.574777,45.494605,130,42.926352
2,3,2015-01-01 00:08:34,Appel de Cie de détection,Alarmes-incendies,-73.665779,45.442005,163044,5058214,1969,-73.666033,45.441832,46,27.591212
3,4,2015-01-01 00:11:28,Premier répondant,Premier répondant,-73.489981,45.641294,5193,2079483,1845,-73.490354,45.641525,170,38.687683
4,5,2015-01-01 00:14:06,Odeur suspecte - gaz,Sans incendie,-73.615028,45.483883,76556,1011327,1935,-73.615288,45.484111,80,32.471602
5,6,2015-01-01 00:20:27,Premier répondant,Premier répondant,-73.580606,45.485622,87921,1047407,1986,-73.580156,45.485522,29,36.795141
6,7,2015-01-01 00:21:01,Premier répondant,Premier répondant,-73.594318,45.584117,316156,2090583,1973,-73.594119,45.584257,42,22.006792
7,8,2015-01-01 00:22:38,Alarme vérification,Alarmes-incendies,-73.641177,45.597359,269925,2082029,1980,-73.640893,45.597297,35,23.159863
8,9,2015-01-01 00:24:18,10-22 sans feu,Sans incendie,-73.637632,45.420985,111274,4035131,1964,-73.637961,45.420914,51,26.883926
9,10,2015-01-01 00:25:18,Premier répondant,Premier répondant,-73.574568,45.514799,403308,1058854,1900,-73.574602,45.514565,115,26.195453


In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of the
#      matched incident/house table.
alert_house.describe()

Unnamed: 0,incident_nbr,incident_long,incident_lat,house_idx,house_id_uev,house_build_year,house_long,house_lat,house_age,distance
count,377889.0,377889.0,377889.0,377889.0,377889.0,377889.0,377889.0,377889.0,377889.0,377889.0
mean,61460.593494,-73.620827,45.526568,244365.408326,2734591.0,1956.339108,-73.620839,45.526563,59.812434,32.489745
std,38574.520963,0.08179,0.056089,153321.322985,1345624.0,31.78829,0.081793,0.056088,31.795151,14.20257
min,1.0,-73.968949,45.402687,12.0,1000003.0,1680.0,-73.969765,45.402456,0.0,3.9256
25%,26058.0,-73.644293,45.488115,108448.0,2011120.0,1940.0,-73.644265,45.488075,40.0,24.296978
50%,60011.0,-73.60244,45.522194,244295.0,2153992.0,1960.0,-73.602462,45.522187,56.0,27.766778
75%,95146.0,-73.568621,45.561702,378955.0,4023742.0,1976.0,-73.568638,45.561599,77.0,34.974498
max,131673.0,-73.479402,45.701779,513219.0,5256583.0,2017.0,-73.47949,45.702132,338.0,99.985858


In [0]:
# Save the matched incident/house table in the data directory.
# NOTE(review): no `index` argument is passed, so pandas presumably also writes the row index as
#      an unnamed leading column — confirm that downstream readers expect it.
alert_house.to_csv(path + 'incident_nearest_centroid.csv')