# Data wrangling

## Libraries

In [12]:
import pandas as pd


## Read data

In [13]:
stations = pd.read_json("data/stations.json")
crimes = pd.read_json("data/crimes.json")
distances = pd.read_json("data/distances.json")
community = pd.read_json("data/community_areas.json")

In [14]:
stations

Unnamed: 0,station_name,community_area,longitude,latitude
0,3510 S Michigan Ave,DOUGLAS,-87.623395,41.830702
1,1160 N Larrabee St,NEAR NORTH SIDE,-87.643352,41.903242
2,850 W Addison St,LAKE VIEW,-87.651512,41.9474
3,5400 N Lincoln Ave,LINCOLN SQUARE,-87.692845,41.97955
4,1900 W Monterey Ave,MORGAN PARK,-87.66852,41.691435
5,6464 N Clark St,ROGERS PARK,-87.671324,41.999763
6,5555 W Grand Ave,BELMONT CRAGIN,-87.765574,41.918609
7,1718 S State St,NEAR SOUTH SIDE,-87.627356,41.858373
8,5101 S Wentworth Ave,FULLER PARK,-87.63056,41.801811
9,7040 S Cottage Grove Ave,WOODLAWN,-87.605748,41.766431


## New data

### New stations

In [15]:
new_stations = stations.copy()
new_stations['ID_ps'] = [i for i in range(1,24)]


In [16]:
new_stations

Unnamed: 0,station_name,community_area,longitude,latitude,ID_ps
0,3510 S Michigan Ave,DOUGLAS,-87.623395,41.830702,1
1,1160 N Larrabee St,NEAR NORTH SIDE,-87.643352,41.903242,2
2,850 W Addison St,LAKE VIEW,-87.651512,41.9474,3
3,5400 N Lincoln Ave,LINCOLN SQUARE,-87.692845,41.97955,4
4,1900 W Monterey Ave,MORGAN PARK,-87.66852,41.691435,5
5,6464 N Clark St,ROGERS PARK,-87.671324,41.999763,6
6,5555 W Grand Ave,BELMONT CRAGIN,-87.765574,41.918609,7
7,1718 S State St,NEAR SOUTH SIDE,-87.627356,41.858373,8
8,5101 S Wentworth Ave,FULLER PARK,-87.63056,41.801811,9
9,7040 S Cottage Grove Ave,WOODLAWN,-87.605748,41.766431,10


### New Community areas

In [30]:
new_community = community.copy()
new_community.rename(columns={'ID':'ID_CA'}, inplace=True)

new_community

Unnamed: 0,ID_CA,community_area,population,area,neighbours
0,1,ROGERS PARK,55628,4.77,"[1, 2, 77]"
1,2,WEST RIDGE,77122,9.14,"[1, 2, 4, 13, 77]"
2,3,UPTOWN,57182,6.01,"[3, 4, 5, 6, 77]"
3,4,LINCOLN SQUARE,40494,6.63,"[2, 3, 4, 5, 6, 13, 14, 16, 77]"
4,5,NORTH CENTER,35114,5.31,"[3, 4, 5, 6, 7, 14, 16, 21, 22]"
...,...,...,...,...,...
72,73,WASHINGTON HEIGHTS,25065,7.41,"[49, 71, 72, 73, 75]"
73,74,MOUNT GREENWOOD,18628,7.02,"[72, 74, 75]"
74,75,MORGAN PARK,21186,8.55,"[49, 53, 72, 73, 74, 75]"
75,76,OHARE,13418,34.55,"[10, 17, 76]"


### Distances D: distances between police stations and community areas

In [18]:
D_distances = stations.merge(distances, on = "station_name")
D_distances.drop(columns=['community_area', 'longitude', 'latitude'], inplace=True)
D_distances


Unnamed: 0,station_name,distances
0,3510 S Michigan Ave,"[2092.19, 2103.08, 1560.8, 1750.1, 1439.87, 13..."
1,1160 N Larrabee St,"[1217.56, 1283.21, 686.17, 930.23, 662.57, 467..."
2,850 W Addison St,"[784.26, 885.85, 252.87, 540.93, 298.23, 106.0..."
3,5400 N Lincoln Ave,"[478.94, 346.67, 482.46, 132.04, 511.79, 642.4..."
4,1900 W Monterey Ave,"[3025.14, 3036.02, 2493.75, 2683.05, 2372.81, ..."
5,6464 N Clark St,"[125.89, 311.05, 511.7, 415.38, 711.59, 740.61..."
6,5555 W Grand Ave,"[1597.02, 1399.75, 1335.54, 1179.09, 1029.81, ..."
7,1718 S State St,"[1833.29, 1941.22, 1377.79, 1588.24, 1278.01, ..."
8,5101 S Wentworth Ave,"[2229.84, 2240.73, 1698.46, 1887.75, 1577.52, ..."
9,7040 S Cottage Grove Ave,"[2596.72, 2607.61, 2065.33, 2254.63, 1944.4, 1..."


### Distances d: distances between police stations

In [19]:
import math


d_distances = stations.copy()


distances = []

for i, row in d_distances.iterrows():
    d = []
    lon_i = stations['longitude'][i]
    lat_i = stations['latitude'][i] 
    coord_i = [lat_i, lon_i]                  # latitude comes before longitude
    for j in range(23):
        lon_j = stations['longitude'][j]
        lat_j = stations['latitude'][j]
        coord_j = [lat_j, lon_j]
        d.append(math.dist(coord_i, coord_j)) # Using Euclidean distance
    distances.append(d)

d_distances['distances']= distances
d_distances.drop(columns=["community_area", "longitude", "latitude"], inplace=True)
d_distances

## DISTANCES ENTRE STATIONS ES DIFERENTE QUE DISTANCES DE STATIONS Y COMMUNITY AREAS

Unnamed: 0,station_name,distances
0,3510 S Michigan Ave,"[0.0, 0.07523509330030836, 0.1200381191597144,..."
1,1160 N Larrabee St,"[0.07523509330030836, 0.0, 0.0449063931108958,..."
2,850 W Addison St,"[0.1200381191597144, 0.0449063931108958, 0.0, ..."
3,5400 N Lincoln Ave,"[0.16425242568515927, 0.09095264804279252, 0.0..."
4,1900 W Monterey Ave,"[0.14639515516556165, 0.2132969549413625, 0.25..."
5,6464 N Clark St,"[0.17572442097650645, 0.10049330744965694, 0.0..."
6,5555 W Grand Ave,"[0.16716034463771068, 0.12318462753238194, 0.1..."
7,1718 S State St,"[0.027952939056253876, 0.04763510891689406, 0...."
8,5101 S Wentworth Ave,"[0.02976577829133368, 0.10223400971572355, 0.1..."
9,7040 S Cottage Grove Ave,"[0.06664960341922221, 0.14188469236001355, 0.1..."


In [20]:
from numpy import sin, cos, arccos, pi, round

def rad2deg(radians):
    degrees = radians * 180 / pi
    return degrees

def deg2rad(degrees):
    radians = degrees * pi / 180
    return radians

def getDistanceBetweenPointsNew(latitude1, longitude1, latitude2, longitude2, unit = 'kilometers'):
    
    theta = longitude1 - longitude2
    
    distance = 60 * 1.1515 * rad2deg(
        arccos(
            (sin(deg2rad(latitude1)) * sin(deg2rad(latitude2))) + 
            (cos(deg2rad(latitude1)) * cos(deg2rad(latitude2)) * cos(deg2rad(theta)))
        )
    )
    
    if unit == 'miles':
        return round(distance, 2)
    if unit == 'kilometers':
        return round(distance * 1.609344, 2)

In [21]:
import math


d_distances = stations


distances = []

for i, row in d_distances.iterrows():
    d = []
    lon_i = stations['longitude'][i]
    lat_i = stations['latitude'][i] 
    coord_i = [lat_i, lon_i]                  # latitude comes before longitude
    for j in range(23):
        lon_j = stations['longitude'][j]
        lat_j = stations['latitude'][j]
        coord_j = [lat_j, lon_j]
        d.append(getDistanceBetweenPointsNew(lat_i, lon_i, lat_j, lon_j)) # Using formula in https://es.martech.zone/calculate-great-circle-distance/
    distances.append(d)

# otros enlaces:

## https://www.way.com/es/blog/%C2%BFQu%C3%A9-tan-r%C3%A1pidos-son-los-coches-de-polic%C3%ADa%3F/
## https://www.motorpasion.com/ford/ford-explorer-el-coche-de-policia-mas-vendido-en-estados-unidos


d_distances['distances']= distances
d_distances.drop(columns=["community_area", "longitude", "latitude"], inplace=True)
d_distances


  arccos(


Unnamed: 0,station_name,distances
0,3510 S Michigan Ave,"[0.0, 8.23, 13.18, 17.52, 15.93, 19.21, 15.3, ..."
1,1160 N Larrabee St,"[8.23, 0.0, 4.96, 9.42, 23.64, 10.98, 10.26, 5..."
2,850 W Addison St,"[13.18, 4.96, 0.0, 4.95, 28.5, 6.05, 9.96, 10...."
3,5400 N Lincoln Ave,"[17.52, 9.42, 4.95, 0.0, 32.1, 2.87, 9.06, 14...."
4,1900 W Monterey Ave,"[15.93, 23.64, 28.5, 32.1, 0.0, 34.28, 26.51, ..."
5,6464 N Clark St,"[19.21, 10.98, 6.05, 2.87, 34.28, 0.0, 11.92, ..."
6,5555 W Grand Ave,"[15.3, 10.26, 9.96, 9.06, 26.51, 11.92, 0.0, 1..."
7,1718 S State St,"[3.09, 5.16, 10.1, 14.52, 18.87, 16.14, 13.26,..."
8,5101 S Wentworth Ave,"[3.27, 11.33, 16.28, 20.42, 12.67, 22.27, 17.1..."
9,7040 S Cottage Grove Ave,"[7.29, 15.53, 20.48, 24.77, 9.83, 26.51, 21.48..."


### Criminality index

In [22]:
new_community.head()

Unnamed: 0,ID,community_area,population,area,neighbours,ID_ca
0,1,ROGERS PARK,55628,4.77,"[1, 2, 77]",1
1,2,WEST RIDGE,77122,9.14,"[1, 2, 4, 13, 77]",2
2,3,UPTOWN,57182,6.01,"[3, 4, 5, 6, 77]",3
3,4,LINCOLN SQUARE,40494,6.63,"[2, 3, 4, 5, 6, 13, 14, 16, 77]",4
4,5,NORTH CENTER,35114,5.31,"[3, 4, 5, 6, 7, 14, 16, 21, 22]",5


In [32]:
crimes_per_community = crimes.groupby("community_area").size()
crimes_per_community = crimes_per_community.reset_index()
crimes_per_community.columns = ['community_area', 'criminality_index']
crimes_per_community

Unnamed: 0,community_area,criminality_index
0,ALBANY PARK,2142
1,ARCHER HEIGHTS,988
2,ARMOUR SQUARE,1146
3,ASHBURN,1988
4,AUBURN GRESHAM,6405
...,...,...
72,WEST LAWN,1561
73,WEST PULLMAN,3326
74,WEST RIDGE,3785
75,WEST TOWN,7105


In [35]:
R_criminalities = crimes_per_community.merge(new_community, on="community_area")
R_criminalities.drop(columns=["population", "area", "neighbours"], inplace=True)
R_criminalities = R_criminalities.sort_values("ID_CA").reset_index()
R_criminalities

Unnamed: 0,index,community_area,criminality_index,ID_CA
0,60,ROGERS PARK,4000,1
1,74,WEST RIDGE,3785,2
2,66,UPTOWN,3812,3
3,39,LINCOLN SQUARE,2064,4
4,51,NORTH CENTER,1323,5
...,...,...,...,...
72,67,WASHINGTON HEIGHTS,2870,73
73,46,MOUNT GREENWOOD,528,74
74,45,MORGAN PARK,1785,75
75,56,OHARE,1643,76


## Organizing new data

### Police stations

In [57]:
police_stations = new_stations.merge(D_distances, on="station_name")
police_stations = police_stations.merge(d_distances, on="station_name")
police_stations.rename(columns={"distances_x":"D_distances", "distances_y":"d_distances"})

Unnamed: 0,station_name,community_area,longitude,latitude,ID_ps,D_distances,d_distances
0,3510 S Michigan Ave,DOUGLAS,-87.623395,41.830702,1,"[2092.19, 2103.08, 1560.8, 1750.1, 1439.87, 13...","[0.0, 8.23, 13.18, 17.52, 15.93, 19.21, 15.3, ..."
1,1160 N Larrabee St,NEAR NORTH SIDE,-87.643352,41.903242,2,"[1217.56, 1283.21, 686.17, 930.23, 662.57, 467...","[8.23, 0.0, 4.96, 9.42, 23.64, 10.98, 10.26, 5..."
2,850 W Addison St,LAKE VIEW,-87.651512,41.9474,3,"[784.26, 885.85, 252.87, 540.93, 298.23, 106.0...","[13.18, 4.96, 0.0, 4.95, 28.5, 6.05, 9.96, 10...."
3,5400 N Lincoln Ave,LINCOLN SQUARE,-87.692845,41.97955,4,"[478.94, 346.67, 482.46, 132.04, 511.79, 642.4...","[17.52, 9.42, 4.95, 0.0, 32.1, 2.87, 9.06, 14...."
4,1900 W Monterey Ave,MORGAN PARK,-87.66852,41.691435,5,"[3025.14, 3036.02, 2493.75, 2683.05, 2372.81, ...","[15.93, 23.64, 28.5, 32.1, 0.0, 34.28, 26.51, ..."
5,6464 N Clark St,ROGERS PARK,-87.671324,41.999763,6,"[125.89, 311.05, 511.7, 415.38, 711.59, 740.61...","[19.21, 10.98, 6.05, 2.87, 34.28, 0.0, 11.92, ..."
6,5555 W Grand Ave,BELMONT CRAGIN,-87.765574,41.918609,7,"[1597.02, 1399.75, 1335.54, 1179.09, 1029.81, ...","[15.3, 10.26, 9.96, 9.06, 26.51, 11.92, 0.0, 1..."
7,1718 S State St,NEAR SOUTH SIDE,-87.627356,41.858373,8,"[1833.29, 1941.22, 1377.79, 1588.24, 1278.01, ...","[3.09, 5.16, 10.1, 14.52, 18.87, 16.14, 13.26,..."
8,5101 S Wentworth Ave,FULLER PARK,-87.63056,41.801811,9,"[2229.84, 2240.73, 1698.46, 1887.75, 1577.52, ...","[3.27, 11.33, 16.28, 20.42, 12.67, 22.27, 17.1..."
9,7040 S Cottage Grove Ave,WOODLAWN,-87.605748,41.766431,10,"[2596.72, 2607.61, 2065.33, 2254.63, 1944.4, 1...","[7.29, 15.53, 20.48, 24.77, 9.83, 26.51, 21.48..."


### Community areas

In [38]:
community_areas = new_community.merge(R_criminalities[["ID_CA", "criminality_index"]], on="ID_CA")
community_areas

Unnamed: 0,ID_CA,community_area,population,area,neighbours,criminality_index
0,1,ROGERS PARK,55628,4.77,"[1, 2, 77]",4000
1,2,WEST RIDGE,77122,9.14,"[1, 2, 4, 13, 77]",3785
2,3,UPTOWN,57182,6.01,"[3, 4, 5, 6, 77]",3812
3,4,LINCOLN SQUARE,40494,6.63,"[2, 3, 4, 5, 6, 13, 14, 16, 77]",2064
4,5,NORTH CENTER,35114,5.31,"[3, 4, 5, 6, 7, 14, 16, 21, 22]",1323
...,...,...,...,...,...,...
72,73,WASHINGTON HEIGHTS,25065,7.41,"[49, 71, 72, 73, 75]",2870
73,74,MOUNT GREENWOOD,18628,7.02,"[72, 74, 75]",528
74,75,MORGAN PARK,21186,8.55,"[49, 53, 72, 73, 74, 75]",1785
75,76,OHARE,13418,34.55,"[10, 17, 76]",1643


### Police areas

In [27]:
police_areas = {'police_area': [1, 2, 3, 4, 5], 'max_workload': [0, 0, 0, 0, 0]}

pd.DataFrame(police_areas)

Unnamed: 0,police_area,max_workload
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
