<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#constants---weights" data-toc-modified-id="constants---weights-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>constants - weights</a></span></li><li><span><a href="#filter-only-injured-(injury_severity-!=-0)" data-toc-modified-id="filter-only-injured-(injury_severity-!=-0)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>filter only injured (injury_severity != 0)</a></span><ul class="toc-item"><li><span><a href="#validate-severities" data-toc-modified-id="validate-severities-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>validate severities</a></span></li></ul></li><li><span><a href="#filter-ages-5-19-only" data-toc-modified-id="filter-ages-5-19-only-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>filter ages 5-19 only</a></span><ul class="toc-item"><li><span><a href="#validate-ages" data-toc-modified-id="validate-ages-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>validate ages</a></span></li></ul></li><li><span><a href="#unique-id-per-involved" data-toc-modified-id="unique-id-per-involved-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>unique id per involved</a></span></li><li><span><a href="#filter-pedastrians,-cyclists-or-electrical-vehicles" data-toc-modified-id="filter-pedastrians,-cyclists-or-electrical-vehicles-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>filter pedastrians, cyclists or electrical vehicles</a></span></li><li><span><a href="#check-vehicle_or_pedastrian" data-toc-modified-id="check-vehicle_or_pedastrian-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>check vehicle_or_pedastrian</a></span></li><li><span><a href="#see-schools-types-(no-kindergardens)" data-toc-modified-id="see-schools-types-(no-kindergardens)-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>see schools types (no kindergardens)</a></span></li><li><span><a href="#count-schools" data-toc-modified-id="count-schools-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>count schools</a></span><ul 
class="toc-item"><li><span><a href="#Create-squares" data-toc-modified-id="Create-squares-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>Create squares</a></span></li></ul></li><li><span><a href="#Get-values-of-past-5-years-WITH-MIKBATZ---June-2015---May-2020-and-calculate-formula-using--נוהל-פר״ת" data-toc-modified-id="Get-values-of-past-5-years-WITH-MIKBATZ---June-2015---May-2020-and-calculate-formula-using--נוהל-פר״ת-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Get values of past 5 years WITH MIKBATZ - June 2015 - May 2020 and calculate formula using  נוהל פר״ת</a></span></li><li><span><a href="#Get-values-of-past-5-years-WITHOUT-MIKBATZ---June-2015---May-2020-and-calculate-formula-using--נוהל-פר״ת" data-toc-modified-id="Get-values-of-past-5-years-WITHOUT-MIKBATZ---June-2015---May-2020-and-calculate-formula-using--נוהל-פר״ת-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Get values of past 5 years WITHOUT MIKBATZ - June 2015 - May 2020 and calculate formula using  נוהל פר״ת</a></span></li><li><span><a href="#Compare-cities" data-toc-modified-id="Compare-cities-11"><span class="toc-item-num">11&nbsp;&nbsp;</span>Compare cities</a></span></li></ul></div>

In [1]:
import pandas as pd
from collections import defaultdict
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
import os
import math

### constants - weights

In [2]:
# Relative weight of each injury severity in the score formula.
# 6600 + 956 + 25 == 7581, so the three weights sum to 1.
killed_weight, severe_weight, light_weight = (
    share / 7581 for share in (6600, 956, 25)
)

In [3]:
# One counter entry per directory entry in 'schools_data', keyed by its
# position in the listing.
# NOTE(review): because the key is the enumeration index, every count ends up
# being exactly 1, and nothing later in the visible notebook reads `schools`.
# Confirm whether this cell is still needed.
schools = defaultdict(int)
for position in range(len(os.listdir('schools_data'))):
    schools[position] += 1

In [4]:
# Read every file in 'schools_data' as CSV and stack them into one frame.
frames = []

# os.listdir returns names in arbitrary order. Sorting them keeps the row
# order of total_df reproducible between runs and machines, which matters
# because later cells take "the first row" of a group via .iloc[0].
for i, filename in enumerate(sorted(os.listdir('schools_data'))):
    if i % 100 == 0:
        print(i)  # coarse progress indicator, one line per 100 files
    frames.append(
        pd.read_csv(os.path.join('schools_data', filename), index_col=None, header=0)
    )

total_df = pd.concat(frames, axis=0, ignore_index=True)
# Parse once here so the date-range filters further down compare Timestamps.
total_df['accident_timestamp'] = pd.to_datetime(total_df['accident_timestamp'])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800


### filter only injured (injury_severity != 0)

In [5]:
# Keep only involved persons who were injured (injury_severity other than 0).
total_df = total_df[total_df['injury_severity'].ne(0)]

#### validate severities

In [6]:
# Sanity check: severity labels that remain after dropping injury_severity == 0.
print(total_df.injury_severity_hebrew.unique())

['פצוע קל' 'פצוע קשה' 'הרוג']


In [7]:
# Sanity check: numeric severity codes that remain (0 must not appear).
print(total_df.injury_severity.unique())

[3 2 1]


### filter ages 5-19 only

In [8]:
# Keep ages 5-19 only. The validation cells below show that age_group codes
# 2, 3 and 4 are the '05-09', '10-14' and '15-19' bands. Bounding the range on
# both sides (inclusive) makes the filter enforce the upper limit itself
# instead of relying on the input files not containing older age groups.
total_df = total_df.loc[total_df.age_group.between(2, 4)]

#### validate ages

In [9]:
# Sanity check: age-band labels that remain after the age filter.
print(total_df.age_group_hebrew.unique())

['15-19' '05-09' '10-14']


In [10]:
# Sanity check: numeric age_group codes that remain after the age filter.
print(total_df.age_group.unique())

[4 2 3]


### unique id per involved

In [11]:
# Inspect the element type of provider_and_id; it is cast to str when the
# per-person id is built in the next cell.
type(total_df['provider_and_id'].iloc[0])

int

In [12]:
# Build one id per involved person: "<provider_and_id>_<involve_id>".
# total_df is the result of earlier .loc filters, so assigning a column onto
# it in place can raise SettingWithCopyWarning; .assign returns a new frame
# with the extra column and avoids that.
total_df = total_df.assign(
    inv_unique_id=total_df['provider_and_id'].astype(str) + '_' + total_df['involve_id'].astype(str)
)


### filter pedastrians, cyclists or electrical vehicles

In [13]:
# Keep rows whose injured_type is 1 or whose vehicle type is 21, 23 or 15.
# NOTE(review): judging by the section title and the labels printed further
# down, these codes presumably stand for pedestrians, bicycles, electric
# bicycles and electric scooters; confirm the code-to-label mapping.
injured_type_selected = total_df['injured_type'] == 1
vehicle_type_selected = total_df['involve_vehicle_type'].isin([21, 23, 15])
total_df = total_df.loc[injured_type_selected | vehicle_type_selected]

In [14]:
# Single label per row: the injured-type label for pedestrians ('הולך רגל'),
# otherwise the label of the vehicle the person was using.
# Done column-wise instead of a row-by-row apply(axis=1), and through .assign
# so no column is written onto a filtered slice of the previous frame.
is_pedestrian = total_df['injured_type_hebrew'] == 'הולך רגל'
total_df = total_df.assign(
    vehicle_or_pedastrian=total_df['involve_vehicle_type_hebrew'].mask(
        is_pedestrian, total_df['injured_type_hebrew']
    )
)


### check vehicle_or_pedastrian

In [15]:
# List the distinct road-user labels produced by the previous cell.
total_df['vehicle_or_pedastrian'].unique()

array(['אופניים', 'הולך רגל', 'אופניים חשמליים', 'קורקינט חשמלי'],
      dtype=object)

In [16]:
# Rows and columns left after all the filters above.
total_df.shape

(58133, 168)

### see schools types (no kindergardens)

In [17]:
# List the institution types present in the filtered data.
total_df.institution_type.unique()

array(['בית ספר', 'תלמוד תורה', 'ישיבה קטנה', 'בי"ס תורני',
       'ישיבה תיכונית', 'בי"ס חקלאי', 'בי"ס רפואי', 'בי"ס כנסייתי',
       'אולפנה', 'בי"ס אקסטרני', 'בי"ס קיבוצי',
       'תלמוד תורה ליד מעיין חינוך התורני', 'בי"ס מושבי'], dtype=object)

### count schools

In [18]:
# Number of distinct schools that have at least one row in the filtered data.
total_df.school_id.nunique()

3409

#### Create squares

In [19]:
def get_bounding_box(latitude, longitude, distance_in_km):
    """Return the lat/lon limits of a box around a point.

    The box extends ``distance_in_km`` from the point in each of the four
    directions, so its sides are twice that long.

    Args:
        latitude: latitude of the centre, in degrees.
        longitude: longitude of the centre, in degrees.
        distance_in_km: distance from the centre to each edge, in kilometres.

    Returns:
        Tuple ``(lat_min, lon_min, lat_max, lon_max)`` in degrees.
    """
    earth_radius_km = 6371
    lat_rad = math.radians(latitude)
    lon_rad = math.radians(longitude)

    # Angular half-extents in radians. East-west distance is measured along
    # the parallel through the point, whose radius shrinks with cos(latitude).
    lat_delta = distance_in_km / earth_radius_km
    lon_delta = distance_in_km / (earth_radius_km * math.cos(lat_rad))

    limits_rad = (
        lat_rad - lat_delta,
        lon_rad - lon_delta,
        lat_rad + lat_delta,
        lon_rad + lon_delta,
    )
    return tuple(math.degrees(angle) for angle in limits_rad)

In [20]:
# Centre points, as (latitude, longitude), of the school clusters that are
# scored as a single unit.
all_centers = [
    (32.0419758, 34.7551301),  # yaffa
    (31.7933933, 35.2206836),  # jerusalem
]

In [21]:
# Score each hand-picked cluster ("mikbatz") of schools as one unit.
#
# For every centre in all_centers a box is drawn around it (0.5 km in each
# direction, i.e. about 1 km x 1 km). Schools located inside the box are
# recorded so the per-school pass can skip them. Injured persons whose
# accident location falls inside the box are scored with
#   (killed * killed_weight + severe * severe_weight + light * light_weight)
#       * number of distinct accidents
# and the result is stored in `analysis` under a negative id (presumably so it
# cannot collide with a real school_id).
fake_id = -1
analysis = defaultdict(dict)
all_schools_to_exclude = []
schools_to_exclude_lists = []

# Neither input depends on the cluster centre, so build each once instead of
# once per iteration.
school_locations = (
    total_df.loc[:, ['school_id', 'school_longitude', 'school_latitude']]
    .drop_duplicates()
    .to_dict(orient='records')
)

# Half-open window [2015-06-01, 2020-06-01). Comparing with
# `<= Timestamp('2020-05-31')` stops at midnight and would drop every accident
# that happened later during 31 May 2020.
period_start = pd.Timestamp('2015-06-01')
period_end_exclusive = pd.Timestamp('2020-06-01')
in_period = (total_df.accident_timestamp >= period_start) & (total_df.accident_timestamp < period_end_exclusive)

result_columns = ['inv_unique_id', 'injury_severity', 'provider_and_id', 'school_yishuv_name']
injured_records = (
    total_df.loc[in_period, result_columns + ['longitude', 'latitude']]
    .to_dict(orient='records')
)

for center_lat, center_lon in all_centers:
    lat_min, lon_min, lat_max, lon_max = get_bounding_box(center_lat, center_lon, 0.5)
    # Polygon vertices are (x, y) == (longitude, latitude).
    corners = [(lon_min, lat_min),
               (lon_min, lat_max),
               (lon_max, lat_max),
               (lon_max, lat_min)]
    poly = Polygon(corners)
    print(corners)

    schools_in_1km_box = [
        r['school_id'] for r in school_locations
        if poly.contains(Point(r['school_longitude'], r['school_latitude']))
    ]
    all_schools_to_exclude += schools_in_1km_box
    schools_to_exclude_lists.append(schools_in_1km_box)

    injured_in_1km_box = [
        tuple(r[column] for column in result_columns)
        for r in injured_records
        if poly.contains(Point(r['longitude'], r['latitude']))
    ]
    # Passing `columns=` keeps the frame well-formed even when nobody was
    # injured inside the box (assigning .columns afterwards raises on an
    # empty frame).
    tmp_df = pd.DataFrame(injured_in_1km_box, columns=result_columns)

    # The same person can appear on several rows, hence nunique().
    # Severity codes: 1 = killed, 2 = severely injured, 3 = lightly injured.
    killed = tmp_df.loc[tmp_df['injury_severity'] == 1, 'inv_unique_id'].nunique()
    severe_injured = tmp_df.loc[tmp_df['injury_severity'] == 2, 'inv_unique_id'].nunique()
    light_injured = tmp_df.loc[tmp_df['injury_severity'] == 3, 'inv_unique_id'].nunique()
    total_accidents = tmp_df.provider_and_id.nunique()
    score = (killed * killed_weight + severe_injured * severe_weight + light_injured * light_weight) * (total_accidents)

    # The locality name is taken from the first injured record in the box; an
    # empty box has no record to take it from, so it gets an empty name.
    school_yishuv_name = tmp_df.school_yishuv_name.iloc[0] if len(tmp_df) else ''

    # NOTE(review): there is no separator between the locality name and
    # 'מקבץ', so the two words run together in the output; left unchanged
    # because the value is written to the CSV.
    analysis[fake_id]['score'] = score
    analysis[fake_id]['school_name'] = school_yishuv_name + 'מקבץ בתי״ס ' + str(-fake_id)
    analysis[fake_id]['school_yishuv_name'] = school_yishuv_name
    analysis[fake_id]['killed'] = killed
    analysis[fake_id]['severe_injured'] = severe_injured
    analysis[fake_id]['light_injured'] = light_injured
    analysis[fake_id]['total_accidents'] = total_accidents
    analysis[fake_id]['school_longitude'] = center_lon
    analysis[fake_id]['school_latitude'] = center_lat
    fake_id -= 1

[(34.749825367050114, 32.03747919197041), (34.749825367050114, 32.0464724080296), (34.76043483294989, 32.0464724080296), (34.76043483294989, 32.03747919197041)]
[(35.21539318317766, 31.788896691970407), (35.21539318317766, 31.797889908029596), (35.22597401682234, 31.797889908029596), (35.22597401682234, 31.788896691970407)]


### Get values of past 5 years WITH MIKBATZ - June 2015 - May 2020 and calculate formula using  נוהל פר״ת

In [22]:
# Severity weights for the score formula (same values as the constants cell
# at the top of the notebook). 6600 + 956 + 25 == 7581, so they sum to 1.
killed_weight, severe_weight, light_weight = [
    part / 7581 for part in (6600, 956, 25)
]

In [23]:
# Score every school that is not part of a cluster and add it to `analysis`,
# next to the cluster rows already stored there.
#   score = (killed * killed_weight + severe * severe_weight + light * light_weight)
#           * number of distinct accidents in the period
# NOTE: the last line replaces the `analysis` dict with a DataFrame, so this
# cell cannot be re-run on its own; re-run the cluster cell first.

# Half-open window [2015-06-01, 2020-06-01). `<= Timestamp('2020-05-31')`
# stops at midnight and would drop accidents later on 31 May 2020.
period_start = pd.Timestamp('2015-06-01')
period_end_exclusive = pd.Timestamp('2020-06-01')
# Set membership instead of scanning the list once per school.
excluded_school_ids = set(all_schools_to_exclude)

schools_df = total_df.copy()
# One grouped pass instead of a full boolean scan per school. sort=False keeps
# schools in order of first appearance, like iterating school_id.unique().
for school_id, school_df in schools_df.groupby('school_id', sort=False):
    if school_id in excluded_school_ids:
        continue
    df = school_df.loc[(school_df.accident_timestamp >= period_start) & (school_df.accident_timestamp < period_end_exclusive)]
    # The same person can appear on several rows, hence nunique().
    killed = df.loc[df.injury_severity_hebrew == 'הרוג'].inv_unique_id.nunique()
    severe_injured = df.loc[df.injury_severity_hebrew == 'פצוע קשה'].inv_unique_id.nunique()
    light_injured = df.loc[df.injury_severity_hebrew == 'פצוע קל'].inv_unique_id.nunique()
    total_accidents = df.provider_and_id.nunique()
    score = (killed * killed_weight + severe_injured * severe_weight + light_injured * light_weight) * (total_accidents)
    # Name and location come from the school's first row (all dates), so a
    # school with no accidents in the period is still listed with score 0.
    analysis[school_id]['score'] = score
    analysis[school_id]['school_name'] = school_df.school_name.iloc[0]
    analysis[school_id]['school_yishuv_name'] = school_df.school_yishuv_name.iloc[0]
    analysis[school_id]['killed'] = killed
    analysis[school_id]['severe_injured'] = severe_injured
    analysis[school_id]['light_injured'] = light_injured
    # Stored so school rows fill the same total_accidents column the cluster
    # rows already have (it was left empty for schools before).
    analysis[school_id]['total_accidents'] = total_accidents
    analysis[school_id]['school_longitude'] = school_df.school_longitude.iloc[0]
    analysis[school_id]['school_latitude'] = school_df.school_latitude.iloc[0]
# Rows = school / cluster ids, columns = the fields stored above.
analysis = pd.DataFrame(analysis).T

In [24]:
# Rank schools and clusters from highest to lowest score.
analysis = analysis.sort_values(by='score', ascending=False)

In [25]:
# Write the ranked table (clusters plus the remaining individual schools) to disk.
analysis.to_csv('schools_01_06_15__31_05_20_mikbatz.csv')

In [26]:
# Number of schools that fell inside each cluster box, in all_centers order.
list(map(len, schools_to_exclude_lists))

[5, 56]

In [27]:
# Distinct schools excluded overall; equals the sum of the per-cluster sizes
# only when no school falls inside more than one box.
len(set(all_schools_to_exclude))

61

### Get values of past 5 years WITHOUT MIKBATZ - June 2015 - May 2020 and calculate formula using  נוהל פר״ת

In [35]:
# Start from an empty accumulator: the previous section turned `analysis`
# into a DataFrame that also contains the cluster rows.
analysis = defaultdict(dict)

In [36]:
# Score every school individually (no clusters, nothing excluded).
#   score = (killed * killed_weight + severe * severe_weight + light * light_weight)
#           * number of distinct accidents in the period
# NOTE: the last line replaces the `analysis` dict with a DataFrame, so the
# cell that resets `analysis` must be run again before re-running this one.

# Half-open window [2015-06-01, 2020-06-01). `<= Timestamp('2020-05-31')`
# stops at midnight and would drop accidents later on 31 May 2020.
period_start = pd.Timestamp('2015-06-01')
period_end_exclusive = pd.Timestamp('2020-06-01')

schools_df = total_df.copy()
# One grouped pass instead of a full boolean scan per school. sort=False keeps
# schools in order of first appearance, like iterating school_id.unique().
for school_id, school_df in schools_df.groupby('school_id', sort=False):
    df = school_df.loc[(school_df.accident_timestamp >= period_start) & (school_df.accident_timestamp < period_end_exclusive)]
    # The same person can appear on several rows, hence nunique().
    killed = df.loc[df.injury_severity_hebrew == 'הרוג'].inv_unique_id.nunique()
    severe_injured = df.loc[df.injury_severity_hebrew == 'פצוע קשה'].inv_unique_id.nunique()
    light_injured = df.loc[df.injury_severity_hebrew == 'פצוע קל'].inv_unique_id.nunique()
    total_accidents = df.provider_and_id.nunique()
    score = (killed * killed_weight + severe_injured * severe_weight + light_injured * light_weight) * (total_accidents)
    # Name and location come from the school's first row (all dates), so a
    # school with no accidents in the period is still listed with score 0.
    analysis[school_id]['score'] = score
    analysis[school_id]['school_name'] = school_df.school_name.iloc[0]
    analysis[school_id]['school_yishuv_name'] = school_df.school_yishuv_name.iloc[0]
    analysis[school_id]['killed'] = killed
    analysis[school_id]['severe_injured'] = severe_injured
    analysis[school_id]['light_injured'] = light_injured
    # Keep the multiplier used in the score next to the casualty counts, as
    # the cluster rows of the "with mikbatz" table do.
    analysis[school_id]['total_accidents'] = total_accidents
    analysis[school_id]['school_longitude'] = school_df.school_longitude.iloc[0]
    analysis[school_id]['school_latitude'] = school_df.school_latitude.iloc[0]
# Rows = school ids, columns = the fields stored above.
analysis = pd.DataFrame(analysis).T

In [37]:
# Rank schools from highest to lowest score.
analysis = analysis.sort_values(by='score', ascending=False)

In [38]:
# Write the ranked per-school table (no clusters) to disk.
analysis.to_csv('schools_01_06_15__31_05_20_no_mikbatz.csv')

### Compare cities

In [32]:
# Score every locality (school_yishuv_name) over the same five-year window.
# NOTE(review): here the weighted sum is multiplied by the number of injured
# persons (killed + severe + light), whereas the per-school scores multiply by
# the number of distinct accidents. Kept as is; confirm it is intentional.
yishuv_analysis = defaultdict(dict)

# Half-open window [2015-06-01, 2020-06-01). `<= Timestamp('2020-05-31')`
# stops at midnight and would drop accidents later on 31 May 2020.
period_start = pd.Timestamp('2015-06-01')
period_end_exclusive = pd.Timestamp('2020-06-01')

schools_df = total_df.copy()
# One grouped pass instead of a full boolean scan per locality. sort=False
# keeps localities in order of first appearance.
for school_yishuv_name, yishuv_df in schools_df.groupby('school_yishuv_name', sort=False):
    df = yishuv_df.loc[(yishuv_df.accident_timestamp >= period_start) & (yishuv_df.accident_timestamp < period_end_exclusive)]
    # The same person can appear on several rows, hence nunique().
    killed = df.loc[df.injury_severity_hebrew == 'הרוג'].inv_unique_id.nunique()
    severe_injured = df.loc[df.injury_severity_hebrew == 'פצוע קשה'].inv_unique_id.nunique()
    light_injured = df.loc[df.injury_severity_hebrew == 'פצוע קל'].inv_unique_id.nunique()
    score = (killed * killed_weight + severe_injured * severe_weight + light_injured * light_weight) * (killed + severe_injured + light_injured)
    yishuv_analysis[school_yishuv_name]['score'] = score
    yishuv_analysis[school_yishuv_name]['school_yishuv_name'] = school_yishuv_name
    yishuv_analysis[school_yishuv_name]['killed'] = killed
    yishuv_analysis[school_yishuv_name]['severe_injured'] = severe_injured
    yishuv_analysis[school_yishuv_name]['light_injured'] = light_injured
# Rows = locality names, columns = the fields stored above.
yishuv_analysis = pd.DataFrame(yishuv_analysis).T


In [34]:
# Show the row for a single locality (Bnei Brak).
yishuv_analysis[yishuv_analysis.school_yishuv_name == 'בני ברק']

Unnamed: 0,score,school_yishuv_name,killed,severe_injured,light_injured
בני ברק,2460.92,בני ברק,2,26,367
