In [45]:
# imports
import pandas as pd
import const

In [46]:
# load data from xlsx
original_all_df = pd.read_excel('all.xlsx')
all_df = original_all_df.copy()
original_mayors_df = pd.read_excel('mayors.xlsx')
mayors_df = original_mayors_df.copy()

# load the new column names from the excel file
cols_df = pd.read_excel('new_cols_names.xlsx')

# Create a dictionary to map Hebrew column names to the new English names
col_mapping = dict(zip(cols_df['col_name_hebrew'], cols_df['new_col_name']))

# Rename the columns in all_df using the mapping
all_df.rename(columns=col_mapping, inplace=True)

## deal with "null" vals

In [47]:
all_df.district = all_df.district.str.replace('-', 'אזור יהודה והשומרון').fillna('אזור יהודה והשומרון')

all_df = all_df.replace('-', None).replace('..', None).replace('ללא דירוג', None)

for col in const.numeric_cols:
    all_df[col] = all_df[col].astype(float)

## map hebrew categories to categorial vals

In [48]:
disrict_names_mapping = {n: i for i, n in enumerate(all_df.district.unique())}
all_df.district = all_df.district.map(disrict_names_mapping)

In [49]:
# map values

mayors_df.location = mayors_df.location.replace(
    {
        'תל אביב': 'תל אביב -יפו',
        'נהריה': 'נהרייה',
    }
)
mayors_df.incident_type = mayors_df.incident_type.replace(
    {
        'הרשעה': 'conviction', 
        'הגשת כתב אישום': 'indictment filed', 
        'מעצר': 'arrest'
    }
)

# filter arab authorities

In [50]:
arabs_authority_code_l = all_df[all_df['arabs'] == 100].drop_duplicates(subset=['authority_code'])['authority_code'].tolist()
all_df = all_df[~all_df['authority_code'].isin(arabs_authority_code_l)]

## refine numeric columns

In [60]:
def percentage_cols_converter(df):
    for col_name in df.columns:
        if 'percent' in col_name.lower():
            # check if percentage and if not convert to None
            df[col_name] = df[col_name].apply(lambda x: x if isinstance(x, (int, float)) else -1)
            df[col_name] = df[col_name].apply(lambda x: x if 0.0 <= x <= 100.0 else None)
    return df

def numeric_cols_converter(df):
    # Assuming 'new_cols_names.xlsx' contains 'new_col_name' and 'col_type' columns
    cols_df = pd.read_excel('new_cols_names.xlsx')
    col_type_mapping = dict(zip(cols_df['new_col_name'], cols_df['col_type']))

    for col_name in df.columns:
        # Ensure the column is in the mapping before converting
        if col_name in col_type_mapping:
            if col_type_mapping[col_name] in ['int', 'float']:
                # Convert to numeric, coerce invalid values to NaN
                df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
            elif col_type_mapping[col_name] == 'category':
                # Handle district similarly with numeric coercion and Int64 casting
                df[col_name] = df[col_name].astype('int')
            else:
                print(f"Unknown column type: {col_type_mapping[col_name]}")
    
    return df

# district
# distance_from_tel_aviv
# year_of_municipal_status

In [61]:
all_df = percentage_cols_converter(all_df)
all_df = numeric_cols_converter(all_df)

Unknown column type: str


In [72]:
all_df['authority_code'].unique().shape

(130,)

In [75]:
all_df.query('authority_code == 5000')[['distance_from_tel_aviv']]
all_df.query('authority_code == 5000')[['year_of_municipal_status']]

Unnamed: 0,year_of_municipal_status
63,1934
275,1934
488,1934
839,1934
1050,1934
1249,1934
1446,1934
1730,1934
1845,1934
2040,1934


In [76]:
all_df.groupby('authority_code')['distance_from_tel_aviv'].mean().reset_index()

Unnamed: 0,authority_code,distance_from_tel_aviv
0,26,134.378618
1,28,18.826292
2,29,146.825426
3,31,86.278855
4,41,30.265086
...,...,...
125,9400,1.663408
126,9500,88.864743
127,9600,88.195595
128,9700,5.819067


In [None]:
all_df[['authority_code', ]].groupby('authority_code')['distance_from_tel_aviv'].mean().reset_index()

### save df

In [25]:
# all_df.to_csv('all_cleaned.csv', index=False)

In [42]:
all_df[all_df['arabs'] == 100].drop_duplicates(subset=['authority_code'])

Unnamed: 0,authority_name,year,authority_code,district,distance_from_tel_aviv,year_of_municipal_status,council_members_number,socioeconomic_level,total_population,jews_and_others,...,commerce_percent_area,industry_infrastructure_percent_area,transportation_percent_area,agricultural_buildings_percent_area,public_open_space_percent_area,forest_woodland_percent_area,orchards_citrus_olive_percent_area,cultivated_fields_percent_area,other_open_space_percent_area,population_density
630,אבו גוש,2002,472,5,,1992,9.0,3.0,5.2,,...,,,,,,,,,,
631,אבו סנאן,2002,473,4,,1964,11.0,3.0,10.8,,...,,,,,,,,,,
635,אכסאל,2002,478,4,,,11.0,3.0,10.3,,...,,,,,,,,,,
640,אעבלין,2002,529,4,,1960,11.0,3.0,10.0,,...,,,,,,,,,,
643,בועיינה-נוג'ידאת,2002,482,4,,1980,11.0,2.0,7.1,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,נצרת,2003,7300,4,,1877,19.0,3.0,62.7,,...,1.445957,2.930018,0.199846,0.167448,0.691486,15.165864,2.423931,6.418476,34.251467,13321.766642
814,סח'נין,2003,7500,4,,1995,15.0,2.0,23.2,,...,0.020967,2.279733,0.007568,0.08647,0.041442,76.888655,0,0,0.000000,11868.30804
820,קלנסווה,2003,638,6,,2000,13.0,2.0,15.9,,...,0,0.169437,0.007427,16.41382,0,0,2.419313,41.015816,16.047114,8124.496624
831,רהט,2003,1161,1,,1994,17.0,1.0,35.8,,...,0.101516,1.952782,0.001596,0,0.096427,1.087323,0.658468,50.382188,31.631868,14264.577444


# join mayors

In [26]:
muni_names_to_code_mapping = pd.merge(
        mayors_df[['location']].drop_duplicates().reset_index(drop=True),
        all_df[['authority_name', 'authority_code']].drop_duplicates(),
        left_on='location', right_on='authority_name',
        how='left'
    ).dropna().reset_index(drop=True).drop(columns='authority_name')

muni_names_to_code_mapping.authority_code = muni_names_to_code_mapping.authority_code.astype(int)
muni_names_to_code_mapping

Unnamed: 0,location,authority_code
0,קריית גת,2630
1,כפר סבא,6900
2,רמת גן,8600
3,נצרת עילית,1061
4,צפת,8000
5,תל אביב -יפו,5000
6,ירושלים,3000
7,בת ים,6200
8,קריית מלאכי,1034
9,נהרייה,9100


In [27]:
# save mapping 
# muni_names_to_code_mapping.to_csv('muni_names_to_code_mapping.csv', index=False)

In [28]:
df_mayors = pd.merge(
    mayors_df,
    muni_names_to_code_mapping,
    on='location',
    how='inner'
)

In [30]:
df_mayors['incident_year'] = df_mayors['date'].apply(lambda x: str(x).split('-')[0])
df_mayors = df_mayors[['authority_code', 'name', 'incident_year', 'incident_type']]
df_mayors

Unnamed: 0,authority_code,name,incident_year,incident_type
0,2630,אלברט ארז,2003,ביצוע העברה
1,2630,אלברט ארז,2009,conviction
2,6900,יהודה בן חמו,2017,arrest
3,6900,יהודה בן חמו,2018,indictment filed
4,6900,יהודה בן חמו,2021,conviction
5,8600,צבי בר,2008,חקירה
6,8600,צבי בר,2013,indictment filed
7,8600,צבי בר,2015,conviction
8,1061,שמעון גפסו,2013,arrest
9,1061,שמעון גפסו,2013,indictment filed


In [34]:
df_mayors = df_mayors[df_mayors.incident_type.isin(['conviction', 'indictment filed', 'arrest'])].reset_index(drop=True)

In [35]:
df_mayors[['incident_type', 'incident_year', 'authority_code']].drop_duplicates().sort_values(by='incident_year')

Unnamed: 0,incident_type,incident_year,authority_code
25,conviction,2003,2650
20,conviction,2005,6500
30,indictment filed,2006,7000
29,arrest,2006,7000
21,arrest,2007,6700
28,conviction,2008,8400
10,indictment filed,2008,5000
19,conviction,2008,9100
0,conviction,2009,2630
14,arrest,2009,6200


In [163]:
df_mayors.to_csv('mayors_cleaned.csv', index=False)

In [164]:
# merge the data with the mayors data
all_df = pd.merge(
    all_df,
    df_mayors,
    on=['authority_code'],
    how='left'
)
all_df['is_treatment'] = all_df['incident_year'].notnull().astype(int)
# all_df.query('is_treatment == 1')


In [165]:
all_df.to_csv('all_cleaned_merged.csv', index=False)

# Prepare df for matching

In [None]:
# load the new column names from the excel file
original_cols_df = pd.read_excel('new_cols_names.xlsx')
cols_df = original_cols_df.copy()

# is_matching_feature = cols_df['new_col_name'].
cols_df.matching = cols_df.matching.fillna(0).astype(bool)
cols_df.predicting = cols_df.predicting.fillna(0).astype(bool)

matching_cols = ['year', 'authority_code', 'is_treatment'] + cols_df[cols_df.matching]['new_col_name'].tolist()
matching_cols.remove('total_schools')
len(matching_cols)

In [207]:
matching_df = all_df[matching_cols].drop_duplicates().reset_index(drop=True)

In [208]:
# save matching df
matching_df.to_csv('matching_features_df.csv', index=False)

In [None]:
matching_df[matching_df.is_treatment == 1]