In [27]:
import geopandas as gpd
from geopandas import GeoDataFrame, GeoSeries
import calendar
from pandas import DataFrame
import glob

import warnings
warnings.filterwarnings(action='ignore')

# fields names
ped_level= 'ped_level'


### get time features and ped level from moovit data

In [3]:

cols_to_use =['Period','W', 'ID', 'morning', 'noon', 'afternoon','night']
moovit_data = [gpd.read_file(x)[cols_to_use] for x in glob.glob(r'output/moovit/*.shp') ]

In [10]:
# Calculate season predictor
month_name = [x for x in calendar.month_abbr]
for moovit_file in moovit_data:
    month = month_name.index(moovit_file['Period'][0][:3].lower().capitalize())
    if month in [12, 1, 2]:
        season = 1
    elif month in [3, 4, 5]:
        season = 2
    elif month in [6, 7, 8]:
        season = 3
    else:
        season = 0
    moovit_file['season'] =  season
    # Encode W columns
    moovit_file['day'] = 0 if moovit_file['W'][0]=='WeekDay'else 1
moovit_data

[     Period        W    ID  morning  noon  afternoon  night  season  day
 0     APR21  WeekDay     0        1     0          2      0       2    0
 1     APR21  WeekDay     1        4     4          4      3       2    0
 2     APR21  WeekDay     2        4     4          4      3       2    0
 3     APR21  WeekDay     3        1     9          0      2       2    0
 4     APR21  WeekDay     4        0     0          9      1       2    0
 ...     ...      ...   ...      ...   ...        ...    ...     ...  ...
 8536  APR21  WeekDay  8760        3     2          1      0       2    0
 8537  APR21  WeekDay  8761        0     0          0      0       2    0
 8538  APR21  WeekDay  8762        4     4          4      3       2    0
 8539  APR21  WeekDay  8763        2     3          2      3       2    0
 8540  APR21  WeekDay  8764        3     4          2      3       2    0
 
 [8541 rows x 9 columns],
      Period        W    ID  morning  noon  afternoon  night  season  day
 0     APR

In [13]:
# flat the data so each ped level would be in a new row
def rearrange_data(row):
    r"""
    :param row:
    :return:
    """
    id= row['ID']
    def add_row(day_part,ped_level_val):
        r"""
        if the ped level is not 9 add new line to the list with ID, day part and ped level
        :param day_part:
        :param ped_level:
        :return:
        """
        if ped_level!=9:
            data_for_ml.append([id,day_part,row['season'],row['day'],ped_level_val])
    [add_row(x,row[x+3]) for x in range(4)]

data_for_ml = []
_ = [moovit_file.apply(rearrange_data,axis=1) for moovit_file in moovit_data]
ml_data = DataFrame(data_for_ml,columns=['ID','day part','season','day',ped_level])
ml_data

Unnamed: 0,ID,day part,season,day,ped_level
0,0,0,2,0,1
1,0,1,2,0,0
2,0,2,2,0,2
3,0,3,2,0,0
4,1,0,2,0,4
...,...,...,...,...,...
474545,8763,2,0,1,0
474546,8763,3,0,1,1
474547,8764,0,0,1,0
474548,8764,1,0,1,2


In [18]:
# join street attributes to the rest of the data (lable and time)
network_data = gpd.read_file(r'output/streets_elements/streets_elements.shp')

result = ml_data.set_index('ID').join(network_data, how='inner')
ml_result = result.drop(columns=['oidrechov','length','geometry'])
# move column 'B' to the end of the dataframe
col_to_move = ml_result.pop(ped_level)
ml_df = ml_result.assign(ped_level=col_to_move)
ml_df

Unnamed: 0,day part,season,day,buildings,businesses,educationa,Health_ser,Leisure_am,Playground,Sport_faci,synagogues,bus_statio,lighting,trees,bike_trail,parks,density,ped_level
0,0,2,0,12.580000,0.000000,8,1,0,7,3,4,0.030481,0.060961,0.000000,0,12,3.3,1
0,1,2,0,12.580000,0.000000,8,1,0,7,3,4,0.030481,0.060961,0.000000,0,12,3.3,0
0,2,2,0,12.580000,0.000000,8,1,0,7,3,4,0.030481,0.060961,0.000000,0,12,3.3,2
0,3,2,0,12.580000,0.000000,8,1,0,7,3,4,0.030481,0.060961,0.000000,0,12,3.3,0
0,0,2,1,12.580000,0.000000,8,1,0,7,3,4,0.030481,0.060961,0.000000,0,12,3.3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8750,3,0,0,16.378889,0.001021,4,2,0,2,7,0,0.006125,0.042872,0.103097,1,6,1.4,4
8750,0,0,1,16.378889,0.001021,4,2,0,2,7,0,0.006125,0.042872,0.103097,1,6,1.4,4
8750,1,0,1,16.378889,0.001021,4,2,0,2,7,0,0.006125,0.042872,0.103097,1,6,1.4,4
8750,2,0,1,16.378889,0.001021,4,2,0,2,7,0,0.006125,0.042872,0.103097,1,6,1.4,4


In [23]:
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

clf = DecisionTreeClassifier(random_state=0)
x_tree = ml_df.drop(columns=[ped_level]).to_numpy()
y_tree = ml_df[ped_level].to_numpy()

dt = clf.fit(x_tree, y_tree)
model = SelectFromModel(dt, prefit=True)
X_new = model.transform(x_tree)


In [24]:
X_new

array([[ 0.        ,  2.        , 12.58      ,  0.06096149],
       [ 1.        ,  2.        , 12.58      ,  0.06096149],
       [ 2.        ,  2.        , 12.58      ,  0.06096149],
       ...,
       [ 1.        ,  0.        , 16.37888889,  0.04287194],
       [ 2.        ,  0.        , 16.37888889,  0.04287194],
       [ 3.        ,  0.        , 16.37888889,  0.04287194]])

In [25]:
# Cross Validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [None]:
# Random Forest Tree
X_train, X_test, y_train, y_test = train_test_split(x_tree, y_tree, test_size=0.2, random_state=0)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
param_grid = {"n_estimators": [10, 100,1000],"max_features": ['sqrt', 'auto'],'max_features':['auto','sqrt','log2',None]}
clf = DecisionTreeClassifier(random_state=0)
search = HalvingGridSearchCV(clf, param_grid, resource='n_samples',random_state=0).fit(X_train, y_train)

# Confusion Matrix
y_pred =search.predict(X_test)
cm= confusion_matrix(y_test,y_pred)
cm

In [34]:
# Decision Tree
X_train, X_test, y_train, y_test = train_test_split(x_tree, y_tree, test_size=0.2, random_state=0)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
param_grid = {"criterion": ["gini", "entropy"],"splitter": ['best', 'random'],'max_features':['auto','sqrt','log2',None]}
clf = DecisionTreeClassifier(random_state=0)
search = HalvingGridSearchCV(clf, param_grid, resource='n_samples',random_state=0).fit(X_train, y_train)

# Confusion Matrix
y_pred =search.predict(X_test)
cm= confusion_matrix(y_test,y_pred)
cm

array([[13501,  5615,  2301,   619,   108],
       [ 6274,  6305,  4236,  1324,   252],
       [ 2640,  4616,  6993,  4029,   691],
       [  773,  1599,  4344,  7181,  3114],
       [  132,   250,   864,  3390, 13609]], dtype=int64)

In [35]:
search.best_score_

0.49550305380769577

In [29]:
search.best_params_

{'criterion': 'entropy', 'max_features': None, 'splitter': 'best'}

In [30]:
DataFrame(search.cv_results_).iloc[-2:]

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
22,2,379035,0.761634,0.021469,0.021659,0.005667,entropy,,best,"{'criterion': 'entropy', 'max_features': None,...",...,0.333819,0.001154,1,0.658,0.65829,0.658627,0.658831,0.658927,0.658535,0.000345
23,2,379035,0.656506,0.00934,0.024961,0.007074,gini,,best,"{'criterion': 'gini', 'max_features': None, 's...",...,0.333492,0.00102,3,0.658,0.65829,0.658627,0.658831,0.658927,0.658535,0.000345


array([[12906,  5006,  2687,  1172,   373],
       [ 7769,  4735,  3087,  1897,   903],
       [ 5714,  4507,  3851,  2565,  2332],
       [ 3451,  3757,  3618,  2780,  3405],
       [ 1294,  2224,  4000,  3938,  6789]], dtype=int64)