In [1]:

# --- Standard library ---
import os
import warnings

# --- Data handling ---
import pandas as pd

# Suppress every warning from here on (installed before scikit-learn is imported).
warnings.filterwarnings(action='ignore')

# --- Modelling ---
from sklearn.decomposition import PCA
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

# --- Configuration ---
# Column name used to look up the pedestrian-level field in the data.
ped_level = 'ped_level'

# Parent directory of the current working directory.
general_path = os.path.dirname(os.path.abspath(os.getcwd()))


In [2]:
# Read the modelling table and discard the 'Unnamed: 0' column from the CSV.
ml_df = pd.read_csv('output/model_data_final.csv')
ml_df = ml_df.drop(columns=['Unnamed: 0'])

# Target vector: the ped_level column.
y_tree = ml_df[ped_level].to_numpy()

# Feature matrix: everything except the target, the geometry column and the
# two node id columns.
data_feature = ml_df.drop([ped_level, 'geometry', 'node_end', 'node_start'], axis=1)
x_tree = data_feature.to_numpy()

ml_df

Unnamed: 0,day part,season,day,buildings,businesses,educationa,Health_ser,Leisure_am,Playground,Sport_faci,...,trees,bike_trail,parks,density,closeness,betweennes,node_start,node_end,geometry,ped_level
0,0,2,0,12.580000,0.0,8,1,0,7,3,...,0.0,0,12,3.3,0.014734,0.004886,0,1,"LINESTRING (184322.70500000002 668574.483, 184...",1
1,1,2,0,12.580000,0.0,8,1,0,7,3,...,0.0,0,12,3.3,0.014734,0.004886,0,1,"LINESTRING (184322.70500000002 668574.483, 184...",0
2,2,2,0,12.580000,0.0,8,1,0,7,3,...,0.0,0,12,3.3,0.014734,0.004886,0,1,"LINESTRING (184322.70500000002 668574.483, 184...",2
3,3,2,0,12.580000,0.0,8,1,0,7,3,...,0.0,0,12,3.3,0.014734,0.004886,0,1,"LINESTRING (184322.70500000002 668574.483, 184...",0
4,0,2,1,12.580000,0.0,8,1,0,7,3,...,0.0,0,12,3.3,0.014734,0.004886,0,1,"LINESTRING (184322.70500000002 668574.483, 184...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473795,3,0,0,21.498333,0.0,9,3,4,2,2,...,0.0,0,6,22.9,0.024888,0.001354,6432,6433,"LINESTRING (178804.17200000002 665057.617, 178...",4
473796,0,0,1,21.498333,0.0,9,3,4,2,2,...,0.0,0,6,22.9,0.024888,0.001354,6432,6433,"LINESTRING (178804.17200000002 665057.617, 178...",4
473797,1,0,1,21.498333,0.0,9,3,4,2,2,...,0.0,0,6,22.9,0.024888,0.001354,6432,6433,"LINESTRING (178804.17200000002 665057.617, 178...",4
473798,2,0,1,21.498333,0.0,9,3,4,2,2,...,0.0,0,6,22.9,0.024888,0.001354,6432,6433,"LINESTRING (178804.17200000002 665057.617, 178...",4


In [26]:
# Model without PCA
# Baseline: a decision-tree regressor fitted directly on x_tree.
# All sources of randomness are seeded so the printed metrics are repeatable:
# scikit-learn permutes the features at every split, so an unseeded tree can
# differ between runs even on identical data, and an unseeded ShuffleSplit
# draws different folds each time.
my_pipeline = Pipeline([
    ('classifier', DecisionTreeRegressor(random_state=0))
])
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
# Empty grid: GridSearchCV only cross-validates the single default
# configuration and then refits it on the whole training set.
param_grid = {}

# Divide the data to training and test (20% held out)
X_train, X_test, y_train, y_test = train_test_split(x_tree, y_tree, test_size=0.2, random_state=0)
# Cross validation and refit
search = GridSearchCV(my_pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
search.fit(X_train, y_train)
# Predictions on the held-out rows
y_pred = search.best_estimator_.predict(X_test)

# multioutput='raw_values' makes every metric return an array with one entry
# per target column instead of a single averaged number.

# R2 score
r2_scores = r2_score(y_test, y_pred, multioutput='raw_values')
print('R2 scores:', r2_scores)

# Mean squared error
mse_scores = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print('Mean squared error:', mse_scores)

# Mean absolute error
mae_scores = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Mean absolute error:', mae_scores)

# Explained variance score
var_scores = explained_variance_score(y_test, y_pred, multioutput='raw_values')
print('Explained variance score:', var_scores)

# Median absolute error
med_scores = median_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Median absolute error:', med_scores)

R2 scores: [0.52280685]
Mean squared error: [0.98736613]
Mean absolute error: [0.64981296]
Explained variance score: [0.52282725]
Median absolute error: [0.5]


In [30]:
from pandas import DataFrame

# Put the held-out targets next to the predictions, one row per test sample.
stat_df = DataFrame({'y_test': y_test, 'y_pred': y_pred})
stat_df

Unnamed: 0,y_test,y_pred
0,0,0.0
1,2,0.0
2,2,1.0
3,2,3.0
4,0,0.0
...,...,...
94755,3,4.0
94756,2,1.0
94757,3,1.0
94758,4,4.0


In [35]:
# Report the sample count and the MSE separately for each true y_test value.
for level, rows in stat_df.groupby('y_test'):
    level_mse = mean_squared_error(rows["y_test"], rows["y_pred"])
    print(f'group {level}, samples: {len(rows)}, MSE:{level_mse}')



group 0, samples: 22144, MSE:1.110884318413059
group 1, samples: 18391, MSE:0.9843300769095995
group 2, samples: 18969, MSE:1.1011957960649243
group 3, samples: 17011, MSE:1.1457972619024293
group 4, samples: 18245, MSE:0.5744498492737736


In [22]:
# Model without tuning with PCA
# PCA() is left at its defaults, so no n_components limit is set; the
# decision tree is then fitted on the transformed features.
# The CV splitter and the tree are seeded so the printed metrics are
# repeatable (scikit-learn permutes features at every split, so an unseeded
# tree can differ between runs on identical data).
my_pipeline = Pipeline([
    ('pca', PCA()),
    ('classifier', DecisionTreeRegressor(random_state=0))
])
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# Empty grid: GridSearchCV only cross-validates the single default
# configuration and then refits it on the whole training set.
param_grid = {}

# Divide the data to training and test (20% held out)
X_train, X_test, y_train, y_test = train_test_split(x_tree, y_tree, test_size=0.2, random_state=0)
# Cross validation and refit
search = GridSearchCV(my_pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
search.fit(X_train, y_train)
# Predictions on the held-out rows
y_pred = search.best_estimator_.predict(X_test)

# multioutput='raw_values' makes every metric return an array with one entry
# per target column instead of a single averaged number.

# R2 score
r2_scores = r2_score(y_test, y_pred, multioutput='raw_values')
print('R2 scores:', r2_scores)

# Mean squared error
mse_scores = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print('Mean squared error:', mse_scores)

# Mean absolute error
mae_scores = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Mean absolute error:', mae_scores)

# Explained variance score
var_scores = explained_variance_score(y_test, y_pred, multioutput='raw_values')
print('Explained variance score:', var_scores)

# Median absolute error
med_scores = median_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Median absolute error:', med_scores)

R2 scores: [0.51930274]
Mean squared error: [0.99461652]
Mean absolute error: [0.65353349]
Explained variance score: [0.51931166]
Median absolute error: [0.5]


In [25]:
# Model without centrality
# Same baseline tree, trained without the 'closeness' and 'betweennes' columns.
# The reduced matrix gets its own name: rebinding the shared x_tree here would
# silently change the input of every other cell that reads x_tree, depending
# on the order in which the cells happen to be executed.
data_feature_1 = data_feature.drop(columns=['closeness','betweennes'])
x_tree_no_centrality = data_feature_1.to_numpy()
y_tree = ml_df[ped_level].to_numpy()

# The CV splitter and the tree are seeded so the printed metrics are
# repeatable (scikit-learn permutes features at every split, so an unseeded
# tree can differ between runs on identical data).
my_pipeline = Pipeline([
    ('classifier', DecisionTreeRegressor(random_state=0))
])
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# Empty grid: GridSearchCV only cross-validates the single default
# configuration and then refits it on the whole training set.
param_grid = {}

# Divide the data to training and test (20% held out)
X_train, X_test, y_train, y_test = train_test_split(x_tree_no_centrality, y_tree, test_size=0.2, random_state=0)
# Cross validation and refit
search = GridSearchCV(my_pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
search.fit(X_train, y_train)
# Predictions on the held-out rows
y_pred = search.best_estimator_.predict(X_test)

# multioutput='raw_values' makes every metric return an array with one entry
# per target column instead of a single averaged number.

# R2 score
r2_scores = r2_score(y_test, y_pred, multioutput='raw_values')
print('R2 scores:', r2_scores)

# Mean squared error
mse_scores = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print('Mean squared error:', mse_scores)

# Mean absolute error
mae_scores = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Mean absolute error:', mae_scores)

# Explained variance score
var_scores = explained_variance_score(y_test, y_pred, multioutput='raw_values')
print('Explained variance score:', var_scores)

# Median absolute error
med_scores = median_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Median absolute error:', med_scores)

R2 scores: [0.52205201]
Mean squared error: [0.98892797]
Mean absolute error: [0.65017176]
Explained variance score: [0.52207558]
Median absolute error: [0.5]


In [3]:
# Hyper-parameter tuning of the decision tree with a randomized search.
# enable_halving_search_cv only unlocks the experimental Halving*SearchCV
# classes; nothing in this cell uses them.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import RandomizedSearchCV

# NOTE(review): the saved output of this cell is
# "OSError: [Errno 22] Invalid argument"; the traceback is not part of the
# saved output, so the failing call is unknown - re-run to confirm.

# The tree, the CV splitter and the search are all seeded so that the sampled
# candidates and the resulting metrics are repeatable.
my_pipeline = Pipeline([
    ('classifier', DecisionTreeRegressor(random_state=0))
])
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# Search space; keys are prefixed with the pipeline step name 'classifier'.
param_grid = {
    'classifier__criterion': ["squared_error", "absolute_error"],
    "classifier__max_depth": range(10, 45, 5),
    'classifier__min_samples_split': range(2, 20, 2),
    # 'auto' was removed from this list: for DecisionTreeRegressor it meant
    # "use all features" (identical to None), and current scikit-learn
    # releases reject it as an invalid value.
    'classifier__max_features': ['sqrt', 'log2', None],
}

# Divide the data to training and test (20% held out)
X_train, X_test, y_train, y_test = train_test_split(x_tree, y_tree, test_size=0.2, random_state=0)
# Sample candidates from the search space and cross-validate each of them
search = RandomizedSearchCV(
    my_pipeline,
    param_distributions=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)
search.fit(X_train, y_train)
# Predictions of the best candidate on the held-out rows
y_pred = search.best_estimator_.predict(X_test)

# multioutput='raw_values' makes every metric return an array with one entry
# per target column instead of a single averaged number.

# R2 score
r2_scores = r2_score(y_test, y_pred, multioutput='raw_values')
print('R2 scores:', r2_scores)

# Mean squared error
mse_scores = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print('Mean squared error:', mse_scores)

# Mean absolute error
mae_scores = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Mean absolute error:', mae_scores)

# Explained variance score
var_scores = explained_variance_score(y_test, y_pred, multioutput='raw_values')
print('Explained variance score:', var_scores)

# Median absolute error
med_scores = median_absolute_error(y_test, y_pred, multioutput='raw_values')
print('Median absolute error:', med_scores)

# feature importance
# NOTE(review): the disabled snippet below is stale - it builds a
# DecisionTreeClassifier (not imported in the visible cells) and passes a
# 'splitter' value that is not part of the search space above.
# best_params = list(search.best_params_.values())
# dt = DecisionTreeClassifier(random_state=0,criterion= best_params[0], max_features=best_params[1], splitter=best_params[2]).fit(x_tree, y_tree)
# new_df = DataFrame(data={'feature_importance':dt.feature_importances_.round(3)*100,'features_name':data_feature.columns}).sort_values('feature_importance',ascending=0)
# print(new_df)

OSError: [Errno 22] Invalid argument